| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800 |
- /* Copyright 2019-2020 Canaan Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #pragma once
- #include "../kernel_utils.h"
- #include <cmath>
- #include <runtime/nnil.h>
- #include <runtime/runtime_op_utility.h>
- #include <xtl/xspan.hpp>
- #ifdef __riscv
- #include "../riscv/neutral_kernels.h"
- #endif
- namespace nncase
- {
- namespace kernels
- {
- namespace neutral
- {
- template <class TOp>
- void binary(const float *input_a, const float *input_b, float *output, const runtime_shape_t &in_a_shape,
- const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, const value_range<float> &fused_activation, TOp &&op)
- {
- // opt. no broadcast
- if (in_a_shape == in_b_shape)
- {
- auto size = kernels::details::compute_size(in_a_shape);
- for (size_t i = 0; i < size; i++)
- {
- const auto a = input_a[i];
- const auto b = input_b[i];
- output[i] = kernels::details::apply_activation(op(a, b), fused_activation);
- }
- }
- // fallback
- else
- {
- for (int32_t d0 = 0; d0 < out_shape[0]; d0++)
- {
- for (int32_t d1 = 0; d1 < out_shape[1]; d1++)
- {
- for (int32_t d2 = 0; d2 < out_shape[2]; d2++)
- {
- for (int32_t d3 = 0; d3 < out_shape[3]; d3++)
- {
- runtime_shape_t in_off = { d0, d1, d2, d3 };
- const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape);
- const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape);
- const auto a = input_a[offset(in_a_shape, in_a_off)];
- const auto b = input_b[offset(in_b_shape, in_b_off)];
- output[offset(out_shape, in_off)] = kernels::details::apply_activation(op(a, b), fused_activation);
- }
- }
- }
- }
- }
- }
- template <class TOp>
- void quantized_binary(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const runtime_shape_t &in_a_shape,
- const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, int32_t input_a_offset, int32_t input_a_mul, int32_t input_a_shift,
- int32_t input_b_offset, int32_t input_b_mul, int32_t input_b_shift, int32_t output_mul, int32_t output_shift, int32_t output_offset, TOp &&op)
- {
- // opt. no broadcast
- if (in_a_shape == in_b_shape)
- {
- auto size = kernels::details::compute_size(in_a_shape);
- for (size_t i = 0; i < size; i++)
- {
- auto a = (int32_t)input_a[i];
- auto b = (int32_t)input_b[i];
- a = runtime::mul_and_carry_shift(a + input_a_offset, input_a_mul, input_a_shift);
- b = runtime::mul_and_carry_shift(b + input_b_offset, input_b_mul, input_b_shift);
- auto output_val = runtime::mul_and_carry_shift(op(a, b), output_mul, output_shift);
- output[i] = (uint8_t)std::clamp(output_val + output_offset, 0, 255);
- }
- }
- // fallback
- else
- {
- for (int32_t d0 = 0; d0 < out_shape[0]; d0++)
- {
- for (int32_t d1 = 0; d1 < out_shape[1]; d1++)
- {
- for (int32_t d2 = 0; d2 < out_shape[2]; d2++)
- {
- for (int32_t d3 = 0; d3 < out_shape[3]; d3++)
- {
- runtime_shape_t in_off = { d0, d1, d2, d3 };
- const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape);
- const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape);
- auto a = (int32_t)input_a[offset(in_a_shape, in_a_off)];
- auto b = (int32_t)input_b[offset(in_b_shape, in_b_off)];
- a = runtime::mul_and_carry_shift(a + input_a_offset, input_a_mul, input_a_shift);
- b = runtime::mul_and_carry_shift(b + input_b_offset, input_b_mul, input_b_shift);
- auto output_val = runtime::mul_and_carry_shift(op(a, b), output_mul, output_shift);
- output[offset(out_shape, in_off)] = (uint8_t)std::clamp(output_val + output_offset, 0, 255);
- }
- }
- }
- }
- }
- }
- template <class TRange, class TPtrGetter = details::default_ptr_getter<uint8_t, TRange>>
- inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
- {
- for (size_t oc = 0; oc < outer_size; oc++)
- {
- for (size_t i = 0; i < inputs.size(); i++)
- {
- auto size = inner_size * concat_dims[i];
- auto src = getter(inputs[i]) + oc * size;
- std::copy(src, src + size, output);
- output += size;
- }
- }
- }
- inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
- int32_t groups, int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
- const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
- {
- const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
- const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
- const auto g_ic = in_shape[1] / groups;
- const auto g_oc = out_channels / groups;
- for (int32_t batch = 0; batch < in_shape[0]; batch++)
- {
- const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
- for (int32_t og = 0; og < groups; og++)
- {
- const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
- const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
- for (int32_t oc = 0; oc < g_oc; oc++)
- {
- const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
- for (int32_t oy = 0; oy < out_h; oy++)
- {
- for (int32_t ox = 0; ox < out_w; ox++)
- {
- const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
- const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
- const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
- const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
- const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
- const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
- float value = bias[og * g_oc + oc];
- for (int32_t ic = 0; ic < g_ic; ic++)
- {
- const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
- const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
- for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
- {
- for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
- {
- const int32_t in_y = in_y_origin + dilation_h * ky;
- const int32_t in_x = in_x_origin + dilation_w * kx;
- const float in_v = in_c_p[in_y * in_shape[3] + in_x];
- const float w = w_ic_p[ky * filter_w + kx];
- value += in_v * w;
- }
- }
- }
- *output++ = details::apply_activation(value, fused_activation);
- }
- }
- }
- }
- }
- }
- inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, int32_t input_offset, int32_t filter_offset,
- int32_t output_mul, int32_t output_shift, int32_t output_offset, const runtime_shape_t &in_shape, int32_t groups, int32_t out_channels,
- int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
- const padding &padding_h, const padding &padding_w)
- {
- const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
- const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
- const auto g_ic = in_shape[1] / groups;
- const auto g_oc = out_channels / groups;
- for (int32_t batch = 0; batch < in_shape[0]; batch++)
- {
- const uint8_t *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
- for (int32_t og = 0; og < groups; og++)
- {
- const uint8_t *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
- const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;
- for (int32_t oc = 0; oc < g_oc; oc++)
- {
- const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
- for (int32_t oy = 0; oy < out_h; oy++)
- {
- for (int32_t ox = 0; ox < out_w; ox++)
- {
- const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
- const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
- const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
- const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
- const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
- const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
- int32_t value = bias[og * g_oc + oc];
- for (int32_t ic = 0; ic < g_ic; ic++)
- {
- const uint8_t *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
- const uint8_t *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
- for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
- {
- for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
- {
- const int32_t in_y = in_y_origin + dilation_h * ky;
- const int32_t in_x = in_x_origin + dilation_w * kx;
- const int32_t in_v = (int32_t)in_c_p[in_y * in_shape[3] + in_x] + input_offset;
- const int32_t w = (int32_t)w_ic_p[ky * filter_w + kx] + filter_offset;
- value += in_v * w;
- }
- }
- }
- auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
- output_val += output_offset;
- *output++ = (uint8_t)std::clamp(output_val, 0, 255);
- }
- }
- }
- }
- }
- }
- inline void conv2d_transpose(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
- int32_t groups, const runtime_shape_t &out_shape, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
- const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
- {
- std::fill(output, output + kernels::details::compute_size(out_shape), 0.f);
- const auto g_ic = in_shape[1] / groups;
- const auto g_oc = out_shape[1] / groups;
- for (int32_t batch = 0; batch < in_shape[0]; batch++)
- {
- float *out_batch_p = output + (size_t)batch * out_shape[1] * out_shape[2] * out_shape[3];
- for (int32_t g = 0; g < groups; g++)
- {
- float *out_group_p = out_batch_p + (size_t)g * g_oc * out_shape[2] * out_shape[3];
- const float *w_group_p = weights + (size_t)g * g_oc * g_ic * filter_h * filter_w;
- for (int32_t ic = 0; ic < g_ic; ic++)
- {
- for (int32_t iy = 0; iy < in_shape[2]; iy++)
- {
- for (int32_t ix = 0; ix < in_shape[3]; ix++)
- {
- const int32_t out_y_origin = (iy * stride_h) - padding_h.before;
- const int32_t out_x_origin = (ix * stride_w) - padding_w.before;
- const int32_t filter_y_start = std::max(0, (-out_y_origin + dilation_h - 1) / dilation_h);
- const int32_t filter_y_end = std::min(filter_h, (out_shape[2] - out_y_origin + dilation_h - 1) / dilation_h);
- const int32_t filter_x_start = std::max(0, (-out_x_origin + dilation_w - 1) / dilation_w);
- const int32_t filter_x_end = std::min(filter_w, (out_shape[3] - out_x_origin + dilation_w - 1) / dilation_w);
- const float in_v = *input++;
- for (int32_t oc = 0; oc < g_oc; oc++)
- {
- float value = bias[g * g_oc + oc];
- float *out_c_p = out_group_p + (size_t)oc * out_shape[2] * out_shape[3];
- const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
- const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
- for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
- {
- for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
- {
- const int32_t out_y = out_y_origin + dilation_h * ky;
- const int32_t out_x = out_x_origin + dilation_w * kx;
- const float w = w_ic_p[ky * filter_w + kx];
- out_c_p[out_y * out_shape[3] + out_x] += in_v * w;
- }
- }
- }
- }
- }
- }
- }
- }
- if (fused_activation != value_range<float>::full())
- {
- for (size_t i = 0; i < kernels::details::compute_size(out_shape); i++)
- output[i] = details::apply_activation(output[i], fused_activation);
- }
- }
- template <class TQ>
- void dequantize(const TQ *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, const quant_param_t ¶m)
- {
- #if __riscv
- riscv_dequantize(input, output, count, param);
- #else
- float div = 1.f / param.scale;
- for (size_t i = 0; i < count; i++)
- {
- output[i] = (input[i] - param.zero_point) * div;
- }
- #endif
- }
- inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
- {
- for (size_t oy = 0; oy < a_rows; oy++)
- {
- for (size_t ox = 0; ox < b_cols; ox++)
- {
- float value = bias[ox];
- for (size_t i = 0; i < a_cols; i++)
- {
- const auto a = input_a[oy * a_cols + i];
- const auto b = input_b[i * b_cols + ox];
- value += a * b;
- }
- output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
- }
- }
- }
- inline void quantized_matmul(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const int32_t *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, int32_t input_a_offset, int32_t input_b_offset,
- int32_t output_mul, int32_t output_shift, int32_t output_offset)
- {
- for (size_t oy = 0; oy < a_rows; oy++)
- {
- for (size_t ox = 0; ox < b_cols; ox++)
- {
- int32_t value = bias[ox];
- for (size_t i = 0; i < a_cols; i++)
- {
- const auto a = (int32_t)input_a[oy * a_cols + i] + input_a_offset;
- const auto b = (int32_t)input_b[i * b_cols + ox] + input_b_offset;
- value += a * b;
- }
- auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
- output_val += output_offset;
- output[oy * b_cols + ox] = (uint8_t)std::clamp(output_val, 0, 255);
- }
- }
- }
- template <class T>
- void pad(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_paddings_t &paddings, T pad_value)
- {
- runtime_shape_t out_shape = { in_shape[0] + paddings[0].sum(),
- in_shape[1] + paddings[1].sum(),
- in_shape[2] + paddings[2].sum(),
- in_shape[3] + paddings[3].sum() };
- for (int d0 = 0; d0 < out_shape[0]; d0++)
- {
- auto d0_origin = -paddings[0].before;
- auto in0 = input + ((size_t)d0_origin + d0) * in_shape[1] * in_shape[2] * in_shape[3];
- for (int d1 = 0; d1 < out_shape[1]; d1++)
- {
- auto d1_origin = -paddings[1].before;
- auto in1 = in0 + ((size_t)d1_origin + d1) * in_shape[2] * in_shape[3];
- for (int d2 = 0; d2 < out_shape[2]; d2++)
- {
- auto d2_origin = -paddings[2].before;
- auto in2 = in1 + ((size_t)d2_origin + d2) * in_shape[3];
- for (int d3 = 0; d3 < out_shape[3]; d3++)
- {
- auto d3_origin = -paddings[3].before;
- if (d0 < paddings[0].before || d0 >= out_shape[0] - paddings[0].after
- || d1 < paddings[1].before || d1 >= out_shape[1] - paddings[1].after
- || d2 < paddings[2].before || d2 >= out_shape[2] - paddings[2].after
- || d3 < paddings[3].before || d3 >= out_shape[3] - paddings[3].after)
- *output++ = pad_value;
- else
- *output++ = in2[d3_origin + d3];
- }
- }
- }
- }
- }
- template <class TQ>
- void quantize(const float *CXX_RESTRICT input, TQ *CXX_RESTRICT output, size_t count, const quant_param_t ¶m)
- {
- #if __riscv
- riscv_quantize(input, output, count, param);
- #else
- for (size_t i = 0; i < count; i++)
- {
- int32_t tmp = (int32_t)roundf(input[i] * param.scale + param.zero_point);
- output[i] = std::clamp(tmp, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
- }
- #endif
- }
- template <class TReducer>
- void reduce(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, const runtime_shape_t &reduced_shape, TReducer &&reducer)
- {
- std::fill(output, output + kernels::details::compute_size(reduced_shape), init_value);
- for (int32_t d0 = 0; d0 < in_shape[0]; d0++)
- {
- for (int32_t d1 = 0; d1 < in_shape[1]; d1++)
- {
- for (int32_t d2 = 0; d2 < in_shape[2]; d2++)
- {
- for (int32_t d3 = 0; d3 < in_shape[3]; d3++)
- {
- runtime_shape_t in_off = { d0, d1, d2, d3 };
- auto out_off = kernels::details::get_reduced_offset(in_off, reduced_shape);
- const auto a = input[offset(in_shape, in_off)];
- auto &b = output[offset(reduced_shape, out_off)];
- b = reducer(b, a);
- }
- }
- }
- }
- }
- template <class TOp>
- void unary(const float *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, TOp &&op)
- {
- for (size_t i = 0; i < count; i++)
- output[i] = op(input[i]);
- }
- template <class TBinaryOp, class TOutputOp>
- void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, int32_t filter_h, int32_t filter_w,
- int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w,
- const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
- {
- const auto out_h = kernels::details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
- const auto out_w = kernels::details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
- runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w };
- for (int32_t batch = 0; batch < in_shape[0]; batch++)
- {
- for (int32_t oc = 0; oc < in_shape[1]; oc++)
- {
- for (int32_t oy = 0; oy < out_h; oy++)
- {
- for (int32_t ox = 0; ox < out_w; ox++)
- {
- const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
- const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
- const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
- const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
- const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
- const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
- float value = init_value;
- int32_t kernel_count = 0;
- for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
- {
- for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
- {
- const int32_t in_y = in_y_origin + dilation_h * ky;
- const int32_t in_x = in_x_origin + dilation_w * kx;
- const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })];
- value = binary_op(value, in_v);
- kernel_count++;
- }
- }
- output[offset(out_shape, { batch, oc, oy, ox })] = kernels::details::apply_activation(window_op(value, kernel_count), fused_activation);
- }
- }
- }
- }
- }
- template <class T>
- void resize_nearest_neighbor(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w)
- {
- auto height_scale = (float)in_shape[2] / out_h;
- auto width_scale = (float)in_shape[3] / out_w;
- for (int batch = 0; batch < in_shape[0]; batch++)
- {
- auto in_batch = input + batch * in_shape[1] * in_shape[2] * in_shape[3];
- for (int oc = 0; oc < in_shape[1]; oc++)
- {
- auto in_c = in_batch + oc * in_shape[2] * in_shape[3];
- for (int oy = 0; oy < out_h; oy++)
- {
- auto in_y = std::min((int32_t)floorf(oy * height_scale), in_shape[2] - 1);
- auto in_row = in_c + in_y * in_shape[3];
- for (int ox = 0; ox < out_w; ox++)
- {
- auto in_x = std::min((int32_t)floorf(ox * width_scale), in_shape[3] - 1);
- *output++ = in_row[in_x];
- }
- }
- }
- }
- }
- template <class T>
- inline void resize_bilinear(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
- {
- auto height_scale = (float)in_shape[2] / out_h;
- auto width_scale = (float)in_shape[3] / out_w;
- if (align_corners && out_h > 1)
- height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
- if (align_corners && out_w > 1)
- width_scale = (float)(in_shape[3] - 1) / (out_w - 1);
- auto destIdx = 0;
- for (int batch = 0; batch < in_shape[0]; batch++)
- {
- auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
- for (int oc = 0; oc < in_shape[1]; oc++)
- {
- auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3];
- for (int oy = 0; oy < out_h; oy++)
- {
- auto in_y = oy * height_scale;
- auto in_y0 = (int)floorf(in_y);
- auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1);
- for (int ox = 0; ox < out_w; ox++)
- {
- auto in_x = ox * width_scale;
- auto in_x0 = (int)floorf(in_x);
- auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1);
- auto v0 = in_c[in_y0 * in_shape[3] + in_x0];
- auto v1 = in_c[in_y1 * in_shape[3] + in_x0];
- auto v2 = in_c[in_y0 * in_shape[3] + in_x1];
- auto v3 = in_c[in_y1 * in_shape[3] + in_x1];
- auto a0 = (1 - (in_y - in_y0)) * (1 - (in_x - in_x0));
- auto a1 = (in_y - in_y0) * (1 - (in_x - in_x0));
- auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0);
- auto a3 = (in_y - in_y0) * (in_x - in_x0);
- output[destIdx++] = T(v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3);
- }
- }
- }
- }
- }
/**
 * Row-wise softmax over a float matrix of outer_size rows by inner_size columns.
 *
 * For each row: out[i] = exp((in[i] - max(in)) * beta) / sum_j exp((in[j] - max(in)) * beta).
 * Subtracting the row maximum before exponentiating keeps the exponent
 * non-positive for positive beta.
 *
 * Does nothing when outer_size <= 0 or inner_size == 0. Previously a negative
 * outer_size was converted to a huge unsigned loop bound, and an empty row
 * dereferenced the end iterator returned by std::max_element.
 *
 * @param input       outer_size * inner_size source values
 * @param output      outer_size * inner_size destination values
 * @param beta        scale applied to the shifted inputs before exp
 * @param outer_size  number of rows
 * @param inner_size  number of elements per row
 */
inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size)
{
    if (outer_size <= 0 || inner_size == 0)
        return;

    const size_t rows = static_cast<size_t>(outer_size);
    for (size_t row = 0; row < rows; row++)
    {
        const float *src = input + row * inner_size;
        float *dest = output + row * inner_size;

        const float row_max = *std::max_element(src, src + inner_size);

        // First pass: unnormalised exponentials and their sum.
        float sum = 0.f;
        for (size_t i = 0; i < inner_size; i++)
        {
            const float value = std::exp((src[i] - row_max) * beta);
            sum += value;
            dest[i] = value;
        }

        // Second pass: normalise.
        for (size_t i = 0; i < inner_size; i++)
            dest[i] /= sum;
    }
}
- template <class T>
- void transpose(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const runtime_shape_t &in_shape, const runtime_shape_t &perm)
- {
- runtime_shape_t out_shape;
- for (size_t i = 0; i < 4; i++)
- out_shape[i] = in_shape[perm[i]];
- runtime_shape_t i, o;
- for (o[3] = 0; o[3] < out_shape[3]; o[3]++)
- {
- i[perm[3]] = o[3];
- for (o[2] = 0; o[2] < out_shape[2]; o[2]++)
- {
- i[perm[2]] = o[2];
- for (o[1] = 0; o[1] < out_shape[1]; o[1]++)
- {
- i[perm[1]] = o[1];
- for (o[0] = 0; o[0] < out_shape[0]; o[0]++)
- {
- i[perm[0]] = o[0];
- output[offset(out_shape, o)] = input[offset(in_shape, i)];
- }
- }
- }
- }
- }
- template <class T>
- void strided_slice(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const runtime_shape_t &in_shape, const runtime_shape_t &begin, const runtime_shape_t &end, const runtime_shape_t &strides)
- {
- auto loop_cond = [](int32_t i, int32_t stop, int32_t stride) {
- return stride > 0 ? i < stop : i > stop;
- };
- for (int32_t d0 = begin[0]; loop_cond(d0, end[0], strides[0]); d0 += strides[0])
- {
- auto d0_origin = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3];
- for (int d1 = begin[1]; loop_cond(d1, end[1], strides[1]); d1 += strides[1])
- {
- auto d1_origin = d0_origin + (size_t)d1 * in_shape[2] * in_shape[3];
- for (int32_t d2 = begin[2]; loop_cond(d2, end[2], strides[2]); d2 += strides[2])
- {
- auto d2_origin = d1_origin + (size_t)d2 * in_shape[3];
- for (int32_t d3 = begin[3]; loop_cond(d3, end[3], strides[3]); d3 += strides[3])
- *output++ = d2_origin[d3];
- }
- }
- }
- }
- inline void nnil_unary_method(const float *input, float *output, size_t count, xtl::span<const uint8_t> body)
- {
- using namespace nncase::runtime;
- for (size_t i = 0; i < count; i++)
- {
- nnil_evalstack stack;
- span_reader sr(body);
- nnil_reader reader(sr);
- bool ret = false;
- while (reader.avail() && !ret)
- {
- auto op = reader.next();
- switch (op.opcode)
- {
- case nnil_nop:
- break;
- case nnil_dup:
- stack.dup();
- break;
- case nnil_pop:
- stack.pop();
- break;
- case nnil_lda_0:
- stack.push(input[i]);
- break;
- case nnil_ldc_r4_0:
- stack.push(0.f);
- break;
- case nnil_ldc_r4_1:
- stack.push(1.f);
- break;
- case nnil_ldc_r4:
- stack.push(op.ldc_r4.r4);
- break;
- case nnil_abs:
- stack.push(fabsf(stack.pop()));
- break;
- case nnil_ceil:
- stack.push(ceilf(stack.pop()));
- break;
- case nnil_cos:
- stack.push(cosf(stack.pop()));
- break;
- case nnil_exp:
- stack.push(expf(stack.pop()));
- break;
- case nnil_floor:
- stack.push(floorf(stack.pop()));
- break;
- case nnil_log:
- stack.push(logf(stack.pop()));
- break;
- case nnil_neg:
- stack.push(-stack.pop());
- break;
- case nnil_rsqrt:
- stack.push(1.f / sqrtf(stack.pop()));
- break;
- case nnil_sin:
- stack.push(sinf(stack.pop()));
- break;
- case nnil_square:
- {
- auto v = stack.pop();
- stack.push(v * v);
- break;
- }
- case nnil_add:
- {
- auto b = stack.pop();
- auto a = stack.pop();
- stack.push(a + b);
- break;
- }
- case nnil_sub:
- {
- auto b = stack.pop();
- auto a = stack.pop();
- stack.push(a - b);
- break;
- }
- case nnil_mul:
- {
- auto b = stack.pop();
- auto a = stack.pop();
- stack.push(a * b);
- break;
- }
- case nnil_div:
- {
- auto b = stack.pop();
- auto a = stack.pop();
- stack.push(a / b);
- break;
- }
- case nnil_min:
- {
- auto b = stack.pop();
- auto a = stack.pop();
- stack.push(std::min(a, b));
- break;
- }
- case nnil_max:
- {
- auto b = stack.pop();
- auto a = stack.pop();
- stack.push(std::max(a, b));
- break;
- }
- case nnil_clamp:
- {
- auto high = stack.pop();
- auto low = stack.pop();
- auto v = stack.pop();
- stack.push(std::clamp(v, low, high));
- break;
- }
- case nnil_ret:
- output[i] = stack.pop();
- ret = true;
- break;
- default:
- NNCASE_THROW(std::runtime_error, "Invalid nnil op");
- break;
- }
- }
- }
- }
- inline void table_lookup1d(const uint8_t *CXX_RESTRICT input, uint8_t *CXX_RESTRICT output, size_t size, const uint8_t *CXX_RESTRICT table)
- {
- for (size_t i = 0; i < size; i++)
- output[i] = table[input[i]];
- }
- }
- }
- }
|