// neutral_kernels.h
  1. /* Copyright 2019-2020 Canaan Inc.
  2. *
  3. * Licensed under the Apache License, Version 2.0 (the "License");
  4. * you may not use this file except in compliance with the License.
  5. * You may obtain a copy of the License at
  6. *
  7. * http://www.apache.org/licenses/LICENSE-2.0
  8. *
  9. * Unless required by applicable law or agreed to in writing, software
  10. * distributed under the License is distributed on an "AS IS" BASIS,
  11. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. * See the License for the specific language governing permissions and
  13. * limitations under the License.
  14. */
  15. #pragma once
  16. #include "../kernel_utils.h"
  17. #include <cmath>
  18. #include <runtime/nnil.h>
  19. #include <runtime/runtime_op_utility.h>
  20. #include <xtl/xspan.hpp>
  21. #ifdef __riscv
  22. #include "../riscv/neutral_kernels.h"
  23. #endif
  24. namespace nncase
  25. {
  26. namespace kernels
  27. {
  28. namespace neutral
  29. {
  30. template <class TOp>
  31. void binary(const float *input_a, const float *input_b, float *output, const runtime_shape_t &in_a_shape,
  32. const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, const value_range<float> &fused_activation, TOp &&op)
  33. {
  34. // opt. no broadcast
  35. if (in_a_shape == in_b_shape)
  36. {
  37. auto size = kernels::details::compute_size(in_a_shape);
  38. for (size_t i = 0; i < size; i++)
  39. {
  40. const auto a = input_a[i];
  41. const auto b = input_b[i];
  42. output[i] = kernels::details::apply_activation(op(a, b), fused_activation);
  43. }
  44. }
  45. // fallback
  46. else
  47. {
  48. for (int32_t d0 = 0; d0 < out_shape[0]; d0++)
  49. {
  50. for (int32_t d1 = 0; d1 < out_shape[1]; d1++)
  51. {
  52. for (int32_t d2 = 0; d2 < out_shape[2]; d2++)
  53. {
  54. for (int32_t d3 = 0; d3 < out_shape[3]; d3++)
  55. {
  56. runtime_shape_t in_off = { d0, d1, d2, d3 };
  57. const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape);
  58. const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape);
  59. const auto a = input_a[offset(in_a_shape, in_a_off)];
  60. const auto b = input_b[offset(in_b_shape, in_b_off)];
  61. output[offset(out_shape, in_off)] = kernels::details::apply_activation(op(a, b), fused_activation);
  62. }
  63. }
  64. }
  65. }
  66. }
  67. }
  68. template <class TOp>
  69. void quantized_binary(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const runtime_shape_t &in_a_shape,
  70. const runtime_shape_t &in_b_shape, const runtime_shape_t &out_shape, int32_t input_a_offset, int32_t input_a_mul, int32_t input_a_shift,
  71. int32_t input_b_offset, int32_t input_b_mul, int32_t input_b_shift, int32_t output_mul, int32_t output_shift, int32_t output_offset, TOp &&op)
  72. {
  73. // opt. no broadcast
  74. if (in_a_shape == in_b_shape)
  75. {
  76. auto size = kernels::details::compute_size(in_a_shape);
  77. for (size_t i = 0; i < size; i++)
  78. {
  79. auto a = (int32_t)input_a[i];
  80. auto b = (int32_t)input_b[i];
  81. a = runtime::mul_and_carry_shift(a + input_a_offset, input_a_mul, input_a_shift);
  82. b = runtime::mul_and_carry_shift(b + input_b_offset, input_b_mul, input_b_shift);
  83. auto output_val = runtime::mul_and_carry_shift(op(a, b), output_mul, output_shift);
  84. output[i] = (uint8_t)std::clamp(output_val + output_offset, 0, 255);
  85. }
  86. }
  87. // fallback
  88. else
  89. {
  90. for (int32_t d0 = 0; d0 < out_shape[0]; d0++)
  91. {
  92. for (int32_t d1 = 0; d1 < out_shape[1]; d1++)
  93. {
  94. for (int32_t d2 = 0; d2 < out_shape[2]; d2++)
  95. {
  96. for (int32_t d3 = 0; d3 < out_shape[3]; d3++)
  97. {
  98. runtime_shape_t in_off = { d0, d1, d2, d3 };
  99. const auto in_a_off = kernels::details::get_reduced_offset(in_off, in_a_shape);
  100. const auto in_b_off = kernels::details::get_reduced_offset(in_off, in_b_shape);
  101. auto a = (int32_t)input_a[offset(in_a_shape, in_a_off)];
  102. auto b = (int32_t)input_b[offset(in_b_shape, in_b_off)];
  103. a = runtime::mul_and_carry_shift(a + input_a_offset, input_a_mul, input_a_shift);
  104. b = runtime::mul_and_carry_shift(b + input_b_offset, input_b_mul, input_b_shift);
  105. auto output_val = runtime::mul_and_carry_shift(op(a, b), output_mul, output_shift);
  106. output[offset(out_shape, in_off)] = (uint8_t)std::clamp(output_val + output_offset, 0, 255);
  107. }
  108. }
  109. }
  110. }
  111. }
  112. }
  113. template <class TRange, class TPtrGetter = details::default_ptr_getter<uint8_t, TRange>>
  114. inline void concat(xtl::span<TRange> inputs, uint8_t *output, xtl::span<const int32_t> concat_dims, size_t inner_size, size_t outer_size, TPtrGetter getter = {})
  115. {
  116. for (size_t oc = 0; oc < outer_size; oc++)
  117. {
  118. for (size_t i = 0; i < inputs.size(); i++)
  119. {
  120. auto size = inner_size * concat_dims[i];
  121. auto src = getter(inputs[i]) + oc * size;
  122. std::copy(src, src + size, output);
  123. output += size;
  124. }
  125. }
  126. }
// 2-D grouped convolution over NCHW float tensors (in_shape = {N, C, H, W};
// the indexing below uses in_shape[1] as channel stride count and in_shape[3]
// as row width). Weights are laid out [out_channel][in_channel/groups][kh][kw];
// bias holds one value per output channel. Results are written densely in
// NCHW order through the advancing `output` pointer, each clamped by
// fused_activation.
inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
    int32_t groups, int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
    const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
{
    const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    // Channels per group on the input and output side.
    const auto g_ic = in_shape[1] / groups;
    const auto g_oc = out_channels / groups;

    for (int32_t batch = 0; batch < in_shape[0]; batch++)
    {
        const float *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];

        for (int32_t og = 0; og < groups; og++)
        {
            const float *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
            const float *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;

            for (int32_t oc = 0; oc < g_oc; oc++)
            {
                const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;

                for (int32_t oy = 0; oy < out_h; oy++)
                {
                    for (int32_t ox = 0; ox < out_w; ox++)
                    {
                        // Top-left corner of the receptive field; may be negative
                        // when it starts inside the padding.
                        const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
                        const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
                        // Clip the kernel window so only in-bounds taps are
                        // visited (padding contributes nothing, so it is skipped).
                        const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                        const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                        const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                        const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                        // Accumulate in float, starting from this channel's bias.
                        float value = bias[og * g_oc + oc];

                        for (int32_t ic = 0; ic < g_ic; ic++)
                        {
                            const float *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
                            const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;

                            for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
                            {
                                for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
                                {
                                    const int32_t in_y = in_y_origin + dilation_h * ky;
                                    const int32_t in_x = in_x_origin + dilation_w * kx;
                                    const float in_v = in_c_p[in_y * in_shape[3] + in_x];
                                    const float w = w_ic_p[ky * filter_w + kx];
                                    value += in_v * w;
                                }
                            }
                        }

                        *output++ = details::apply_activation(value, fused_activation);
                    }
                }
            }
        }
    }
}
// Quantized (uint8) 2-D grouped convolution over NCHW tensors, mirroring
// conv2d above. Zero-points (input_offset / filter_offset) are added to each
// operand before the multiply-accumulate, the sum is kept in int32, then the
// accumulator is requantized with a fixed-point multiplier (output_mul,
// output_shift), shifted by output_offset and clamped to [0, 255].
inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, int32_t input_offset, int32_t filter_offset,
    int32_t output_mul, int32_t output_shift, int32_t output_offset, const runtime_shape_t &in_shape, int32_t groups, int32_t out_channels,
    int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
    const padding &padding_h, const padding &padding_w)
{
    const auto out_h = details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    // Channels per group on the input and output side.
    const auto g_ic = in_shape[1] / groups;
    const auto g_oc = out_channels / groups;

    for (int32_t batch = 0; batch < in_shape[0]; batch++)
    {
        const uint8_t *in_batch_p = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];

        for (int32_t og = 0; og < groups; og++)
        {
            const uint8_t *in_group_p = in_batch_p + (size_t)og * g_ic * in_shape[2] * in_shape[3];
            const uint8_t *w_group_p = weights + (size_t)og * g_oc * g_ic * filter_h * filter_w;

            for (int32_t oc = 0; oc < g_oc; oc++)
            {
                const uint8_t *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;

                for (int32_t oy = 0; oy < out_h; oy++)
                {
                    for (int32_t ox = 0; ox < out_w; ox++)
                    {
                        // Top-left corner of the receptive field (may be negative
                        // inside the padding).
                        const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
                        const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
                        // Clip the kernel window to in-bounds taps only.
                        const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                        const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                        const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                        const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                        // int32 accumulator seeded with the channel bias.
                        int32_t value = bias[og * g_oc + oc];

                        for (int32_t ic = 0; ic < g_ic; ic++)
                        {
                            const uint8_t *in_c_p = in_group_p + (size_t)ic * in_shape[2] * in_shape[3];
                            const uint8_t *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;

                            for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
                            {
                                for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
                                {
                                    const int32_t in_y = in_y_origin + dilation_h * ky;
                                    const int32_t in_x = in_x_origin + dilation_w * kx;
                                    // Widen to int32 and fold in the zero-points before the MAC.
                                    const int32_t in_v = (int32_t)in_c_p[in_y * in_shape[3] + in_x] + input_offset;
                                    const int32_t w = (int32_t)w_ic_p[ky * filter_w + kx] + filter_offset;
                                    value += in_v * w;
                                }
                            }
                        }

                        // Requantize: fixed-point rescale, shift to the output
                        // zero-point, saturate to uint8.
                        auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
                        output_val += output_offset;
                        *output++ = (uint8_t)std::clamp(output_val, 0, 255);
                    }
                }
            }
        }
    }
}
  234. inline void conv2d_transpose(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
  235. int32_t groups, const runtime_shape_t &out_shape, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  236. const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
  237. {
  238. std::fill(output, output + kernels::details::compute_size(out_shape), 0.f);
  239. const auto g_ic = in_shape[1] / groups;
  240. const auto g_oc = out_shape[1] / groups;
  241. for (int32_t batch = 0; batch < in_shape[0]; batch++)
  242. {
  243. float *out_batch_p = output + (size_t)batch * out_shape[1] * out_shape[2] * out_shape[3];
  244. for (int32_t g = 0; g < groups; g++)
  245. {
  246. float *out_group_p = out_batch_p + (size_t)g * g_oc * out_shape[2] * out_shape[3];
  247. const float *w_group_p = weights + (size_t)g * g_oc * g_ic * filter_h * filter_w;
  248. for (int32_t ic = 0; ic < g_ic; ic++)
  249. {
  250. for (int32_t iy = 0; iy < in_shape[2]; iy++)
  251. {
  252. for (int32_t ix = 0; ix < in_shape[3]; ix++)
  253. {
  254. const int32_t out_y_origin = (iy * stride_h) - padding_h.before;
  255. const int32_t out_x_origin = (ix * stride_w) - padding_w.before;
  256. const int32_t filter_y_start = std::max(0, (-out_y_origin + dilation_h - 1) / dilation_h);
  257. const int32_t filter_y_end = std::min(filter_h, (out_shape[2] - out_y_origin + dilation_h - 1) / dilation_h);
  258. const int32_t filter_x_start = std::max(0, (-out_x_origin + dilation_w - 1) / dilation_w);
  259. const int32_t filter_x_end = std::min(filter_w, (out_shape[3] - out_x_origin + dilation_w - 1) / dilation_w);
  260. const float in_v = *input++;
  261. for (int32_t oc = 0; oc < g_oc; oc++)
  262. {
  263. float value = bias[g * g_oc + oc];
  264. float *out_c_p = out_group_p + (size_t)oc * out_shape[2] * out_shape[3];
  265. const float *w_oc_p = w_group_p + (size_t)oc * g_ic * filter_h * filter_w;
  266. const float *w_ic_p = w_oc_p + (size_t)ic * filter_h * filter_w;
  267. for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
  268. {
  269. for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
  270. {
  271. const int32_t out_y = out_y_origin + dilation_h * ky;
  272. const int32_t out_x = out_x_origin + dilation_w * kx;
  273. const float w = w_ic_p[ky * filter_w + kx];
  274. out_c_p[out_y * out_shape[3] + out_x] += in_v * w;
  275. }
  276. }
  277. }
  278. }
  279. }
  280. }
  281. }
  282. }
  283. if (fused_activation != value_range<float>::full())
  284. {
  285. for (size_t i = 0; i < kernels::details::compute_size(out_shape); i++)
  286. output[i] = details::apply_activation(output[i], fused_activation);
  287. }
  288. }
  289. template <class TQ>
  290. void dequantize(const TQ *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, const quant_param_t &param)
  291. {
  292. #if __riscv
  293. riscv_dequantize(input, output, count, param);
  294. #else
  295. float div = 1.f / param.scale;
  296. for (size_t i = 0; i < count; i++)
  297. {
  298. output[i] = (input[i] - param.zero_point) * div;
  299. }
  300. #endif
  301. }
  302. inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
  303. {
  304. for (size_t oy = 0; oy < a_rows; oy++)
  305. {
  306. for (size_t ox = 0; ox < b_cols; ox++)
  307. {
  308. float value = bias[ox];
  309. for (size_t i = 0; i < a_cols; i++)
  310. {
  311. const auto a = input_a[oy * a_cols + i];
  312. const auto b = input_b[i * b_cols + ox];
  313. value += a * b;
  314. }
  315. output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
  316. }
  317. }
  318. }
  319. inline void quantized_matmul(const uint8_t *input_a, const uint8_t *input_b, uint8_t *output, const int32_t *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, int32_t input_a_offset, int32_t input_b_offset,
  320. int32_t output_mul, int32_t output_shift, int32_t output_offset)
  321. {
  322. for (size_t oy = 0; oy < a_rows; oy++)
  323. {
  324. for (size_t ox = 0; ox < b_cols; ox++)
  325. {
  326. int32_t value = bias[ox];
  327. for (size_t i = 0; i < a_cols; i++)
  328. {
  329. const auto a = (int32_t)input_a[oy * a_cols + i] + input_a_offset;
  330. const auto b = (int32_t)input_b[i * b_cols + ox] + input_b_offset;
  331. value += a * b;
  332. }
  333. auto output_val = static_cast<int32_t>(runtime::mul_and_carry_shift(value, output_mul, output_shift));
  334. output_val += output_offset;
  335. output[oy * b_cols + ox] = (uint8_t)std::clamp(output_val, 0, 255);
  336. }
  337. }
  338. }
  339. template <class T>
  340. void pad(const T *input, T *output, const runtime_shape_t &in_shape, const runtime_paddings_t &paddings, T pad_value)
  341. {
  342. runtime_shape_t out_shape = { in_shape[0] + paddings[0].sum(),
  343. in_shape[1] + paddings[1].sum(),
  344. in_shape[2] + paddings[2].sum(),
  345. in_shape[3] + paddings[3].sum() };
  346. for (int d0 = 0; d0 < out_shape[0]; d0++)
  347. {
  348. auto d0_origin = -paddings[0].before;
  349. auto in0 = input + ((size_t)d0_origin + d0) * in_shape[1] * in_shape[2] * in_shape[3];
  350. for (int d1 = 0; d1 < out_shape[1]; d1++)
  351. {
  352. auto d1_origin = -paddings[1].before;
  353. auto in1 = in0 + ((size_t)d1_origin + d1) * in_shape[2] * in_shape[3];
  354. for (int d2 = 0; d2 < out_shape[2]; d2++)
  355. {
  356. auto d2_origin = -paddings[2].before;
  357. auto in2 = in1 + ((size_t)d2_origin + d2) * in_shape[3];
  358. for (int d3 = 0; d3 < out_shape[3]; d3++)
  359. {
  360. auto d3_origin = -paddings[3].before;
  361. if (d0 < paddings[0].before || d0 >= out_shape[0] - paddings[0].after
  362. || d1 < paddings[1].before || d1 >= out_shape[1] - paddings[1].after
  363. || d2 < paddings[2].before || d2 >= out_shape[2] - paddings[2].after
  364. || d3 < paddings[3].before || d3 >= out_shape[3] - paddings[3].after)
  365. *output++ = pad_value;
  366. else
  367. *output++ = in2[d3_origin + d3];
  368. }
  369. }
  370. }
  371. }
  372. }
  373. template <class TQ>
  374. void quantize(const float *CXX_RESTRICT input, TQ *CXX_RESTRICT output, size_t count, const quant_param_t &param)
  375. {
  376. #if __riscv
  377. riscv_quantize(input, output, count, param);
  378. #else
  379. for (size_t i = 0; i < count; i++)
  380. {
  381. int32_t tmp = (int32_t)roundf(input[i] * param.scale + param.zero_point);
  382. output[i] = std::clamp(tmp, (int32_t)std::numeric_limits<TQ>::lowest(), (int32_t)std::numeric_limits<TQ>::max());
  383. }
  384. #endif
  385. }
  386. template <class TReducer>
  387. void reduce(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, const runtime_shape_t &reduced_shape, TReducer &&reducer)
  388. {
  389. std::fill(output, output + kernels::details::compute_size(reduced_shape), init_value);
  390. for (int32_t d0 = 0; d0 < in_shape[0]; d0++)
  391. {
  392. for (int32_t d1 = 0; d1 < in_shape[1]; d1++)
  393. {
  394. for (int32_t d2 = 0; d2 < in_shape[2]; d2++)
  395. {
  396. for (int32_t d3 = 0; d3 < in_shape[3]; d3++)
  397. {
  398. runtime_shape_t in_off = { d0, d1, d2, d3 };
  399. auto out_off = kernels::details::get_reduced_offset(in_off, reduced_shape);
  400. const auto a = input[offset(in_shape, in_off)];
  401. auto &b = output[offset(reduced_shape, out_off)];
  402. b = reducer(b, a);
  403. }
  404. }
  405. }
  406. }
  407. }
  408. template <class TOp>
  409. void unary(const float *CXX_RESTRICT input, float *CXX_RESTRICT output, size_t count, TOp &&op)
  410. {
  411. for (size_t i = 0; i < count; i++)
  412. output[i] = op(input[i]);
  413. }
// Generic 2-D sliding-window reduction over NCHW float input (pooling-style).
// For each window position, the in-bounds taps are folded with binary_op
// starting from init_value; window_op then receives the folded value together
// with kernel_count — the number of taps actually visited, padding excluded —
// before fused_activation clamps the result. Passing the valid-tap count lets
// window_op implement e.g. averaging over only the in-bounds elements.
template <class TBinaryOp, class TOutputOp>
void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape, int32_t filter_h, int32_t filter_w,
    int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w, const padding &padding_h, const padding &padding_w,
    const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
{
    const auto out_h = kernels::details::get_windowed_output_size(in_shape[2], filter_h, stride_h, dilation_h, padding_h);
    const auto out_w = kernels::details::get_windowed_output_size(in_shape[3], filter_w, stride_w, dilation_w, padding_w);
    // Output keeps the batch/channel dims; only H and W change.
    runtime_shape_t out_shape { in_shape[0], in_shape[1], out_h, out_w };

    for (int32_t batch = 0; batch < in_shape[0]; batch++)
    {
        for (int32_t oc = 0; oc < in_shape[1]; oc++)
        {
            for (int32_t oy = 0; oy < out_h; oy++)
            {
                for (int32_t ox = 0; ox < out_w; ox++)
                {
                    // Top-left corner of the window (may be negative inside padding).
                    const int32_t in_y_origin = (oy * stride_h) - padding_h.before;
                    const int32_t in_x_origin = (ox * stride_w) - padding_w.before;
                    // Clip the window to in-bounds taps only.
                    const int32_t filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
                    const int32_t filter_y_end = std::min(filter_h, (in_shape[2] - in_y_origin + dilation_h - 1) / dilation_h);
                    const int32_t filter_x_start = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
                    const int32_t filter_x_end = std::min(filter_w, (in_shape[3] - in_x_origin + dilation_w - 1) / dilation_w);
                    float value = init_value;
                    int32_t kernel_count = 0;

                    for (int32_t ky = filter_y_start; ky < filter_y_end; ky++)
                    {
                        for (int32_t kx = filter_x_start; kx < filter_x_end; kx++)
                        {
                            const int32_t in_y = in_y_origin + dilation_h * ky;
                            const int32_t in_x = in_x_origin + dilation_w * kx;
                            const float in_v = input[offset(in_shape, { batch, oc, in_y, in_x })];
                            value = binary_op(value, in_v);
                            kernel_count++;
                        }
                    }

                    output[offset(out_shape, { batch, oc, oy, ox })] = kernels::details::apply_activation(window_op(value, kernel_count), fused_activation);
                }
            }
        }
    }
}
  455. template <class T>
  456. void resize_nearest_neighbor(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w)
  457. {
  458. auto height_scale = (float)in_shape[2] / out_h;
  459. auto width_scale = (float)in_shape[3] / out_w;
  460. for (int batch = 0; batch < in_shape[0]; batch++)
  461. {
  462. auto in_batch = input + batch * in_shape[1] * in_shape[2] * in_shape[3];
  463. for (int oc = 0; oc < in_shape[1]; oc++)
  464. {
  465. auto in_c = in_batch + oc * in_shape[2] * in_shape[3];
  466. for (int oy = 0; oy < out_h; oy++)
  467. {
  468. auto in_y = std::min((int32_t)floorf(oy * height_scale), in_shape[2] - 1);
  469. auto in_row = in_c + in_y * in_shape[3];
  470. for (int ox = 0; ox < out_w; ox++)
  471. {
  472. auto in_x = std::min((int32_t)floorf(ox * width_scale), in_shape[3] - 1);
  473. *output++ = in_row[in_x];
  474. }
  475. }
  476. }
  477. }
  478. }
// Bilinear resize of an NCHW tensor to (out_h, out_w). Each output pixel is
// the weighted average of the 2x2 input neighborhood around the back-mapped
// coordinate; the neighbor indices are clamped to the last row/column so
// edge pixels are handled without reading out of bounds.
template <class T>
inline void resize_bilinear(const T *input, T *output, const runtime_shape_t &in_shape, int32_t out_h, int32_t out_w, bool align_corners)
{
    auto height_scale = (float)in_shape[2] / out_h;
    auto width_scale = (float)in_shape[3] / out_w;
    // align_corners remaps so the first and last pixels of input and output
    // coincide exactly (guard against division by zero for size-1 outputs).
    if (align_corners && out_h > 1)
        height_scale = (float)(in_shape[2] - 1) / (out_h - 1);
    if (align_corners && out_w > 1)
        width_scale = (float)(in_shape[3] - 1) / (out_w - 1);

    // Sequential write index; output is produced in NCHW order.
    auto destIdx = 0;
    for (int batch = 0; batch < in_shape[0]; batch++)
    {
        auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
        for (int oc = 0; oc < in_shape[1]; oc++)
        {
            auto in_c = in_batch + (size_t)oc * in_shape[2] * in_shape[3];
            for (int oy = 0; oy < out_h; oy++)
            {
                // Back-map to input space; y0/y1 bracket the sample point.
                auto in_y = oy * height_scale;
                auto in_y0 = (int)floorf(in_y);
                auto in_y1 = std::min(in_y0 + 1, in_shape[2] - 1);
                for (int ox = 0; ox < out_w; ox++)
                {
                    auto in_x = ox * width_scale;
                    auto in_x0 = (int)floorf(in_x);
                    auto in_x1 = std::min(in_x0 + 1, in_shape[3] - 1);

                    // Four corner samples: v0=(y0,x0) v1=(y1,x0) v2=(y0,x1) v3=(y1,x1).
                    auto v0 = in_c[in_y0 * in_shape[3] + in_x0];
                    auto v1 = in_c[in_y1 * in_shape[3] + in_x0];
                    auto v2 = in_c[in_y0 * in_shape[3] + in_x1];
                    auto v3 = in_c[in_y1 * in_shape[3] + in_x1];

                    // Bilinear weights from the fractional parts (sum to 1).
                    auto a0 = (1 - (in_y - in_y0)) * (1 - (in_x - in_x0));
                    auto a1 = (in_y - in_y0) * (1 - (in_x - in_x0));
                    auto a2 = (1 - (in_y - in_y0)) * (in_x - in_x0);
                    auto a3 = (in_y - in_y0) * (in_x - in_x0);

                    output[destIdx++] = T(v0 * a0 + v1 * a1 + v2 * a2 + v3 * a3);
                }
            }
        }
    }
}
  519. inline void softmax(const float *input, float *output, float beta, int32_t outer_size, size_t inner_size)
  520. {
  521. for (size_t batch = 0; batch < outer_size; batch++)
  522. {
  523. auto src = input + batch * inner_size;
  524. auto dest = output + batch * inner_size;
  525. auto max = *std::max_element(src, src + inner_size);
  526. float sum = 0;
  527. for (size_t i = 0; i < inner_size; i++)
  528. {
  529. auto value = expf((src[i] - max) * beta);
  530. sum += value;
  531. dest[i] = value;
  532. }
  533. for (size_t i = 0; i < inner_size; i++)
  534. dest[i] /= sum;
  535. }
  536. }
// Permutes the axes of a 4-D tensor: out_shape[k] = in_shape[perm[k]].
// Iterates every output coordinate `o` (with o[3] in the outermost loop) and
// keeps the matching input coordinate `i` in sync via i[perm[k]] = o[k], so
// each element is copied exactly once. input and output must not alias
// (CXX_RESTRICT).
template <class T>
void transpose(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const runtime_shape_t &in_shape, const runtime_shape_t &perm)
{
    runtime_shape_t out_shape;
    for (size_t i = 0; i < 4; i++)
        out_shape[i] = in_shape[perm[i]];

    // `i` is fully populated before use: perm covers all four axes, and each
    // loop level assigns its component before descending.
    runtime_shape_t i, o;
    for (o[3] = 0; o[3] < out_shape[3]; o[3]++)
    {
        i[perm[3]] = o[3];
        for (o[2] = 0; o[2] < out_shape[2]; o[2]++)
        {
            i[perm[2]] = o[2];
            for (o[1] = 0; o[1] < out_shape[1]; o[1]++)
            {
                i[perm[1]] = o[1];
                for (o[0] = 0; o[0] < out_shape[0]; o[0]++)
                {
                    i[perm[0]] = o[0];
                    output[offset(out_shape, o)] = input[offset(in_shape, i)];
                }
            }
        }
    }
}
  562. template <class T>
  563. void strided_slice(const T *CXX_RESTRICT input, T *CXX_RESTRICT output, const runtime_shape_t &in_shape, const runtime_shape_t &begin, const runtime_shape_t &end, const runtime_shape_t &strides)
  564. {
  565. auto loop_cond = [](int32_t i, int32_t stop, int32_t stride) {
  566. return stride > 0 ? i < stop : i > stop;
  567. };
  568. for (int32_t d0 = begin[0]; loop_cond(d0, end[0], strides[0]); d0 += strides[0])
  569. {
  570. auto d0_origin = input + (size_t)d0 * in_shape[1] * in_shape[2] * in_shape[3];
  571. for (int d1 = begin[1]; loop_cond(d1, end[1], strides[1]); d1 += strides[1])
  572. {
  573. auto d1_origin = d0_origin + (size_t)d1 * in_shape[2] * in_shape[3];
  574. for (int32_t d2 = begin[2]; loop_cond(d2, end[2], strides[2]); d2 += strides[2])
  575. {
  576. auto d2_origin = d1_origin + (size_t)d2 * in_shape[3];
  577. for (int32_t d3 = begin[3]; loop_cond(d3, end[3], strides[3]); d3 += strides[3])
  578. *output++ = d2_origin[d3];
  579. }
  580. }
  581. }
  582. }
// Interprets a compiled NNIL byte-code program once per input element. Each
// iteration runs the program in `body` from the start on a fresh float eval
// stack: nnil_lda_0 pushes the current element, arithmetic opcodes operate on
// the stack, and nnil_ret pops the final value into output[i] and stops that
// element's run. An unknown opcode raises via NNCASE_THROW.
inline void nnil_unary_method(const float *input, float *output, size_t count, xtl::span<const uint8_t> body)
{
    using namespace nncase::runtime;

    for (size_t i = 0; i < count; i++)
    {
        // Fresh stack and reader per element; the program is re-run each time.
        nnil_evalstack stack;
        span_reader sr(body);
        nnil_reader reader(sr);
        bool ret = false;

        while (reader.avail() && !ret)
        {
            auto op = reader.next();
            switch (op.opcode)
            {
            case nnil_nop:
                break;
            // --- stack manipulation ---
            case nnil_dup:
                stack.dup();
                break;
            case nnil_pop:
                stack.pop();
                break;
            // --- loads: current element and float constants ---
            case nnil_lda_0:
                stack.push(input[i]);
                break;
            case nnil_ldc_r4_0:
                stack.push(0.f);
                break;
            case nnil_ldc_r4_1:
                stack.push(1.f);
                break;
            case nnil_ldc_r4:
                // Immediate float operand carried in the instruction.
                stack.push(op.ldc_r4.r4);
                break;
            // --- unary math on the top of stack ---
            case nnil_abs:
                stack.push(fabsf(stack.pop()));
                break;
            case nnil_ceil:
                stack.push(ceilf(stack.pop()));
                break;
            case nnil_cos:
                stack.push(cosf(stack.pop()));
                break;
            case nnil_exp:
                stack.push(expf(stack.pop()));
                break;
            case nnil_floor:
                stack.push(floorf(stack.pop()));
                break;
            case nnil_log:
                stack.push(logf(stack.pop()));
                break;
            case nnil_neg:
                stack.push(-stack.pop());
                break;
            case nnil_rsqrt:
                stack.push(1.f / sqrtf(stack.pop()));
                break;
            case nnil_sin:
                stack.push(sinf(stack.pop()));
                break;
            case nnil_square:
            {
                auto v = stack.pop();
                stack.push(v * v);
                break;
            }
            // --- binary math: right operand is popped first ---
            case nnil_add:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a + b);
                break;
            }
            case nnil_sub:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a - b);
                break;
            }
            case nnil_mul:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a * b);
                break;
            }
            case nnil_div:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(a / b);
                break;
            }
            case nnil_min:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(std::min(a, b));
                break;
            }
            case nnil_max:
            {
                auto b = stack.pop();
                auto a = stack.pop();
                stack.push(std::max(a, b));
                break;
            }
            case nnil_clamp:
            {
                // Operand order on the stack: value, low, high (high on top).
                auto high = stack.pop();
                auto low = stack.pop();
                auto v = stack.pop();
                stack.push(std::clamp(v, low, high));
                break;
            }
            // ---终 return: store result and stop this element's program ---
            case nnil_ret:
                output[i] = stack.pop();
                ret = true;
                break;
            default:
                NNCASE_THROW(std::runtime_error, "Invalid nnil op");
                break;
            }
        }
    }
}
  711. inline void table_lookup1d(const uint8_t *CXX_RESTRICT input, uint8_t *CXX_RESTRICT output, size_t size, const uint8_t *CXX_RESTRICT table)
  712. {
  713. for (size_t i = 0; i < size; i++)
  714. output[i] = table[input[i]];
  715. }
  716. }
  717. }
  718. }