cpu_kernels.h 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
/* Copyright 2019-2020 Canaan Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
  15. #pragma once
  16. #include "../kernel_utils.h"
  17. #include <runtime/runtime_op_utility.h>
  18. namespace nncase
  19. {
  20. namespace kernels
  21. {
  22. namespace cpu
  23. {
  24. inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
  25. int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  26. const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
  27. {
  28. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  29. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  30. for (int batch = 0; batch < in_shape[0]; batch++)
  31. {
  32. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  33. for (int oy = 0; oy < out_h; oy++)
  34. {
  35. for (int ox = 0; ox < out_w; ox++)
  36. {
  37. int in_y_origin = (oy * stride_h) - padding_h.before;
  38. int in_x_origin = (ox * stride_w) - padding_w.before;
  39. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  40. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  41. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  42. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  43. for (int oc = 0; oc < out_channels; oc++)
  44. {
  45. auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
  46. float value = bias[oc];
  47. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  48. {
  49. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  50. {
  51. int in_y = in_y_origin + dilation_h * ky;
  52. int in_x = in_x_origin + dilation_w * kx;
  53. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  54. auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
  55. for (int ic = 0; ic < in_shape[3]; ic++)
  56. value += in_pix[ic] * w_pix[ic];
  57. }
  58. }
  59. *output++ = details::apply_activation(value, fused_activation);
  60. }
  61. }
  62. }
  63. }
  64. }
  65. inline void depthwise_conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
  66. int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  67. const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
  68. {
  69. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  70. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  71. for (int batch = 0; batch < in_shape[0]; batch++)
  72. {
  73. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  74. for (int oy = 0; oy < out_h; oy++)
  75. {
  76. for (int ox = 0; ox < out_w; ox++)
  77. {
  78. int in_y_origin = (oy * stride_h) - padding_h.before;
  79. int in_x_origin = (ox * stride_w) - padding_w.before;
  80. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  81. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  82. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  83. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  84. for (int oc = 0; oc < in_shape[3]; oc++)
  85. {
  86. auto w_oc = weights + (size_t)oc * filter_h * filter_w;
  87. float value = bias[oc];
  88. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  89. {
  90. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  91. {
  92. int in_y = in_y_origin + dilation_h * ky;
  93. int in_x = in_x_origin + dilation_w * kx;
  94. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  95. auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
  96. value += in_pix[oc] * w_pix[0];
  97. }
  98. }
  99. *output++ = details::apply_activation(value, fused_activation);
  100. }
  101. }
  102. }
  103. }
  104. }
  105. template <class TBinaryOp, class TOutputOp>
  106. void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape,
  107. int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  108. const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
  109. {
  110. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  111. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  112. for (int batch = 0; batch < in_shape[0]; batch++)
  113. {
  114. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  115. for (int oy = 0; oy < out_h; oy++)
  116. {
  117. for (int ox = 0; ox < out_w; ox++)
  118. {
  119. int in_y_origin = (oy * stride_h) - padding_h.before;
  120. int in_x_origin = (ox * stride_w) - padding_w.before;
  121. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  122. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  123. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  124. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  125. for (int oc = 0; oc < in_shape[3]; oc++)
  126. {
  127. float value = init_value;
  128. int32_t kernel_count = 0;
  129. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  130. {
  131. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  132. {
  133. int in_y = in_y_origin + dilation_h * ky;
  134. int in_x = in_x_origin + dilation_w * kx;
  135. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  136. value = binary_op(value, in_pix[oc]);
  137. kernel_count++;
  138. }
  139. }
  140. *output++ = details::apply_activation(window_op(value, kernel_count), fused_activation);
  141. }
  142. }
  143. }
  144. }
  145. }
  146. inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
  147. int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  148. const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
  149. {
  150. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  151. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  152. for (int batch = 0; batch < in_shape[0]; batch++)
  153. {
  154. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  155. for (int oy = 0; oy < out_h; oy++)
  156. {
  157. for (int ox = 0; ox < out_w; ox++)
  158. {
  159. int in_y_origin = (oy * stride_h) - padding_h.before;
  160. int in_x_origin = (ox * stride_w) - padding_w.before;
  161. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  162. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  163. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  164. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  165. for (int oc = 0; oc < out_channels; oc++)
  166. {
  167. auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
  168. int32_t value = bias[oc];
  169. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  170. {
  171. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  172. {
  173. int in_y = in_y_origin + dilation_h * ky;
  174. int in_x = in_x_origin + dilation_w * kx;
  175. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  176. auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
  177. for (int ic = 0; ic < in_shape[3]; ic++)
  178. value += (in_pix[ic] - input_offset) * (w_pix[ic] - filter_offset);
  179. }
  180. }
  181. value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
  182. *output++ = (uint8_t)std::clamp(value, 0, 255);
  183. }
  184. }
  185. }
  186. }
  187. }
  188. inline void quantized_depthwise_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
  189. int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  190. const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
  191. {
  192. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  193. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  194. for (int batch = 0; batch < in_shape[0]; batch++)
  195. {
  196. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  197. for (int oy = 0; oy < out_h; oy++)
  198. {
  199. for (int ox = 0; ox < out_w; ox++)
  200. {
  201. int in_y_origin = (oy * stride_h) - padding_h.before;
  202. int in_x_origin = (ox * stride_w) - padding_w.before;
  203. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  204. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  205. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  206. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  207. for (int oc = 0; oc < in_shape[3]; oc++)
  208. {
  209. auto w_oc = weights + (size_t)oc * filter_h * filter_w;
  210. int32_t value = bias[oc];
  211. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  212. {
  213. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  214. {
  215. int in_y = in_y_origin + dilation_h * ky;
  216. int in_x = in_x_origin + dilation_w * kx;
  217. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  218. auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
  219. value += (in_pix[oc] - input_offset) * (w_pix[0] - filter_offset);
  220. }
  221. }
  222. value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
  223. *output++ = (uint8_t)std::clamp(value, 0, 255);
  224. }
  225. }
  226. }
  227. }
  228. }
  229. }
  230. }
  231. }