cpu_kernels.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. #pragma once
  2. #include "../utils.h"
  3. #include <runtime_op_utility.h>
  4. namespace nncase
  5. {
  6. namespace kernels
  7. {
  8. namespace cpu
  9. {
  10. inline void conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
  11. int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  12. const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
  13. {
  14. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  15. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  16. for (int batch = 0; batch < in_shape[0]; batch++)
  17. {
  18. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  19. for (int oy = 0; oy < out_h; oy++)
  20. {
  21. for (int ox = 0; ox < out_w; ox++)
  22. {
  23. int in_y_origin = (oy * stride_h) - padding_h.before;
  24. int in_x_origin = (ox * stride_w) - padding_w.before;
  25. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  26. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  27. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  28. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  29. for (int oc = 0; oc < out_channels; oc++)
  30. {
  31. auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
  32. float value = bias[oc];
  33. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  34. {
  35. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  36. {
  37. int in_y = in_y_origin + dilation_h * ky;
  38. int in_x = in_x_origin + dilation_w * kx;
  39. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  40. auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
  41. for (int ic = 0; ic < in_shape[3]; ic++)
  42. value += in_pix[ic] * w_pix[ic];
  43. }
  44. }
  45. *output++ = details::apply_activation(value, fused_activation);
  46. }
  47. }
  48. }
  49. }
  50. }
  51. inline void depthwise_conv2d(const float *input, float *output, const float *weights, const float *bias, const runtime_shape_t &in_shape,
  52. int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  53. const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation)
  54. {
  55. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  56. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  57. for (int batch = 0; batch < in_shape[0]; batch++)
  58. {
  59. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  60. for (int oy = 0; oy < out_h; oy++)
  61. {
  62. for (int ox = 0; ox < out_w; ox++)
  63. {
  64. int in_y_origin = (oy * stride_h) - padding_h.before;
  65. int in_x_origin = (ox * stride_w) - padding_w.before;
  66. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  67. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  68. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  69. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  70. for (int oc = 0; oc < in_shape[3]; oc++)
  71. {
  72. auto w_oc = weights + (size_t)oc * filter_h * filter_w;
  73. float value = bias[oc];
  74. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  75. {
  76. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  77. {
  78. int in_y = in_y_origin + dilation_h * ky;
  79. int in_x = in_x_origin + dilation_w * kx;
  80. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  81. auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
  82. value += in_pix[oc] * w_pix[0];
  83. }
  84. }
  85. *output++ = details::apply_activation(value, fused_activation);
  86. }
  87. }
  88. }
  89. }
  90. }
  91. template <class TBinaryOp, class TOutputOp>
  92. void reduce_window2d(const float *input, float *output, float init_value, const runtime_shape_t &in_shape,
  93. int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  94. const padding &padding_h, const padding &padding_w, const value_range<float> &fused_activation, TBinaryOp &&binary_op, TOutputOp &&window_op)
  95. {
  96. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  97. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  98. for (int batch = 0; batch < in_shape[0]; batch++)
  99. {
  100. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  101. for (int oy = 0; oy < out_h; oy++)
  102. {
  103. for (int ox = 0; ox < out_w; ox++)
  104. {
  105. int in_y_origin = (oy * stride_h) - padding_h.before;
  106. int in_x_origin = (ox * stride_w) - padding_w.before;
  107. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  108. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  109. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  110. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  111. for (int oc = 0; oc < in_shape[3]; oc++)
  112. {
  113. float value = init_value;
  114. int32_t kernel_count = 0;
  115. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  116. {
  117. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  118. {
  119. int in_y = in_y_origin + dilation_h * ky;
  120. int in_x = in_x_origin + dilation_w * kx;
  121. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  122. value = binary_op(value, in_pix[oc]);
  123. kernel_count++;
  124. }
  125. }
  126. *output++ = details::apply_activation(window_op(value, kernel_count), fused_activation);
  127. }
  128. }
  129. }
  130. }
  131. }
  132. inline void quantized_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
  133. int32_t out_channels, int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  134. const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
  135. {
  136. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  137. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  138. for (int batch = 0; batch < in_shape[0]; batch++)
  139. {
  140. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  141. for (int oy = 0; oy < out_h; oy++)
  142. {
  143. for (int ox = 0; ox < out_w; ox++)
  144. {
  145. int in_y_origin = (oy * stride_h) - padding_h.before;
  146. int in_x_origin = (ox * stride_w) - padding_w.before;
  147. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  148. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  149. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  150. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  151. for (int oc = 0; oc < out_channels; oc++)
  152. {
  153. auto w_oc = weights + (size_t)oc * filter_h * filter_w * in_shape[3];
  154. int32_t value = bias[oc];
  155. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  156. {
  157. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  158. {
  159. int in_y = in_y_origin + dilation_h * ky;
  160. int in_x = in_x_origin + dilation_w * kx;
  161. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  162. auto w_pix = w_oc + ((size_t)ky * filter_w + kx) * in_shape[3];
  163. for (int ic = 0; ic < in_shape[3]; ic++)
  164. value += (in_pix[ic] - input_offset) * (w_pix[ic] - filter_offset);
  165. }
  166. }
  167. value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
  168. *output++ = (uint8_t)std::clamp(value, 0, 255);
  169. }
  170. }
  171. }
  172. }
  173. }
  174. inline void quantized_depthwise_conv2d(const uint8_t *input, uint8_t *output, const uint8_t *weights, const int32_t *bias, const runtime_shape_t &in_shape,
  175. int32_t filter_h, int32_t filter_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w,
  176. const padding &padding_h, const padding &padding_w, int32_t input_offset, int32_t filter_offset, int32_t output_mul, int32_t output_shift, int32_t output_offset)
  177. {
  178. const auto out_h = details::get_windowed_output_size(in_shape[1], filter_h, stride_h, dilation_h, padding_h);
  179. const auto out_w = details::get_windowed_output_size(in_shape[2], filter_w, stride_w, dilation_w, padding_w);
  180. for (int batch = 0; batch < in_shape[0]; batch++)
  181. {
  182. auto in_batch = input + (size_t)batch * in_shape[1] * in_shape[2] * in_shape[3];
  183. for (int oy = 0; oy < out_h; oy++)
  184. {
  185. for (int ox = 0; ox < out_w; ox++)
  186. {
  187. int in_y_origin = (oy * stride_h) - padding_h.before;
  188. int in_x_origin = (ox * stride_w) - padding_w.before;
  189. int filter_y_start = std::max(0, (-in_y_origin + dilation_h - 1) / dilation_h);
  190. int filter_y_end = std::min(filter_h, (in_shape[1] - in_y_origin + dilation_h - 1) / dilation_h);
  191. int filter_xSstart = std::max(0, (-in_x_origin + dilation_w - 1) / dilation_w);
  192. int filter_x_end = std::min(filter_w, (in_shape[2] - in_x_origin + dilation_w - 1) / dilation_w);
  193. for (int oc = 0; oc < in_shape[3]; oc++)
  194. {
  195. auto w_oc = weights + (size_t)oc * filter_h * filter_w;
  196. int32_t value = bias[oc];
  197. for (int ky = filter_y_start; ky < filter_y_end; ky++)
  198. {
  199. for (int kx = filter_xSstart; kx < filter_x_end; kx++)
  200. {
  201. int in_y = in_y_origin + dilation_h * ky;
  202. int in_x = in_x_origin + dilation_w * kx;
  203. auto in_pix = in_batch + ((size_t)in_y * in_shape[2] + in_x) * in_shape[3];
  204. auto w_pix = w_oc + ((size_t)ky * filter_w + kx);
  205. value += (in_pix[oc] - input_offset) * (w_pix[0] - filter_offset);
  206. }
  207. }
  208. value = runtime::mul_and_carry_shift(value, output_mul, output_shift) + output_offset;
  209. *output++ = (uint8_t)std::clamp(value, 0, 255);
  210. }
  211. }
  212. }
  213. }
  214. }
  215. }
  216. }
  217. }