/*
 * SPDX-FileCopyrightText: Copyright 2010-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_3x3_s8.c
 * Description:  Optimized s8 depthwise convolution function for channel
 *               multiplier of 1 and 3x3 kernel size.
 *
 * $Date:        5 January 2023
 * $Revision:    V.3.2.0
 *
 * Target : Arm(R) M-Profile Architecture
 *
 * -------------------------------------------------------------------- */
#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
 *  @ingroup Public
 */

/**
 * @addtogroup NNConv
 * @{
 */

/*
 * Optimized s8 depthwise convolution function with constraint that
 * in_channel == out_channel and kernel_x == kernel_y == 3 with pads at most 1
 *
 * Refer prototype header file for details.
 */
arm_cmsis_nn_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
                                              const cmsis_nn_dw_conv_params *dw_conv_params,
                                              const cmsis_nn_per_channel_quant_params *quant_params,
                                              const cmsis_nn_dims *input_dims,
                                              const int8_t *input,
                                              const cmsis_nn_dims *filter_dims,
                                              const int8_t *kernel,
                                              const cmsis_nn_dims *bias_dims,
                                              const int32_t *bias,
                                              const cmsis_nn_dims *output_dims,
                                              int8_t *output)
{
    (void)ctx;
    (void)bias_dims;

    const int32_t input_x = input_dims->w;
    const int32_t input_y = input_dims->h;
    const int32_t input_ch = input_dims->c;
    const int32_t output_ch = output_dims->c;
    const int32_t pad_x = dw_conv_params->padding.w;
    const int32_t pad_y = dw_conv_params->padding.h;
    const int32_t stride_x = dw_conv_params->stride.w;
    const int32_t stride_y = dw_conv_params->stride.h;
    const int32_t *output_shift = quant_params->shift;
    const int32_t *output_mult = quant_params->multiplier;
    const int32_t output_x = output_dims->w;
    const int32_t output_y = output_dims->h;
    const int32_t output_offset = dw_conv_params->output_offset;
    const int32_t input_offset = dw_conv_params->input_offset;
    const int32_t output_activation_min = dw_conv_params->activation.min;
    const int32_t output_activation_max = dw_conv_params->activation.max;

    /* Check input constraints input_ch == output_ch (channel multiplier of 1). */
    if (input_ch != output_ch)
    {
        return ARM_CMSIS_NN_ARG_ERROR;
    }
    /* Check input constraints pad_x <= 1 and a 3x3 kernel. Only the horizontal
     * pad is restricted: rows outside the image are skipped via ker_h_start
     * below, while the column handling hard-codes "at most one padded column
     * on either side". */
    if (pad_x > 1 || filter_dims->w != 3 || filter_dims->h != 3)
    {
        return ARM_CMSIS_NN_ARG_ERROR;
    }

    /* 'bias' is reused as a walking cursor over the per-channel bias values;
     * keep the start so it can be rewound for every output pixel. */
    const int32_t *bias_base = bias;

    /* in_h/in_w track the top-left input coordinate of the current 3x3 window
     * (may be -1 when padding). out_idx walks the NHWC output linearly. */
    for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
    {
        /* ker_h_start skips kernel rows that fall above the image (in_h < 0). */
        for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
        {
            int32_t in_ch = 0;
            /* ker_w_start == 1 when the left column of the window is padding. */
            int32_t ker_w_start = MAX(0, -in_w);

            bias = bias_base;

            /* Main loop: process 4 channels per iteration. */
            for (; in_ch <= (input_ch - 4); in_ch += 4)
            {
                int32_t out_buff0 = 0;
                int32_t out_buff1 = 0;
                int32_t out_buff2 = 0;
                int32_t out_buff3 = 0;
                if (bias)
                {
                    out_buff0 = *bias++;
                    out_buff1 = *bias++;
                    out_buff2 = *bias++;
                    out_buff3 = *bias++;
                }

                /* Point at the first valid (non-padded) kernel row of the window,
                 * at the current channel group. Data layout is NHWC. */
                const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
                const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;

#if defined(ARM_MATH_DSP)
                /* input_offset duplicated into both 16-bit halves so SXTAB16 can
                 * add it while widening two s8 lanes at once. */
                const uint32_t lhs_offset_s16x2 = PKHBT(input_offset, input_offset, 16);

                /* Loop over valid kernel rows; rows below the image are clipped
                 * by the MIN() bound. */
                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    int32_t in_val = 0;
                    int32_t ker_val = 0;
                    int32_t in_val_1 = 0;
                    int32_t ker_val_1 = 0;

                    /* Kernel column 0: only when the left column is inside the image. */
                    if (ker_w_start == 0)
                    {
                        in_val = arm_nn_read_s8x4(input_ptr);
                        ker_val = arm_nn_read_s8x4(kernel_ptr);
                        /* *_1 holds lanes 1 and 3 (odd bytes, rotated into the
                         * 16-bit slots); the un-rotated pair holds lanes 0 and 2. */
                        in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8);
                        ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8);
                        out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1);
                        in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val);
                        out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3);
                        ker_val = SXTB16((uint32_t)ker_val);
                        out_buff0 = SMLABB(in_val, ker_val, out_buff0);
                        out_buff2 = SMLATT(in_val, ker_val, out_buff2);
                    }

                    /* Kernel column 1 (center): always inside the image since pad_x <= 1. */
                    in_val = arm_nn_read_s8x4(input_ptr + input_ch);
                    ker_val = arm_nn_read_s8x4(kernel_ptr + input_ch);
                    in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8);
                    ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8);
                    out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1);
                    in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val);
                    out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3);
                    ker_val = SXTB16((uint32_t)ker_val);
                    out_buff0 = SMLABB(in_val, ker_val, out_buff0);
                    out_buff2 = SMLATT(in_val, ker_val, out_buff2);

                    /* Kernel column 2: only when the right column is inside the image. */
                    if ((input_x - in_w) >= 3)
                    {
                        in_val = arm_nn_read_s8x4(input_ptr + (input_ch << 1));
                        ker_val = arm_nn_read_s8x4(kernel_ptr + (input_ch << 1));
                        in_val_1 = SXTAB16_RORn(lhs_offset_s16x2, (uint32_t)in_val, 8);
                        ker_val_1 = SXTB16_RORn((uint32_t)ker_val, 8);
                        out_buff1 = SMLABB(in_val_1, ker_val_1, out_buff1);
                        in_val = SXTAB16(lhs_offset_s16x2, (uint32_t)in_val);
                        out_buff3 = SMLATT(in_val_1, ker_val_1, out_buff3);
                        ker_val = SXTB16((uint32_t)ker_val);
                        out_buff0 = SMLABB(in_val, ker_val, out_buff0);
                        out_buff2 = SMLATT(in_val, ker_val, out_buff2);
                    }

                    /* Advance one input row / one kernel row. */
                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }
#else
                /* Pure-C fallback: same traversal, unpacking the 4 channels from
                 * the 32-bit word by shifting. */
                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    int32_t in_val = 0;
                    int32_t ker_val = 0;

                    if (ker_w_start == 0)
                    {
                        in_val = arm_nn_read_s8x4(input_ptr);
                        ker_val = arm_nn_read_s8x4(kernel_ptr);
                        out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                        out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                        out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                        out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    }

                    in_val = arm_nn_read_s8x4(input_ptr + input_ch);
                    ker_val = arm_nn_read_s8x4(kernel_ptr + input_ch);
                    out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                    out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                    out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                    out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);

                    if ((input_x - in_w) >= 3)
                    {
                        in_val = arm_nn_read_s8x4(input_ptr + (input_ch << 1));
                        ker_val = arm_nn_read_s8x4(kernel_ptr + (input_ch << 1));
                        out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                        out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                        out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                        out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    }

                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }
#endif
                /* Per-channel requantization, then output offset and clamping to
                 * the activation range. */
                out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]);
                out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]);
                out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]);
                out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]);

                out_buff0 += output_offset;
                out_buff1 += output_offset;
                out_buff2 += output_offset;
                out_buff3 += output_offset;

                out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max);
                out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max);
                out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max);
                out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max);

                output[out_idx++] = (int8_t)out_buff0;
                output[out_idx++] = (int8_t)out_buff1;
                output[out_idx++] = (int8_t)out_buff2;
                output[out_idx++] = (int8_t)out_buff3;
            }

            // Leftover: remaining 1-3 channels, one at a time.
            for (; in_ch < input_ch; ++in_ch)
            {
                int32_t out_buff = 0;
                if (bias)
                {
                    out_buff = *bias++;
                }

                const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
                const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;

                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    /* Same left/center/right column handling as the SIMD path. */
                    if (ker_w_start == 0)
                    {
                        out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr);
                    }
                    out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch);
                    if ((input_x - in_w) >= 3)
                    {
                        out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1));
                    }

                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }

                out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]);
                out_buff += output_offset;
                out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max);
                output[out_idx++] = (int8_t)out_buff;
            }
        }
    }

    /* Return to application */
    return ARM_CMSIS_NN_SUCCESS;
}
/**
 * @} end of NNConv group
 */