arm_depthwise_conv_s8_opt.c

/*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_s8_opt.c
 * Description:  Optimized s8 depthwise separable convolution function for
 *               channel multiplier of 1.
 *
 * $Date:        February 27, 2020
 * $Revision:    V.1.0.1
 *
 * Target Processor:  Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_math.h"
#include "arm_nnsupportfunctions.h"
#include "arm_nnfunctions.h"

/**
 * @ingroup groupNN
 */

/**
 * @addtogroup NNConv
 * @{
 */

/*
 * Optimized s8 depthwise convolution function with the constraint that
 * in_channel equals out_channel
 *
 * Refer to the prototype header file for details.
 *
 */
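/*
 * Note: buffer_a must point to scratch memory of at least the size returned
 * by arm_depthwise_conv_s8_opt_get_buffer_size() below; the required size
 * depends on which target-specific path is compiled in.
 */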
arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
                                     const uint16_t input_x,
                                     const uint16_t input_y,
                                     const uint16_t input_ch,
                                     const q7_t *kernel,
                                     const uint16_t output_ch,
                                     const uint16_t kernel_x,
                                     const uint16_t kernel_y,
                                     const uint16_t pad_x,
                                     const uint16_t pad_y,
                                     const uint16_t stride_x,
                                     const uint16_t stride_y,
                                     const int32_t *bias,
                                     q7_t *output,
                                     const int32_t *output_shift,
                                     const int32_t *output_mult,
                                     const uint16_t output_x,
                                     const uint16_t output_y,
                                     const int32_t output_offset,
                                     const int32_t input_offset,
                                     const int32_t output_activation_min,
                                     const int32_t output_activation_max,
                                     const uint16_t dilation_x,
                                     const uint16_t dilation_y,
                                     q15_t *buffer_a)
{
    /* Check input constraints: input_ch == output_ch */
    if (input_ch != output_ch)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

#ifdef ARM_MATH_MVEI
    (void)dilation_x;
    (void)dilation_y;

    /* Generate two columns from the input tensor */
    q15_t *two_column_buf = buffer_a;
    q7_t *out = output;

    /* This part implements the im2col function */
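    /*
     * Each im2col column holds one input patch: kernel_y * kernel_x * input_ch
     * values, widened to q15 with input_offset added. Columns are written into
     * buffer_a back to back; once two columns are complete, they are handed to
     * arm_nn_depthwise_conv_s8_core() below.
     */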
    for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
    {
        const int32_t base_idx_y = i_out_y * stride_y - pad_y;
        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
        {
            const int32_t base_idx_x = (i_out_x * stride_x) - pad_x;
            for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
            {
                for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
                {
                    if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
                    {
                        /* Fill zeros for out-of-bounds padding */
                        memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
                    }
                    else
                    {
                        /* Copy the pixel data to the column */
                        arm_q7_to_q15_with_offset(input + (i_ker_y * input_x + i_ker_x) * input_ch,
                                                  two_column_buf,
                                                  input_ch,
                                                  input_offset);
                    }
                    two_column_buf += input_ch;
                }
            }
            /* Computation is performed once every two columns are filled */
            if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
            {
                two_column_buf = buffer_a;
                out = arm_nn_depthwise_conv_s8_core(kernel,
                                                    buffer_a,
                                                    output_ch,
                                                    output_shift,
                                                    output_mult,
                                                    output_offset,
                                                    output_activation_min,
                                                    output_activation_max,
                                                    kernel_x * kernel_y,
                                                    bias,
                                                    out);
            }
        }
    }
    /* Left-over pixels */
    if (two_column_buf != buffer_a)
    {
        int32_t ch_count = (output_ch + 3) / 4;
        const int32_t *out_bias = bias;
        int32_t idx = 0;
        int32_t out_ch = output_ch;
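        /*
         * Tail handling: a single leftover column is processed four channels
         * at a time with tail-predicated MVE intrinsics; vctp32q() masks off
         * lanes beyond the remaining channel count on the final iteration.
         */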
        while (ch_count > 0)
        {
            int32_t ker_count = kernel_x * kernel_y;
            const int32_t offset = idx * 4;
            const int8_t *row = kernel + offset;
            int16_t *col = buffer_a + offset;

            /* Predicate covering the channels remaining in this group of 4 */
            mve_pred16_t p = vctp32q(out_ch);
            int32x4_t res = vldrwq_z_s32(out_bias, p);
            out_bias += 4;

            while (ker_count > 0)
            {
                /* Widening loads: q15 input and q7 kernel lanes to 32 bits */
                const int32x4_t ip = vldrhq_z_s32(col, p);
                const int32x4_t ker = vldrbq_z_s32(row, p);
                col += output_ch;
                row += output_ch;
                /* vmlasq_n_s32(ip, ker, 0) is the element-wise product ip * ker */
                res += vmlasq_n_s32(ip, ker, 0);
                ker_count--;
            }

            int32x4_t mult = vldrwq_z_s32(output_mult, p);
            int32x4_t shift = vldrwq_z_s32(output_shift, p);
            output_mult += 4;
            output_shift += 4;

            /* Requantize, add the output offset and clamp to the activation range */
            res = arm_requantize_mve_32x4(res, mult, shift);
            res = vaddq_n_s32(res, output_offset);
            res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
            res = vminq_s32(res, vdupq_n_s32(output_activation_max));

            /* Predicated narrowing store of the 4 q7 results */
            vstrbq_p_s32(out, res, p);
            out += 4;
            idx++;
            out_ch -= 4;
            ch_count--;
        }
    }
#elif defined(ARM_MATH_DSP)
    /* Run the following code on cores with the DSP extension */
    (void)dilation_x;
    (void)dilation_y;

    q15_t *const col_buffer_start = buffer_a;
    q15_t *col_buffer = col_buffer_start;

    const int32_t *const bias_start_pos = bias;
    const q31_t *const out_mult_start_pos = output_mult;
    const q31_t *const out_shift_start_pos = output_shift;

    uint16_t row_count;
    uint16_t row_shift;
    for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
    {
        const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
        {
            const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;

            /* Out-of-bounds handling is done explicitly only along the y axis, as
               it offers a larger contiguous zeroing opportunity than the x axis */
            const int ker_y_start = MAX(0, -base_idx_y);
            /* Condition for the kernel end dimension: (base_idx_y + ker_y_end) < input_y */
            const int ker_y_end = MIN(kernel_y, input_y - base_idx_y);

            int32_t index = 0;
            if (ker_y_start != 0)
            {
                memset(&col_buffer[index], 0, (kernel_x * input_ch) * ker_y_start * sizeof(q15_t));
                index += (kernel_x * input_ch) * ker_y_start;
            }
            for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
            {
                const int32_t idx_y = base_idx_y + i_ker_y;
                for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
                {
                    const int32_t idx_x = base_idx_x + i_ker_x;
                    if (idx_x < 0 || idx_x >= input_x)
                    {
                        memset(&col_buffer[index], 0, input_ch * sizeof(q15_t));
                    }
                    else
                    {
                        arm_q7_to_q15_with_offset((q7_t *)input + (idx_y * input_x + idx_x) * input_ch,
                                                  &col_buffer[index],
                                                  input_ch,
                                                  input_offset);
                    }
                    index += input_ch;
                }
            }

            const int diff = kernel_y - ker_y_end;
            if (diff != 0)
            {
                memset(&col_buffer[index], 0, (kernel_x * input_ch) * diff * sizeof(q15_t));
            }
            row_count = output_ch / 4;
            row_shift = 0;
            bias = bias_start_pos;
            output_mult = out_mult_start_pos;
            output_shift = out_shift_start_pos;
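            /*
             * The column buffer now holds one complete im2col column. The loop
             * below computes four output channels per iteration, accumulating
             * four depthwise dot products of length kernel_x * kernel_y with
             * two kernel taps consumed per SMLAD.
             */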
            while (row_count)
            {
                q31_t sum = *bias++;
                q31_t sum_2 = *bias++;
                q31_t sum_3 = *bias++;
                q31_t sum_4 = *bias++;

                uint16_t col_count = (kernel_x * kernel_y) / 2;
                q15_t *col_pos = col_buffer_start + row_shift;
                const q7_t *row_pos = kernel + row_shift;
                row_shift += 4;

                while (col_count)
                {
                    /* The general idea is to read 4 + 4 (input, kernel) pairs and
                       re-arrange them in the right order for use in an SMLAD
                       instruction. One run of this loop produces 4 partial outputs
                       with 8 MACs. */
                    /* Note: variable names could be improved here to align with rows and columns. */
                    q31_t ip_a1, ip_a2, ip_b1, ip_b2, op_a, op_b, op_c;

                    /* Read 4 weights */
                    ip_b1 = arm_nn_read_q7x4(row_pos);
                    ip_a1 = arm_nn_read_q7x4(row_pos + input_ch);
                    op_a = arm_nn_read_q15x2(col_pos);
                    op_b = arm_nn_read_q15x2(col_pos + input_ch);

                    ip_a2 = __SXTB16(ip_b1);
                    ip_b1 = __SXTB16(__ROR(ip_b1, 8));

                    ip_b2 = __SXTB16(ip_a1);
                    ip_a1 = __SXTB16(__ROR(ip_a1, 8));

                    op_c = __PKHBT(op_b, op_a, 16);
                    op_a = __PKHTB(op_b, op_a, 16);
                    op_b = __PKHBT(ip_b2, ip_a2, 16);
                    sum = __SMLAD(op_c, op_b, sum);

                    op_b = __PKHBT(ip_b1, ip_a1, 16);
                    sum_2 = __SMLAD(op_a, op_b, sum_2);

                    op_a = arm_nn_read_q15x2(col_pos + 2);
                    op_b = arm_nn_read_q15x2(col_pos + input_ch + 2);

                    op_c = __PKHBT(op_b, op_a, 16);
                    op_a = __PKHTB(op_b, op_a, 16);
                    op_b = __PKHTB(ip_a2, ip_b2, 16);
                    sum_3 = __SMLAD(op_c, op_b, sum_3);

                    op_b = __PKHTB(ip_a1, ip_b1, 16);
                    sum_4 = __SMLAD(op_a, op_b, sum_4);

                    row_pos += input_ch << 1;
                    col_pos += input_ch << 1;
                    col_count--;
                }
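                /* Handle the remaining tap when kernel_x * kernel_y is odd */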
                col_count = (kernel_x * kernel_y) & 0x1;
                while (col_count)
                {
                    sum += row_pos[0] * col_pos[0];
                    sum_2 += row_pos[1] * col_pos[1];
                    sum_3 += row_pos[2] * col_pos[2];
                    sum_4 += row_pos[3] * col_pos[3];
                    row_pos += input_ch;
                    col_pos += input_ch;
                    col_count--;
                }

                sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
                sum += output_offset;
                sum = MAX(sum, output_activation_min);
                sum = MIN(sum, output_activation_max);
                *output++ = (q7_t)sum;

                sum_2 = arm_nn_requantize(sum_2, *output_mult++, *output_shift++);
                sum_2 += output_offset;
                sum_2 = MAX(sum_2, output_activation_min);
                sum_2 = MIN(sum_2, output_activation_max);
                *output++ = (q7_t)sum_2;

                sum_3 = arm_nn_requantize(sum_3, *output_mult++, *output_shift++);
                sum_3 += output_offset;
                sum_3 = MAX(sum_3, output_activation_min);
                sum_3 = MIN(sum_3, output_activation_max);
                *output++ = (q7_t)sum_3;

                sum_4 = arm_nn_requantize(sum_4, *output_mult++, *output_shift++);
                sum_4 += output_offset;
                sum_4 = MAX(sum_4, output_activation_min);
                sum_4 = MIN(sum_4, output_activation_max);
                *output++ = (q7_t)sum_4;

                row_count--;
            }
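            /* Handle the remaining 1..3 output channels one at a time */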
            row_count = output_ch & 0x3;
            while (row_count)
            {
                q15_t *col_pos = col_buffer_start + row_shift;
                const q7_t *row_pos = kernel + row_shift;
                q31_t sum = *bias++;
                const uint16_t col_count = (kernel_x * kernel_y);
                row_shift += 1;

                for (int i = 0; i < col_count; i++)
                {
                    sum += row_pos[i * input_ch] * col_pos[i * input_ch];
                }
                sum = arm_nn_requantize(sum, *output_mult++, *output_shift++);
                sum += output_offset;
                sum = MAX(sum, output_activation_min);
                sum = MIN(sum, output_activation_max);
                *output++ = (q7_t)sum;

                row_count--;
            }

            /* Reset the column buffer pointer for the next output pixel */
            col_buffer = col_buffer_start;
        }
    }
#else
    (void)buffer_a;
    /* Run the following code as a reference implementation for Cortex-M0 and Cortex-M3 */
    return arm_depthwise_conv_s8(input,
                                 input_x,
                                 input_y,
                                 input_ch,
                                 kernel,
                                 output_ch,
                                 1,
                                 kernel_x,
                                 kernel_y,
                                 pad_x,
                                 pad_y,
                                 stride_x,
                                 stride_y,
                                 bias,
                                 output,
                                 output_shift,
                                 output_mult,
                                 output_x,
                                 output_y,
                                 output_offset,
                                 input_offset,
                                 output_activation_min,
                                 output_activation_max,
                                 dilation_x,
                                 dilation_y,
                                 NULL);
#endif /* ARM_MATH_MVEI | ARM_MATH_DSP */

    /* Return to application */
    return ARM_MATH_SUCCESS;
}

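/*
 * Scratch buffer size in bytes required by arm_depthwise_conv_s8_opt():
 * two im2col columns for the MVE path, one column for the DSP path, and
 * no scratch at all for the pure C fallback.
 */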
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const uint16_t input_ch,
                                                  const uint16_t kernel_x,
                                                  const uint16_t kernel_y)
{
#if defined(ARM_MATH_MVEI)
    return (2 * input_ch * kernel_x * kernel_y) * sizeof(int16_t);
#elif defined(ARM_MATH_DSP)
    return (input_ch * kernel_x * kernel_y) * sizeof(int16_t);
#else
    (void)input_ch;
    (void)kernel_x;
    (void)kernel_y;
    return 0;
#endif
}
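/*
 * Example (illustrative figures, not from the library): a 3x3 kernel over
 * 16 channels needs 16 * 3 * 3 * sizeof(int16_t) = 288 bytes of scratch on
 * the DSP path, and twice that (576 bytes) on the MVE path.
 */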

/**
 * @} end of NNConv group
 */