arm_max_pool_s8_opt.c 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. /*
  2. * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
  3. *
  4. * SPDX-License-Identifier: Apache-2.0
  5. *
  6. * Licensed under the Apache License, Version 2.0 (the License); you may
  7. * not use this file except in compliance with the License.
  8. * You may obtain a copy of the License at
  9. *
  10. * www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  14. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. /* ----------------------------------------------------------------------
  19. * Project: CMSIS NN Library
  20. * Title: arm_max_pool_s8_opt.c
  21. * Description: Pooling function implementations
  22. *
  23. * $Date: February 27, 2020
  24. * $Revision: V.1.0.1
  25. *
  26. * Target Processor: Cortex-M
  27. *
  28. * -------------------------------------------------------------------- */
  29. #include "arm_math.h"
  30. #include "arm_nnfunctions.h"
  31. #if defined(ARM_MATH_DSP)
  32. static void compare_and_replace_if_larger_q7(q7_t *base,
  33. const q7_t *target,
  34. const uint16_t length)
  35. {
  36. #if defined(ARM_MATH_MVEI)
  37. int loop_count = length / 16;
  38. while (loop_count)
  39. {
  40. const int8x16_t op_1 = vldrbq_s8(base);
  41. const int8x16_t op_2 = vldrbq_s8(target);
  42. const int8x16_t max = vmaxq_s8(op_1, op_2);
  43. vstrbq_s8(base, max);
  44. base += 16;
  45. target += 16;
  46. loop_count--;
  47. }
  48. if (((length & 0xF) / 8) > 0)
  49. {
  50. const int16x8_t op_1 = vldrbq_s16(base);
  51. const int16x8_t op_2 = vldrbq_s16(target);
  52. const int16x8_t max = vmaxq_s16(op_1, op_2);
  53. vstrbq_s16(base, max);
  54. base += 8;
  55. target += 8;
  56. }
  57. for (int i = 0; i < (length & 7); i++)
  58. {
  59. if (target[i] > base[i])
  60. {
  61. base[i] = target[i];
  62. }
  63. }
  64. #else
  65. q7_t *dst = base;
  66. const q7_t *src = target;
  67. union arm_nnword ref_max;
  68. union arm_nnword comp_max;
  69. int32_t cnt = length >> 2;
  70. while (cnt > 0l)
  71. {
  72. ref_max.word = arm_nn_read_q7x4(dst);
  73. comp_max.word = arm_nn_read_q7x4_ia(&src);
  74. if (comp_max.bytes[0] > ref_max.bytes[0])
  75. {
  76. ref_max.bytes[0] = comp_max.bytes[0];
  77. }
  78. if (comp_max.bytes[1] > ref_max.bytes[1])
  79. {
  80. ref_max.bytes[1] = comp_max.bytes[1];
  81. }
  82. if (comp_max.bytes[2] > ref_max.bytes[2])
  83. {
  84. ref_max.bytes[2] = comp_max.bytes[2];
  85. }
  86. if (comp_max.bytes[3] > ref_max.bytes[3])
  87. {
  88. ref_max.bytes[3] = comp_max.bytes[3];
  89. }
  90. write_q7x4_ia(&dst, ref_max.word);
  91. cnt--;
  92. }
  93. cnt = length & 0x3;
  94. while (cnt > 0l)
  95. {
  96. if (*src > *dst)
  97. {
  98. *dst = *src;
  99. }
  100. dst++;
  101. src++;
  102. cnt--;
  103. }
  104. #endif
  105. }
  106. static void clamp_output(q7_t *source, const uint16_t length, const int32_t act_min, const int32_t act_max)
  107. {
  108. #if defined(ARM_MATH_MVEI)
  109. int cnt = length / 16;
  110. while (cnt > 0)
  111. {
  112. const int8x16_t src = vldrbq_s8(source);
  113. int8x16_t res = vmaxq_s8(src, vdupq_n_s8((int8_t)act_min));
  114. res = vminq_s8(src, vdupq_n_s8((int8_t)act_max));
  115. vstrbq_s8(source, res);
  116. source += 16;
  117. cnt--;
  118. }
  119. if (((length & 0xF) / 8) > 0)
  120. {
  121. const int16x8_t src = vldrbq_s16(source);
  122. int16x8_t res = vmaxq_s16(src, vdupq_n_s16((int16_t)act_min));
  123. res = vminq_s16(src, vdupq_n_s16((int16_t)act_max));
  124. vstrbq_s16(source, res);
  125. source += 8;
  126. }
  127. cnt = length & 7;
  128. while (cnt > 0)
  129. {
  130. int32_t comp = *source;
  131. comp = MAX(comp, act_min);
  132. comp = MIN(comp, act_max);
  133. *source++ = (int8_t)comp;
  134. cnt--;
  135. }
  136. #else
  137. union arm_nnword in;
  138. int32_t cnt = length >> 2;
  139. while (cnt > 0l)
  140. {
  141. in.word = arm_nn_read_q7x4(source);
  142. in.bytes[0] = MAX(in.bytes[0], act_min);
  143. in.bytes[0] = MIN(in.bytes[0], act_max);
  144. in.bytes[1] = MAX(in.bytes[1], act_min);
  145. in.bytes[1] = MIN(in.bytes[1], act_max);
  146. in.bytes[2] = MAX(in.bytes[2], act_min);
  147. in.bytes[2] = MIN(in.bytes[2], act_max);
  148. in.bytes[3] = MAX(in.bytes[3], act_min);
  149. in.bytes[3] = MIN(in.bytes[3], act_max);
  150. write_q7x4_ia(&source, in.word);
  151. cnt--;
  152. }
  153. cnt = length & 0x3;
  154. while (cnt > 0l)
  155. {
  156. int32_t comp = *source;
  157. comp = MAX(comp, act_min);
  158. comp = MIN(comp, act_max);
  159. *source++ = (int8_t)comp;
  160. cnt--;
  161. }
  162. #endif
  163. }
  164. #endif
  165. /**
  166. * @ingroup groupNN
  167. */
  168. /**
  169. * @addtogroup Pooling
  170. * @{
  171. */
  172. /*
  173. * Optimized s8 max pooling function
  174. *
  175. * Refer to header file for details.
  176. *
  177. */
  178. arm_status arm_max_pool_s8_opt(const uint16_t input_y,
  179. const uint16_t input_x,
  180. const uint16_t output_y,
  181. const uint16_t output_x,
  182. const uint16_t stride_y,
  183. const uint16_t stride_x,
  184. const uint16_t kernel_y,
  185. const uint16_t kernel_x,
  186. const uint16_t pad_y,
  187. const uint16_t pad_x,
  188. const int8_t act_min,
  189. const int8_t act_max,
  190. const uint16_t depth,
  191. int8_t *src,
  192. int16_t *tmp_buffer,
  193. int8_t *dst)
  194. {
  195. #if defined(ARM_MATH_DSP)
  196. /* Run the following code for Cortex-M4 and Cortex-M7 */
  197. (void)tmp_buffer;
  198. int32_t i_x, i_y;
  199. /* first does the pooling along x axis */
  200. for (i_y = 0; i_y < input_y; i_y++)
  201. {
  202. for (i_x = 0; i_x < output_x; i_x++)
  203. {
  204. /* for each output sample */
  205. q7_t *target = src + (i_y * input_x + i_x) * depth;
  206. q7_t *win_start;
  207. q7_t *win_stop;
  208. const int32_t x_origin = i_x * stride_x - pad_x;
  209. if (x_origin < 0)
  210. {
  211. win_start = target;
  212. }
  213. else
  214. {
  215. win_start = src + (i_y * input_x + x_origin) * depth;
  216. }
  217. if (x_origin + kernel_x >= input_x)
  218. {
  219. win_stop = src + (i_y * input_x + input_x) * depth;
  220. }
  221. else
  222. {
  223. win_stop = src + (i_y * input_x + x_origin + kernel_x) * depth;
  224. }
  225. /* first step is to copy over initial data(along channel) along the channel in x direction */
  226. memmove(target, win_start, depth);
  227. /* Move over to next element along x axis and compare with the base(target) */
  228. win_start += depth;
  229. for (; win_start < win_stop; win_start += depth)
  230. {
  231. compare_and_replace_if_larger_q7(target, win_start, depth);
  232. }
  233. }
  234. }
  235. /* then does the pooling along y axis */
  236. for (i_y = 0; i_y < output_y; i_y++)
  237. {
  238. /* for each output row */
  239. q7_t *target = dst + i_y * output_x * depth;
  240. q7_t *row_start;
  241. q7_t *row_end;
  242. const int32_t y_origin = i_y * stride_y - pad_y;
  243. /* setting the starting row */
  244. if (y_origin < 0)
  245. {
  246. row_start = src;
  247. }
  248. else
  249. {
  250. row_start = src + y_origin * input_x * depth;
  251. }
  252. /* setting the stopping row */
  253. if (y_origin + kernel_y >= input_y)
  254. {
  255. row_end = src + input_y * input_x * depth;
  256. }
  257. else
  258. {
  259. row_end = src + (y_origin + kernel_y) * input_x * depth;
  260. }
  261. /* copy over the complete first row. */
  262. memmove(target, row_start, output_x * depth);
  263. /* move over to next row and compare with the base row (target)*/
  264. row_start += depth * input_x;
  265. for (; row_start < row_end; row_start += input_x * depth)
  266. {
  267. compare_and_replace_if_larger_q7(target, row_start, output_x * depth);
  268. }
  269. }
  270. clamp_output(dst, output_x * output_y * depth, act_min, act_max);
  271. #else
  272. /* Pure C implementation */
  273. arm_max_pool_s8(input_y,
  274. input_x,
  275. output_y,
  276. output_x,
  277. stride_y,
  278. stride_x,
  279. kernel_y,
  280. kernel_x,
  281. pad_y,
  282. pad_x,
  283. act_min,
  284. act_max,
  285. depth,
  286. src,
  287. tmp_buffer,
  288. dst);
  289. #endif
  290. return ARM_MATH_SUCCESS;
  291. }
  292. /**
  293. * @} end of Pooling group
  294. */