/*
 * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_depthwise_conv_3x3_s8.c
 * Description:  Optimized s8 depthwise convolution function for channel
 *               multiplier of 1 and 3x3 kernel size.
 *
 * $Date:        February 26, 2020
 * $Revision:    V.1.0.0
 *
 * Target Processor: Cortex-M cores
 *
 * -------------------------------------------------------------------- */

#include "arm_math.h"
#include "arm_nnsupportfunctions.h"
#include "arm_nnfunctions.h"

/**
 * @ingroup groupNN
 */

/**
 * @addtogroup NNConv
 * @{
 */
/*
 * Optimized s8 depthwise convolution function with the constraints that
 * in_channel == out_channel, kernel_x == kernel_y == 3, and padding is at most 1.
 *
 * Refer to the prototype header file for details.
 *
 */
arm_status arm_depthwise_conv_3x3_s8(const int8_t *input,
                                     const int32_t input_x,
                                     const int32_t input_y,
                                     const int32_t input_ch,
                                     const int8_t *kernel,
                                     const int32_t output_ch,
                                     const int32_t pad_x,
                                     const int32_t pad_y,
                                     const int32_t stride_x,
                                     const int32_t stride_y,
                                     const int32_t *bias,
                                     int8_t *output,
                                     const int32_t *output_shift,
                                     const int32_t *output_mult,
                                     const int32_t output_x,
                                     const int32_t output_y,
                                     const int32_t output_offset,
                                     const int32_t input_offset,
                                     const int32_t output_activation_min,
                                     const int32_t output_activation_max,
                                     const int32_t dilation_x,
                                     const int32_t dilation_y,
                                     int16_t *buffer_a)
{
    /* Check input constraint: input_ch == output_ch */
    if (input_ch != output_ch)
    {
        return ARM_MATH_SIZE_MISMATCH;
    }

    /* Check input constraint: pad_x <= 1 */
    if (pad_x > 1)
    {
        return ARM_MATH_ARGUMENT_ERROR;
    }

    /* Dilation and the scratch buffer are not used by this variant */
    (void)dilation_x;
    (void)dilation_y;
    (void)buffer_a;

    /* Iterate over output rows and columns; ker_h_start and ker_w_start skip
     * the kernel rows/columns that fall into the top/left padding region. */
    for (int32_t in_h = -pad_y, out_h = 0, out_idx = 0; out_h < output_y; in_h += stride_y, ++out_h)
    {
        for (int32_t in_w = -pad_x, out_w = 0, ker_h_start = MAX(0, -in_h); out_w < output_x; in_w += stride_x, ++out_w)
        {
            int32_t in_ch = 0;
            int32_t ker_w_start = MAX(0, -in_w);

            /* Process 4 channels at a time: each 32-bit read packs four s8
             * values, one per consecutive channel, which are extracted byte by byte. */
            for (; in_ch <= (input_ch - 4); in_ch += 4)
            {
                int32_t out_buff0 = bias[in_ch + 0];
                int32_t out_buff1 = bias[in_ch + 1];
                int32_t out_buff2 = bias[in_ch + 2];
                int32_t out_buff3 = bias[in_ch + 3];

                const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
                const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;

                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    int32_t in_val = 0;
                    int32_t ker_val = 0;

                    /* Left kernel column: skipped when it falls into the left padding */
                    if (ker_w_start == 0)
                    {
                        in_val = arm_nn_read_q7x4(input_ptr);
                        ker_val = arm_nn_read_q7x4(kernel_ptr);

                        out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                        out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                        out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                        out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    }

                    /* Middle kernel column: always inside the image */
                    in_val = arm_nn_read_q7x4(input_ptr + input_ch);
                    ker_val = arm_nn_read_q7x4(kernel_ptr + input_ch);

                    out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                    out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                    out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                    out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);

                    /* Right kernel column: skipped when it falls into the right padding */
                    if ((input_x - in_w) >= 3)
                    {
                        in_val = arm_nn_read_q7x4(input_ptr + (input_ch << 1));
                        ker_val = arm_nn_read_q7x4(kernel_ptr + (input_ch << 1));

                        out_buff0 += ((int8_t)in_val + input_offset) * (int8_t)ker_val;
                        out_buff1 += ((int8_t)(in_val >> 8) + input_offset) * (int8_t)(ker_val >> 8);
                        out_buff2 += ((int8_t)(in_val >> 16) + input_offset) * (int8_t)(ker_val >> 16);
                        out_buff3 += ((int8_t)(in_val >> 24) + input_offset) * (int8_t)(ker_val >> 24);
                    }

                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }

                /* Requantize, add the output offset, clamp to the activation range and store */
                out_buff0 = arm_nn_requantize(out_buff0, output_mult[in_ch + 0], output_shift[in_ch + 0]);
                out_buff1 = arm_nn_requantize(out_buff1, output_mult[in_ch + 1], output_shift[in_ch + 1]);
                out_buff2 = arm_nn_requantize(out_buff2, output_mult[in_ch + 2], output_shift[in_ch + 2]);
                out_buff3 = arm_nn_requantize(out_buff3, output_mult[in_ch + 3], output_shift[in_ch + 3]);

                out_buff0 += output_offset;
                out_buff1 += output_offset;
                out_buff2 += output_offset;
                out_buff3 += output_offset;

                out_buff0 = MIN(MAX(out_buff0, output_activation_min), output_activation_max);
                out_buff1 = MIN(MAX(out_buff1, output_activation_min), output_activation_max);
                out_buff2 = MIN(MAX(out_buff2, output_activation_min), output_activation_max);
                out_buff3 = MIN(MAX(out_buff3, output_activation_min), output_activation_max);

                output[out_idx++] = (int8_t)out_buff0;
                output[out_idx++] = (int8_t)out_buff1;
                output[out_idx++] = (int8_t)out_buff2;
                output[out_idx++] = (int8_t)out_buff3;
            }
            /* Leftover: process the remaining 1 to 3 channels one at a time */
            for (; in_ch < input_ch; ++in_ch)
            {
                int32_t out_buff = bias[in_ch];

                const int8_t *input_ptr = input + (in_h + ker_h_start) * (input_ch * input_x) + in_w * input_ch + in_ch;
                const int8_t *kernel_ptr = kernel + ker_h_start * (input_ch * 3) + in_ch;

                for (int32_t ker_h = ker_h_start; ker_h < MIN(3, input_y - in_h); ++ker_h)
                {
                    if (ker_w_start == 0)
                    {
                        out_buff += (*(input_ptr) + input_offset) * *(kernel_ptr);
                    }

                    out_buff += (*(input_ptr + input_ch) + input_offset) * *(kernel_ptr + input_ch);

                    if ((input_x - in_w) >= 3)
                    {
                        out_buff += (*(input_ptr + (input_ch << 1)) + input_offset) * *(kernel_ptr + (input_ch << 1));
                    }

                    input_ptr += (input_ch * input_x);
                    kernel_ptr += (input_ch * 3);
                }

                out_buff = arm_nn_requantize(out_buff, output_mult[in_ch], output_shift[in_ch]);
                out_buff += output_offset;
                out_buff = MIN(MAX(out_buff, output_activation_min), output_activation_max);
                output[out_idx++] = (int8_t)out_buff;
            }
        }
    }

    /* Return to application */
    return ARM_MATH_SUCCESS;
}
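
/*
 * Illustrative usage sketch (kept out of the build with #if 0): one possible
 * way this kernel might be invoked for a hypothetical layer with 8 channels,
 * a 16x16 input, 3x3 kernels, stride 1 and padding 1. The wrapper name and
 * all dimensions, offsets and activation limits below are assumptions for
 * illustration only; real values come from the network's quantization
 * parameters.
 */
#if 0
static arm_status example_call_depthwise_conv_3x3_s8(const int8_t *input,      /* 16 x 16 x 8, HWC          */
                                                     const int8_t *kernel,     /* 3 x 3 x 8, HWC            */
                                                     const int32_t *bias,      /* 8 per-channel bias values */
                                                     const int32_t *out_mult,  /* 8 per-channel multipliers */
                                                     const int32_t *out_shift, /* 8 per-channel shifts      */
                                                     int8_t *output)           /* 16 x 16 x 8, HWC          */
{
    return arm_depthwise_conv_3x3_s8(input,
                                     16, 16, 8,  /* input_x, input_y, input_ch        */
                                     kernel,
                                     8,          /* output_ch, must equal input_ch    */
                                     1, 1,       /* pad_x, pad_y                      */
                                     1, 1,       /* stride_x, stride_y                */
                                     bias,
                                     output,
                                     out_shift,
                                     out_mult,
                                     16, 16,     /* output_x, output_y                */
                                     0, 0,       /* output_offset, input_offset       */
                                     -128, 127,  /* output_activation_min, _max       */
                                     1, 1,       /* dilation_x, dilation_y (unused)   */
                                     NULL);      /* buffer_a (unused by this variant) */
}
#endif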

/**
 * @} end of NNConv group
 */