/*
 * SPDX-FileCopyrightText: Copyright 2010-2020, 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_depthwise_conv_nt_t_s8.c
 * Description:  Depthwise convolution on matrices with no padding.
 *
 * $Date:        26 October 2022
 * $Revision:    V.2.0.1
 *
 * Target Processor: Cortex-M processors with MVE extension.
 * -------------------------------------------------------------------- */
#include "arm_nnsupportfunctions.h"

/**
 * @ingroup groupSupport
 */

/**
 * @addtogroup supportConvolution
 * @{
 */

/*
 * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding. Dimensions are the same for lhs and rhs.
 *
 * Refer header file for details.
 *
 */
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8(const int8_t *lhs,
                                                  const int8_t *rhs,
                                                  const int32_t input_offset,
                                                  const int32_t active_ch,
                                                  const int32_t total_ch,
                                                  const int32_t *out_shift,
                                                  const int32_t *out_mult,
                                                  const int32_t out_offset,
                                                  const int32_t activation_min,
                                                  const int32_t activation_max,
                                                  const uint16_t row_x_col,
                                                  const int32_t *const output_bias,
                                                  int8_t *out)
{
#if defined(ARM_MATH_MVEI)
    const int32_t *bias = output_bias;
    /* Channels are handled 4 per vector lane group; round up so a partial tail
       group is still processed (masked by the tail predicate below). */
    int32_t loop_count = (active_ch + 3) / 4;
    uint32_t num_ch_to_process = active_ch;

    for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
         num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
    {
        /* Seed all four accumulators with this channel group's bias (zero when no bias). */
        int32x4_t out_0 = vdupq_n_s32(0);
        if (bias)
        {
            out_0 = vldrwq_s32(bias);
            bias += 4;
        }
        int32x4_t out_1 = out_0;
        int32x4_t out_2 = out_0;
        int32x4_t out_3 = out_0;

        const int8_t *rhs_0 = rhs + offset;
        /* The four lhs input buffers are laid out back to back, each
           row_x_col * CH_IN_BLOCK_MVE bytes long (see pointer strides below). */
        const int8_t *lhs_0 = lhs + offset;
        const int8_t *lhs_1 = lhs + row_x_col * CH_IN_BLOCK_MVE + offset;
        const int8_t *lhs_2 = lhs + (row_x_col * CH_IN_BLOCK_MVE * 2) + offset;
        const int8_t *lhs_3 = lhs + (row_x_col * CH_IN_BLOCK_MVE * 3) + offset;
        int32x4_t ker_sum = vdupq_n_s32(0);

        for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
        {
            /* One kernel vector (4 channels) is shared by all four input buffers. */
            const int32x4_t ker_0 = vldrbq_s32(rhs_0);
            /* Accumulate raw kernel values so the asymmetric input offset can be
               folded in once per group, after the loop, instead of per element. */
            ker_sum = vaddq_s32(ker_sum, ker_0);

            int32x4_t ip_0 = vldrbq_s32(lhs_0);
            out_0 += vmulq_s32(ip_0, ker_0);

            int32x4_t ip_1 = vldrbq_s32(lhs_1);
            out_1 += vmulq_s32(ip_1, ker_0);

            int32x4_t ip_2 = vldrbq_s32(lhs_2);
            out_2 += vmulq_s32(ip_2, ker_0);

            int32x4_t ip_3 = vldrbq_s32(lhs_3);
            out_3 += vmulq_s32(ip_3, ker_0);

            lhs_0 += CH_IN_BLOCK_MVE;
            lhs_1 += CH_IN_BLOCK_MVE;
            lhs_2 += CH_IN_BLOCK_MVE;
            lhs_3 += CH_IN_BLOCK_MVE;

            /* Kernel values for a given channel are total_ch apart. */
            rhs_0 += total_ch;
        }

        /* sum(ip * ker) + input_offset * sum(ker) == sum((ip + input_offset) * ker):
           the zero-point correction is identical for all four accumulators. */
        ker_sum = vmulq_n_s32(ker_sum, input_offset);
        out_0 = ker_sum + out_0;
        out_1 = ker_sum + out_1;
        out_2 = ker_sum + out_2;
        out_3 = ker_sum + out_3;

        /* Per-channel requantization parameters for this group of 4 channels. */
        const int32x4_t mult = vldrwq_s32(out_mult);
        const int32x4_t shift = vldrwq_s32(out_shift);
        out_mult += 4;
        out_shift += 4;

        /* Tail predicate: disables the lanes beyond the remaining active channels
           in the last (partial) group. num_ch_to_process may wrap after the final
           iteration, but the loop exits on i_loop_cnt before it is used again. */
        mve_pred16_t p = vctp32q(num_ch_to_process);

        /* Requantize, add the output zero point, clamp to the activation range and
           store. The four outputs are consecutive rows, total_ch elements apart. */
        out_0 = arm_requantize_mve_32x4(out_0, mult, shift);
        out_0 = vaddq_n_s32(out_0, out_offset);
        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out, out_0, p);

        out_1 = arm_requantize_mve_32x4(out_1, mult, shift);
        out_1 = vaddq_n_s32(out_1, out_offset);
        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + total_ch, out_1, p);

        out_2 = arm_requantize_mve_32x4(out_2, mult, shift);
        out_2 = vaddq_n_s32(out_2, out_offset);
        out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
        out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + 2 * total_ch, out_2, p);

        out_3 = arm_requantize_mve_32x4(out_3, mult, shift);
        out_3 = vaddq_n_s32(out_3, out_offset);
        out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
        out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
        vstrbq_p_s32(out + 3 * total_ch, out_3, p);
    }
    return ARM_CMSIS_NN_SUCCESS;
#else
    /* MVE extension not available: this kernel has no scalar fallback here. */
    (void)lhs;
    (void)rhs;
    (void)input_offset;
    (void)active_ch;
    (void)total_ch;
    (void)out_shift;
    (void)out_mult;
    (void)out_offset;
    (void)activation_min;
    (void)activation_max;
    (void)row_x_col;
    (void)output_bias;
    (void)out;
    return ARM_CMSIS_NN_NO_IMPL_ERROR;
#endif
}
/**
 * @} end of Doxygen group
 */