arm_elementwise_mul_s8.c 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
  1. /*
  2. * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
  3. *
  4. * SPDX-License-Identifier: Apache-2.0
  5. *
  6. * Licensed under the Apache License, Version 2.0 (the License); you may
  7. * not use this file except in compliance with the License.
  8. * You may obtain a copy of the License at
  9. *
  10. * www.apache.org/licenses/LICENSE-2.0
  11. *
  12. * Unless required by applicable law or agreed to in writing, software
  13. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  14. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15. * See the License for the specific language governing permissions and
  16. * limitations under the License.
  17. */
  18. /* ----------------------------------------------------------------------
  19. * Project: CMSIS NN Library
  20. * Title: arm_elementwise_mul_s8
  21. * Description: Element wise multiplication
  22. *
  23. * $Date: August 2019
  24. * $Revision: V.1.0.0
  25. *
  26. * Target Processor: Cortex-M cores
  27. *
  28. * -------------------------------------------------------------------- */
  29. #include "arm_math.h"
  30. #include "arm_nnfunctions.h"
  31. #include "arm_nnsupportfunctions.h"
  32. /**
  33. * @ingroup groupNN
  34. */
  35. /**
  36. * @addtogroup BasicMath
  37. * @{
  38. */
  39. /**
  40. * @brief s8 element wise multiplication of two vectors
  41. *
  42. * @note Refer header file for details.
  43. *
  44. */
  45. arm_status
  46. arm_elementwise_mul_s8(const int8_t *input_1_vect,
  47. const int8_t *input_2_vect,
  48. const int32_t input_1_offset,
  49. const int32_t input_2_offset,
  50. int8_t *output,
  51. const int32_t out_offset,
  52. const int32_t out_mult,
  53. const int32_t out_shift,
  54. const int32_t out_activation_min,
  55. const int32_t out_activation_max,
  56. const uint32_t block_size)
  57. {
  58. uint32_t loop_count;
  59. int32_t input_1;
  60. int32_t input_2;
  61. int32_t mul_res;
  62. #if defined(ARM_MATH_LOOPUNROLL) && defined(ARM_MATH_DSP)
  63. int32_t a_1, b_1, a_2, b_2;
  64. int32_t offset_1_packed, offset_2_packed;
  65. int8_t r1, r2, r3, r4;
  66. offset_1_packed = (input_1_offset << 16U) | (input_1_offset & 0x0FFFFL);
  67. offset_2_packed = (input_2_offset << 16U) | (input_2_offset & 0x0FFFFL);
  68. loop_count = block_size >> 2;
  69. while (loop_count > 0U)
  70. {
  71. /* 4 outputs are calculated in one loop. The order of calculation is follows the order of output sign extension
  72. intrinsic */
  73. input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
  74. input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
  75. a_1 = __SADD16(a_1, offset_1_packed);
  76. b_1 = __SADD16(b_1, offset_1_packed);
  77. a_2 = __SADD16(a_2, offset_2_packed);
  78. b_2 = __SADD16(b_2, offset_2_packed);
  79. /* Mul 1 */
  80. input_1 = (int16_t)(b_1 & 0x0FFFFL);
  81. input_2 = (int16_t)(b_2 & 0x0FFFFL);
  82. mul_res = input_1 * input_2;
  83. mul_res = arm_nn_divide_by_power_of_two(arm_nn_sat_doubling_high_mult(mul_res, out_mult), -out_shift) + out_offset;
  84. mul_res = MAX(mul_res, out_activation_min);
  85. mul_res = MIN(mul_res, out_activation_max);
  86. r1 = (q7_t)mul_res;
  87. /* Mul 3 */
  88. input_1 = (int16_t)((b_1 >> 16U) & 0x0FFFFL);
  89. input_2 = (int16_t)((b_2 >> 16U) & 0x0FFFFL);
  90. mul_res = input_1 * input_2;
  91. mul_res = arm_nn_divide_by_power_of_two(arm_nn_sat_doubling_high_mult(mul_res, out_mult), -out_shift) + out_offset;
  92. mul_res = MAX(mul_res, out_activation_min);
  93. mul_res = MIN(mul_res, out_activation_max);
  94. r3 = (q7_t)mul_res;
  95. /* Mul 2 */
  96. input_1 = (int16_t)(a_1 & 0x0FFFFL);
  97. input_2 = (int16_t)(a_2 & 0x0FFFFL);
  98. mul_res = input_1 * input_2;
  99. mul_res = arm_nn_divide_by_power_of_two(arm_nn_sat_doubling_high_mult(mul_res, out_mult), -out_shift) + out_offset;
  100. mul_res = MAX(mul_res, out_activation_min);
  101. mul_res = MIN(mul_res, out_activation_max);
  102. r2 = (q7_t)mul_res;
  103. /* Mul 4 */
  104. input_1 = (int16_t)((a_1 >> 16U) & 0x0FFFFL);
  105. input_2 = (int16_t)((a_2 >> 16U) & 0x0FFFFL);
  106. mul_res = input_1 * input_2;
  107. mul_res = arm_nn_divide_by_power_of_two(arm_nn_sat_doubling_high_mult(mul_res, out_mult), -out_shift) + out_offset;
  108. mul_res = MAX(mul_res, out_activation_min);
  109. mul_res = MIN(mul_res, out_activation_max);
  110. r4 = (q7_t)mul_res;
  111. write_q7x4_ia(&output, __PACKq7(r1, r2, r3, r4));
  112. loop_count--;
  113. }
  114. loop_count = block_size & 0x3;
  115. #else
  116. loop_count = block_size;
  117. #endif
  118. while (loop_count > 0U)
  119. {
  120. /* C = A * B */
  121. input_1 = *input_1_vect++ + input_1_offset;
  122. input_2 = *input_2_vect++ + input_2_offset;
  123. mul_res = input_1 * input_2;
  124. mul_res = arm_nn_divide_by_power_of_two(arm_nn_sat_doubling_high_mult(mul_res, out_mult), -out_shift) + out_offset;
  125. mul_res = MAX(mul_res, out_activation_min);
  126. mul_res = MIN(mul_res, out_activation_max);
  127. *output++ = (q7_t)mul_res;
  128. /* Decrement loop counter */
  129. loop_count--;
  130. }
  131. return (ARM_MATH_SUCCESS);
  132. }
  133. /**
  134. * @} end of BasicMath group
  135. */