arm_mat_mult_fast_q31.c

/* ----------------------------------------------------------------------
 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
 *
 * $Date:        26. October 2016
 * $Revision:    V.1.4.5 a
 *
 * Project:      CMSIS DSP Library
 * Title:        arm_mat_mult_fast_q31.c
 *
 * Description:  Q31 matrix multiplication (fast variant).
 *
 * Target Processor: Cortex-M4/Cortex-M3
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   - Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   - Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in
 *     the documentation and/or other materials provided with the
 *     distribution.
 *   - Neither the name of ARM LIMITED nor the names of its contributors
 *     may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * -------------------------------------------------------------------- */
#include "arm_math.h"

/**
 * @ingroup groupMatrix
 */

/**
 * @addtogroup MatrixMult
 * @{
 */
/**
 * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
 * @param[in]  *pSrcA points to the first input matrix structure
 * @param[in]  *pSrcB points to the second input matrix structure
 * @param[out] *pDst  points to the output matrix structure
 * @return The function returns either
 * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * The difference between the function arm_mat_mult_q31() and this fast variant is that
 * the fast variant uses a 32-bit rather than a 64-bit accumulator.
 * The result of each 1.31 x 1.31 multiplication is truncated to
 * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
 * format. Finally, the accumulator is saturated and converted to a 1.31 result.
 *
 * \par
 * The fast version has the same overflow behavior as the standard version but provides
 * less precision since it discards the low 32 bits of each multiplication result.
 * To avoid overflows completely, the input signals must be scaled down.
 * Scale down one of the input matrices by log2(numColsA) bits to
 * avoid overflows, as a total of numColsA additions are computed internally for each
 * output element. (A usage sketch illustrating this pre-scaling follows the function
 * definition below.)
 *
 * \par
 * See <code>arm_mat_mult_q31()</code> for a slower implementation of this function
 * which uses 64-bit accumulation to provide higher precision.
 */
arm_status arm_mat_mult_fast_q31(
  const arm_matrix_instance_q31 * pSrcA,
  const arm_matrix_instance_q31 * pSrcB,
  arm_matrix_instance_q31 * pDst)
{
  q31_t *pInA = pSrcA->pData;                      /* input data matrix pointer A */
  q31_t *pInB = pSrcB->pData;                      /* input data matrix pointer B */
  q31_t *px;                                       /* Temporary output data matrix pointer */
  q31_t sum;                                       /* Accumulator */
  uint16_t numRowsA = pSrcA->numRows;              /* number of rows of input matrix A */
  uint16_t numColsB = pSrcB->numCols;              /* number of columns of input matrix B */
  uint16_t numColsA = pSrcA->numCols;              /* number of columns of input matrix A */
  uint32_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */
  arm_status status;                               /* status of matrix multiplication */
  q31_t inA1, inB1;

#ifndef ARM_MATH_CM0_FAMILY
  q31_t sum2, sum3, sum4;
  q31_t inA2, inB2;
  q31_t *pInA2;
  q31_t *px2;
#endif
#ifdef ARM_MATH_MATRIX_CHECK
  /* Check for matrix mismatch condition */
  if((pSrcA->numCols != pSrcB->numRows) ||
     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    px = pDst->pData;

#ifndef ARM_MATH_CM0_FAMILY
    /* Two output rows are computed per iteration; px2 tracks the second output row */
    row = row >> 1;
    px2 = px + numColsB;
#endif

    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */

    /* row loop */
    while(row > 0u)
    {
      /* For every row wise process, the column loop counter is to be initiated */
      col = numColsB;

      /* For every row wise process, the pInB pointer is set
      ** to the starting address of the pSrcB data */
      pInB = pSrcB->pData;

      j = 0u;

#ifndef ARM_MATH_CM0_FAMILY
      /* Two output columns are computed per iteration of the column loop */
      col = col >> 1;
#endif
      /* column loop */
      while (col > 0u)
      {
        /* Set the variable sum, that acts as accumulator, to zero */
        sum = 0;

        /* Initiate data pointers */
        pInA = pSrcA->pData + i;
        pInB = pSrcB->pData + j;

#ifndef ARM_MATH_CM0_FAMILY
        /* Accumulators and pointer for a 2x2 block of output samples:
        ** sum  -> (row, col),   sum2 -> (row, col+1),
        ** sum3 -> (row+1, col), sum4 -> (row+1, col+1) */
        sum2 = 0;
        sum3 = 0;
        sum4 = 0;
        pInA2 = pInA + numColsA;
        colCnt = numColsA;
#else
        colCnt = numColsA >> 2;
#endif

        /* matrix multiplication */
        while(colCnt > 0u)
        {
#ifndef ARM_MATH_CM0_FAMILY
          /* Read one element from each of the two rows of A and a pair of
          ** adjacent elements from the current row of B */
          inA1 = *pInA++;
          inB1 = pInB[0];
          inA2 = *pInA2++;
          inB2 = pInB[1];
          pInB += numColsB;

          /* Multiply-accumulate the 2x2 output block in 2.30 format */
          sum  = __SMMLA(inA1, inB1, sum);
          sum2 = __SMMLA(inA1, inB2, sum2);
          sum3 = __SMMLA(inA2, inB1, sum3);
          sum4 = __SMMLA(inA2, inB2, sum4);
#else
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
          /* Perform the multiply-accumulates */
          inB1 = *pInB;
          pInB += numColsB;
          inA1 = pInA[0];
          sum = __SMMLA(inA1, inB1, sum);

          inB1 = *pInB;
          pInB += numColsB;
          inA1 = pInA[1];
          sum = __SMMLA(inA1, inB1, sum);

          inB1 = *pInB;
          pInB += numColsB;
          inA1 = pInA[2];
          sum = __SMMLA(inA1, inB1, sum);

          inB1 = *pInB;
          pInB += numColsB;
          inA1 = pInA[3];
          sum = __SMMLA(inA1, inB1, sum);

          pInA += 4u;
#endif

          /* Decrement the loop counter */
          colCnt--;
        }
#ifdef ARM_MATH_CM0_FAMILY
        /* If the number of columns of pSrcA is not a multiple of 4, compute any remaining output samples here. */
        colCnt = numColsA % 0x4u;

        while(colCnt > 0u)
        {
          sum = __SMMLA(*pInA++, *pInB, sum);
          pInB += numColsB;
          colCnt--;
        }
        j++;
#endif

        /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
        *px++ = sum << 1;

#ifndef ARM_MATH_CM0_FAMILY
        *px++ = sum2 << 1;
        *px2++ = sum3 << 1;
        *px2++ = sum4 << 1;
        j += 2;
#endif

        /* Decrement the column loop counter */
        col--;
      }

      i = i + numColsA;

#ifndef ARM_MATH_CM0_FAMILY
      /* A second row of A was consumed, so advance i by one more row and
      ** move the output pointers past the two rows just written */
      i = i + numColsA;
      px = px2 + (numColsB & 1u);
      px2 = px + numColsB;
#endif

      /* Decrement the row loop counter */
      row--;
    }
    /* Compute any remaining odd row/column below */

#ifndef ARM_MATH_CM0_FAMILY
    /* Compute remaining output column */
    if (numColsB & 1u) {

      /* Avoid redundant computation of last element */
      row = numRowsA & (~0x1);

      /* Point to remaining unfilled column in output matrix */
      px = pDst->pData + numColsB - 1;
      pInA = pSrcA->pData;

      /* row loop */
      while (row > 0)
      {
        /* point to last column in matrix B */
        pInB = pSrcB->pData + numColsB - 1;

        /* Set the variable sum, that acts as accumulator, to zero */
        sum = 0;

        /* Compute 4 columns at once */
        colCnt = numColsA >> 2;

        /* matrix multiplication */
        while(colCnt > 0u)
        {
          inA1 = *pInA++;
          inA2 = *pInA++;
          inB1 = *pInB;
          pInB += numColsB;
          inB2 = *pInB;
          pInB += numColsB;
          sum = __SMMLA(inA1, inB1, sum);
          sum = __SMMLA(inA2, inB2, sum);

          inA1 = *pInA++;
          inA2 = *pInA++;
          inB1 = *pInB;
          pInB += numColsB;
          inB2 = *pInB;
          pInB += numColsB;
          sum = __SMMLA(inA1, inB1, sum);
          sum = __SMMLA(inA2, inB2, sum);

          /* Decrement the loop counter */
          colCnt--;
        }

        colCnt = numColsA & 3u;
        while(colCnt > 0u) {
          sum = __SMMLA(*pInA++, *pInB, sum);
          pInB += numColsB;
          colCnt--;
        }

        /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
        *px = sum << 1;
        px += numColsB;

        /* Decrement the row loop counter */
        row--;
      }
    }
    /* Compute remaining output row */
    if (numRowsA & 1u) {

      /* point to last row in output matrix */
      px = pDst->pData + (numColsB) * (numRowsA - 1);

      col = numColsB;
      i = 0u;

      /* col loop */
      while (col > 0)
      {
        /* point to last row in matrix A */
        pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
        pInB = pSrcB->pData + i;

        /* Set the variable sum, that acts as accumulator, to zero */
        sum = 0;

        /* Compute 4 columns at once */
        colCnt = numColsA >> 2;

        /* matrix multiplication */
        while(colCnt > 0u)
        {
          inA1 = *pInA++;
          inA2 = *pInA++;
          inB1 = *pInB;
          pInB += numColsB;
          inB2 = *pInB;
          pInB += numColsB;
          sum = __SMMLA(inA1, inB1, sum);
          sum = __SMMLA(inA2, inB2, sum);

          inA1 = *pInA++;
          inA2 = *pInA++;
          inB1 = *pInB;
          pInB += numColsB;
          inB2 = *pInB;
          pInB += numColsB;
          sum = __SMMLA(inA1, inB1, sum);
          sum = __SMMLA(inA2, inB2, sum);

          /* Decrement the loop counter */
          colCnt--;
        }

        colCnt = numColsA & 3u;
        while(colCnt > 0u) {
          sum = __SMMLA(*pInA++, *pInB, sum);
          pInB += numColsB;
          colCnt--;
        }

        /* Saturate and store the result in the destination buffer */
        *px++ = sum << 1;

        i++;

        /* Decrement the col loop counter */
        col--;
      }
    }
#endif /* #ifndef ARM_MATH_CM0_FAMILY */

    /* set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
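
/* ----------------------------------------------------------------------
 * Usage sketch (illustrative only, not part of the library source):
 * multiplies a 2x4 by a 4x3 Q31 matrix with arm_mat_mult_fast_q31().
 * Because numColsA = 4, one operand is first scaled down by
 * log2(4) = 2 bits with arm_shift_q31() so the 2.30 accumulator cannot
 * overflow, as described in the documentation above. The data values
 * and the example_ function/array names are assumptions made purely
 * for illustration.
 * -------------------------------------------------------------------- */
static q31_t example_dataA[2 * 4];        /* 2x4 operand, filled elsewhere */
static q31_t example_dataB[4 * 3];        /* 4x3 operand, filled elsewhere */
static q31_t example_dataC[2 * 3];        /* 2x3 result buffer */

static arm_status example_mat_mult_fast_q31(void)
{
  arm_matrix_instance_q31 A, B, C;

  /* Scale one operand down by log2(numColsA) = 2 bits, in place */
  arm_shift_q31(example_dataA, -2, example_dataA, 2 * 4);

  /* Initialise the matrix instance structures (rows, columns, data) */
  arm_mat_init_q31(&A, 2, 4, example_dataA);
  arm_mat_init_q31(&B, 4, 3, example_dataB);
  arm_mat_init_q31(&C, 2, 3, example_dataC);

  /* C = A * B; returns ARM_MATH_SIZE_MISMATCH if the dimensions disagree */
  return arm_mat_mult_fast_q31(&A, &B, &C);
}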
/**
 * @} end of MatrixMult group
 */
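
/* ----------------------------------------------------------------------
 * Reference sketch (illustrative only): a plain-C model of how the fast
 * variant forms one output element, assuming __SMMLA(x, y, acc) behaves
 * like the Cortex-M SMMLA instruction, i.e. acc + (((q63_t)x * y) >> 32).
 * Each 1.31 x 1.31 product keeps only its high word (2.30 format), the
 * 2.30 partial sums accumulate in 32 bits, and the final left shift by
 * one converts the accumulator back to 1.31. The helper name below is an
 * assumption made for illustration, not a library API.
 * -------------------------------------------------------------------- */
static q31_t example_ref_dot_q31(const q31_t *rowA, const q31_t *colB,
                                 uint16_t numColsA, uint16_t numColsB)
{
  q31_t sum = 0;                           /* 2.30 accumulator */
  uint16_t k;

  for (k = 0; k < numColsA; k++)
  {
    /* High 32 bits of the 1.31 x 1.31 product, i.e. the 2.30 result */
    q31_t prodHigh = (q31_t)(((q63_t)rowA[k] * colB[k * numColsB]) >> 32);

    /* Accumulate in 2.30 format */
    sum += prodHigh;
  }

  /* Convert the 2.30 result back to 1.31 format */
  return sum << 1;
}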