arm_mat_mult_q7.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_mat_mult_q7.c
  4. * Description: Q7 matrix multiplication
  5. *
  6. * $Date: 06. July 2020
  7. *
  8. * Target Processor: Cortex-M cores
  9. * -------------------------------------------------------------------- */
  10. /*
  11. * Copyright (C) 2010-2020 ARM Limited or its affiliates. All rights reserved.
  12. *
  13. * SPDX-License-Identifier: Apache-2.0
  14. *
  15. * Licensed under the Apache License, Version 2.0 (the License); you may
  16. * not use this file except in compliance with the License.
  17. * You may obtain a copy of the License at
  18. *
  19. * www.apache.org/licenses/LICENSE-2.0
  20. *
  21. * Unless required by applicable law or agreed to in writing, software
  22. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  23. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  24. * See the License for the specific language governing permissions and
  25. * limitations under the License.
  26. */
  27. #include "dsp/matrix_functions.h"
  28. /**
  29. @ingroup groupMatrix
  30. */
  31. /**
  32. @addtogroup MatrixMult
  33. @{
  34. */
  35. /**
  36. * @brief Q7 matrix multiplication
  37. * @param[in] *pSrcA points to the first input matrix structure
  38. * @param[in] *pSrcB points to the second input matrix structure
  39. * @param[out] *pDst points to output matrix structure
  40. * @param[in] *pState points to the array for storing intermediate results (Unused in some versions)
  41. * @return The function returns either
  42. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  43. *
  44. * @details
  45. * <b>Scaling and Overflow Behavior:</b>
  46. *
  47. * \par
  48. * The function is implemented using a 32-bit internal accumulator saturated to 1.7 format.
  49. *
  50. *
  51. */
  52. #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
  53. __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
  54. const arm_matrix_instance_q7 * pSrcA,
  55. const arm_matrix_instance_q7 * pSrcB,
  56. arm_matrix_instance_q7 * pDst)
  57. {
  58. const uint32_t MATRIX_DIM = 2;
  59. q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
  60. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
  61. q7_t *pOut = pDst->pData; /* output data matrix pointer */
  62. uint8x16_t vecColBOffs;
  63. q7_t *pInA0 = pInA;
  64. q7_t *pInA1 = pInA0 + MATRIX_DIM;
  65. q31_t acc0, acc1;
  66. q7x16_t vecB, vecA0, vecA1;
  67. mve_pred16_t p0 = vctp8q(MATRIX_DIM);
  68. vecColBOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
  69. pInB = pSrcB->pData;
  70. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  71. vecA0 = vldrbq_s8(pInA0);
  72. vecA1 = vldrbq_s8(pInA1);
  73. acc0 = vmladavq_s8(vecA0, vecB);
  74. acc1 = vmladavq_s8(vecA1, vecB);
  75. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  76. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  77. pOut++;
  78. /* move to next B column */
  79. pInB = pInB + 1;
  80. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  81. acc0 = vmladavq_s8(vecA0, vecB);
  82. acc1 = vmladavq_s8(vecA1, vecB);
  83. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  84. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  85. /*
  86. * Return to application
  87. */
  88. return (ARM_MATH_SUCCESS);
  89. }
  90. __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
  91. const arm_matrix_instance_q7 * pSrcA,
  92. const arm_matrix_instance_q7 * pSrcB,
  93. arm_matrix_instance_q7 * pDst)
  94. {
  95. const uint8_t MATRIX_DIM = 3;
  96. q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
  97. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
  98. q7_t *pOut = pDst->pData; /* output data matrix pointer */
  99. uint8x16_t vecColBOffs;
  100. q7_t *pInA0 = pInA;
  101. q7_t *pInA1 = pInA0 + MATRIX_DIM;
  102. q7_t *pInA2 = pInA1 + MATRIX_DIM;
  103. q31_t acc0, acc1, acc2;
  104. q7x16_t vecB, vecA0, vecA1, vecA2;
  105. mve_pred16_t p0 = vctp8q(MATRIX_DIM);
  106. vecColBOffs = vidupq_u8((uint32_t)0, 1);
  107. vecColBOffs = vecColBOffs * MATRIX_DIM;
  108. pInB = pSrcB->pData;
  109. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  110. vecA0 = vldrbq_s8(pInA0);
  111. vecA1 = vldrbq_s8(pInA1);
  112. vecA2 = vldrbq_s8(pInA2);
  113. acc0 = vmladavq_s8(vecA0, vecB);
  114. acc1 = vmladavq_s8(vecA1, vecB);
  115. acc2 = vmladavq_s8(vecA2, vecB);
  116. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  117. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  118. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  119. pOut++;
  120. /* move to next B column */
  121. pInB = pInB + 1;
  122. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  123. acc0 = vmladavq_s8(vecA0, vecB);
  124. acc1 = vmladavq_s8(vecA1, vecB);
  125. acc2 = vmladavq_s8(vecA2, vecB);
  126. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  127. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  128. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  129. pOut++;
  130. /* move to next B column */
  131. pInB = pInB + 1;
  132. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  133. acc0 = vmladavq_s8(vecA0, vecB);
  134. acc1 = vmladavq_s8(vecA1, vecB);
  135. acc2 = vmladavq_s8(vecA2, vecB);
  136. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  137. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  138. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  139. /*
  140. * Return to application
  141. */
  142. return (ARM_MATH_SUCCESS);
  143. }
  144. __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
  145. const arm_matrix_instance_q7 * pSrcA,
  146. const arm_matrix_instance_q7 * pSrcB,
  147. arm_matrix_instance_q7 * pDst)
  148. {
  149. const uint32_t MATRIX_DIM = 4;
  150. q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
  151. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
  152. q7_t *pOut = pDst->pData; /* output data matrix pointer */
  153. uint8x16_t vecColBOffs;
  154. q7_t *pInA0 = pInA;
  155. q7_t *pInA1 = pInA0 + MATRIX_DIM;
  156. q7_t *pInA2 = pInA1 + MATRIX_DIM;
  157. q7_t *pInA3 = pInA2 + MATRIX_DIM;
  158. q31_t acc0, acc1, acc2, acc3;
  159. q7x16_t vecB, vecA0, vecA1, vecA2, vecA3;
  160. mve_pred16_t p0 = vctp8q(MATRIX_DIM);
  161. vecColBOffs = vidupq_u8((uint32_t)0, 4);
  162. pInB = pSrcB->pData;
  163. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  164. vecA0 = vldrbq_s8(pInA0);
  165. vecA1 = vldrbq_s8(pInA1);
  166. vecA2 = vldrbq_s8(pInA2);
  167. vecA3 = vldrbq_s8(pInA3);
  168. acc0 = vmladavq_s8(vecA0, vecB);
  169. acc1 = vmladavq_s8(vecA1, vecB);
  170. acc2 = vmladavq_s8(vecA2, vecB);
  171. acc3 = vmladavq_s8(vecA3, vecB);
  172. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  173. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  174. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  175. pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
  176. pOut++;
  177. /* move to next B column */
  178. pInB = pInB + 1;
  179. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  180. acc0 = vmladavq_s8(vecA0, vecB);
  181. acc1 = vmladavq_s8(vecA1, vecB);
  182. acc2 = vmladavq_s8(vecA2, vecB);
  183. acc3 = vmladavq_s8(vecA3, vecB);
  184. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  185. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  186. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  187. pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
  188. pOut++;
  189. /* move to next B column */
  190. pInB = pInB + 1;
  191. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  192. acc0 = vmladavq_s8(vecA0, vecB);
  193. acc1 = vmladavq_s8(vecA1, vecB);
  194. acc2 = vmladavq_s8(vecA2, vecB);
  195. acc3 = vmladavq_s8(vecA3, vecB);
  196. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  197. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  198. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  199. pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
  200. pOut++;
  201. /* move to next B column */
  202. pInB = pInB + 1;
  203. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  204. acc0 = vmladavq_s8(vecA0, vecB);
  205. acc1 = vmladavq_s8(vecA1, vecB);
  206. acc2 = vmladavq_s8(vecA2, vecB);
  207. acc3 = vmladavq_s8(vecA3, vecB);
  208. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  209. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  210. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  211. pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
  212. /*
  213. * Return to application
  214. */
  215. return (ARM_MATH_SUCCESS);
  216. }
  217. arm_status arm_mat_mult_q7(
  218. const arm_matrix_instance_q7 * pSrcA,
  219. const arm_matrix_instance_q7 * pSrcB,
  220. arm_matrix_instance_q7 * pDst,
  221. q7_t * pState)
  222. {
  223. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
  224. q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
  225. q7_t *pInA2;
  226. q7_t *pInB2;
  227. q7_t *px; /* Temporary output data matrix pointer */
  228. q7_t *px2; /* Temporary output data matrix pointer */
  229. uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
  230. uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
  231. uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
  232. uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
  233. uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
  234. q7_t *pSrcBT = pState; /* input data matrix pointer for transpose */
  235. uint32_t blkCnt; /* loop counters */
  236. arm_status status; /* status of matrix multiplication */
  237. arm_matrix_instance_q7 BT;
  238. #ifdef ARM_MATH_MATRIX_CHECK
  239. /* Check for matrix mismatch condition */
  240. if ((pSrcA->numCols != pSrcB->numRows) ||
  241. (pSrcA->numRows != pDst->numRows) ||
  242. (pSrcB->numCols != pDst->numCols) )
  243. {
  244. /* Set status as ARM_MATH_SIZE_MISMATCH */
  245. status = ARM_MATH_SIZE_MISMATCH;
  246. }
  247. else
  248. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  249. {
  250. /* small squared matrix specialized routines */
  251. if(numRowsA == numColsB && numColsB == numColsA) {
  252. if(numRowsA == 2)
  253. return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
  254. else if(numRowsA == 3)
  255. return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
  256. else if (numRowsA == 4)
  257. return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
  258. }
  259. /*
  260. * Matrix transpose
  261. */
  262. BT.numRows = numColsB;
  263. BT.numCols = numRowsB;
  264. BT.pData = pSrcBT;
  265. arm_mat_trans_q7(pSrcB, &BT);
  266. /*
  267. * Reset the variables for the usage in the following multiplication process
  268. */
  269. i = 0;
  270. row = numRowsA >> 1;
  271. px = pDst->pData;
  272. px2 = px + numColsB;
  273. /*
  274. * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
  275. */
  276. /*
  277. * row loop
  278. */
  279. while (row > 0u)
  280. {
  281. /*
  282. * For every row wise process, the column loop counter is to be initiated
  283. */
  284. col = numColsB >> 1;
  285. /*
  286. * For every row wise process, the pIn2 pointer is set
  287. * to the starting address of the transposed pSrcB data
  288. */
  289. pInB = pSrcBT;
  290. pInB2 = pInB + numRowsB;
  291. j = 0;
  292. /*
  293. * column loop
  294. */
  295. while (col > 0u)
  296. {
  297. q7_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
  298. q7x16_t vecA, vecA2, vecB, vecB2;
  299. q31_t acc0, acc1, acc2, acc3;
  300. /*
  301. * Initiate the pointer pIn1 to point to the starting address of the column being processed
  302. */
  303. pInA = pSrcA->pData + i;
  304. pInA2 = pInA + numColsA;
  305. pInB = pSrcBT + j;
  306. pInB2 = pInB + numRowsB;
  307. pSrcAVec = (q7_t const *) pInA;
  308. pSrcA2Vec = (q7_t const *)pInA2;
  309. pSrcBVec = (q7_t const *) pInB;
  310. pSrcB2Vec = (q7_t const *)pInB2;
  311. acc0 = 0L;
  312. acc1 = 0L;
  313. acc2 = 0L;
  314. acc3 = 0L;
  315. vecA = vld1q(pSrcAVec);
  316. pSrcAVec += 16;
  317. blkCnt = numColsA >> 4;
  318. while (blkCnt > 0U)
  319. {
  320. vecB = vld1q(pSrcBVec);
  321. pSrcBVec += 16;
  322. acc0 = vmladavaq_s8(acc0, vecA, vecB);
  323. vecA2 = vld1q(pSrcA2Vec);
  324. pSrcA2Vec += 16;
  325. acc1 = vmladavaq_s8(acc1, vecA2, vecB);
  326. vecB2 = vld1q(pSrcB2Vec);
  327. pSrcB2Vec += 16;
  328. acc2 = vmladavaq_s8(acc2, vecA, vecB2);
  329. vecA = vld1q(pSrcAVec);
  330. pSrcAVec += 16;
  331. acc3 = vmladavaq_s8(acc3, vecA2, vecB2);
  332. blkCnt--;
  333. }
  334. /*
  335. * tail
  336. * (will be merged thru tail predication)
  337. */
  338. blkCnt = numColsA & 0xF;
  339. if (blkCnt > 0U)
  340. {
  341. mve_pred16_t p0 = vctp8q(blkCnt);
  342. vecB = vld1q(pSrcBVec);
  343. acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
  344. vecA2 = vld1q(pSrcA2Vec);
  345. acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0);
  346. vecB2 = vld1q(pSrcB2Vec);
  347. acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0);
  348. vecA = vld1q(pSrcAVec);
  349. acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0);
  350. }
  351. *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
  352. *px++ = (q7_t) __SSAT(acc2 >> 7, 8);
  353. *px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
  354. *px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
  355. j += numRowsB * 2;
  356. /*
  357. * Decrement the column loop counter
  358. */
  359. col--;
  360. }
  361. i = i + numColsA * 2;
  362. px = px2 + (numColsB & 1u);
  363. px2 = px + numColsB;
  364. /*
  365. * Decrement the row loop counter
  366. */
  367. row--;
  368. }
  369. /*
  370. * Compute remaining row and/or column below
  371. */
  372. if (numColsB & 1u)
  373. {
  374. row = numRowsA & (~0x1); //avoid redundant computation
  375. px = pDst->pData + numColsB - 1;
  376. i = 0;
  377. /*
  378. * row loop
  379. */
  380. while (row > 0)
  381. {
  382. q7_t const *pSrcAVec, *pSrcBVec;
  383. q7x16_t vecA, vecB;
  384. q63_t acc0;
  385. /*
  386. * point to last column in matrix B
  387. */
  388. pInB = pSrcBT + numRowsB * (numColsB - 1);
  389. pInA = pSrcA->pData + i;
  390. pSrcAVec = (q7_t const *) pInA;
  391. pSrcBVec = (q7_t const *) pInB;
  392. acc0 = 0LL;
  393. blkCnt = (numColsA) >> 4;
  394. while (blkCnt > 0U)
  395. {
  396. vecA = vld1q(pSrcAVec);
  397. pSrcAVec += 16;
  398. vecB = vld1q(pSrcBVec);
  399. pSrcBVec += 16;
  400. acc0 = vmladavaq_s8(acc0, vecA, vecB);
  401. blkCnt--;
  402. }
  403. /*
  404. * tail
  405. * (will be merged thru tail predication)
  406. */
  407. blkCnt = numColsA & 0xF;
  408. if (blkCnt > 0U)
  409. {
  410. mve_pred16_t p0 = vctp8q(blkCnt);
  411. vecA = vld1q(pSrcAVec);
  412. vecB = vld1q(pSrcBVec);
  413. acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
  414. }
  415. *px = (q7_t) __SSAT(acc0 >> 7, 8);
  416. px += numColsB;
  417. i += numColsA;
  418. /*
  419. * Decrement the row loop counter
  420. */
  421. row--;
  422. }
  423. }
  424. if (numRowsA & 1u)
  425. {
  426. col = numColsB;
  427. i = 0u;
  428. /*
  429. * point to last row in output matrix
  430. */
  431. px = pDst->pData + (numColsB) * (numRowsA - 1);
  432. /*
  433. * col loop
  434. */
  435. while (col > 0)
  436. {
  437. q7_t const *pSrcAVec, *pSrcBVec;
  438. q7x16_t vecA, vecB;
  439. q63_t acc0;
  440. /*
  441. * point to last row in matrix A
  442. */
  443. pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
  444. pInB = pSrcBT + i;
  445. /*
  446. * Set the variable sum, that acts as accumulator, to zero
  447. */
  448. pSrcAVec = (q7_t const *) pInA;
  449. pSrcBVec = (q7_t const *) pInB;
  450. acc0 = 0LL;
  451. blkCnt = (numColsA) >> 4;
  452. while (blkCnt > 0U)
  453. {
  454. vecA = vld1q(pSrcAVec);
  455. pSrcAVec += 16;
  456. vecB = vld1q(pSrcBVec);
  457. pSrcBVec += 16;
  458. acc0 = vmladavaq_s8(acc0, vecA, vecB);
  459. blkCnt--;
  460. }
  461. /*
  462. * tail
  463. * (will be merged thru tail predication)
  464. */
  465. blkCnt = numColsA & 0xF;
  466. if (blkCnt > 0U)
  467. {
  468. mve_pred16_t p0 = vctp8q(blkCnt);
  469. vecA = vld1q(pSrcAVec);
  470. vecB = vld1q(pSrcBVec);
  471. acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
  472. }
  473. *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
  474. i += numColsA;
  475. /*
  476. * Decrement the col loop counter
  477. */
  478. col--;
  479. }
  480. }
  481. /*
  482. * Return to application
  483. */
  484. status = ARM_MATH_SUCCESS;
  485. }
  486. return(status);
  487. }
  488. #else
  489. arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
  490. {
  491. q31_t sum; /* accumulator */
  492. q7_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
  493. q7_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
  494. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
  495. q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
  496. q7_t *pOut = pDst->pData; /* output data matrix pointer */
  497. q7_t *px; /* Temporary output data matrix pointer */
  498. uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
  499. uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
  500. uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
  501. uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
  502. arm_status status; /* status of matrix multiplication */
  503. (void)pState;
  504. #ifdef ARM_MATH_MATRIX_CHECK
  505. /* Check for matrix mismatch condition */
  506. if ((pSrcA->numCols != pSrcB->numRows) ||
  507. (pSrcA->numRows != pDst->numRows) ||
  508. (pSrcB->numCols != pDst->numCols) )
  509. {
  510. /* Set status as ARM_MATH_SIZE_MISMATCH */
  511. status = ARM_MATH_SIZE_MISMATCH;
  512. }
  513. else
  514. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  515. {
  516. /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  517. /* row loop */
  518. do {
  519. /* Output pointer is set to starting address of the row being processed */
  520. px = pOut + i;
  521. /* For every row wise process, the column loop counter is to be initiated */
  522. col = numColsB;
  523. /* For every row wise process, the pIn2 pointer is set
  524. ** to the starting address of the pSrcB data */
  525. pIn2 = pSrcB->pData;
  526. /* column loop */
  527. do {
  528. /* Set the variable sum, that acts as accumulator, to zero */
  529. sum = 0;
  530. /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
  531. pIn1 = pInA;
  532. /* Matrix A columns number of MAC operations are to be performed */
  533. colCnt = numColsA;
  534. /* matrix multiplication */
  535. while (colCnt > 0U) {
  536. /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  537. /* Perform the multiply-accumulates */
  538. sum += (q31_t)*pIn1++ * *pIn2;
  539. pIn2 += numColsB;
  540. /* Decrement the loop counter */
  541. colCnt--;
  542. }
  543. /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
  544. /* Saturate and store the result in the destination buffer */
  545. *px++ = (q7_t)__SSAT((sum >> 7), 8);
  546. /* Decrement the column loop counter */
  547. col--;
  548. /* Update the pointer pIn2 to point to the starting address of the next column */
  549. pIn2 = pInB + (numColsB - col);
  550. } while (col > 0U);
  551. /* Update the pointer pSrcA to point to the starting address of the next row */
  552. i = i + numColsB;
  553. pInA = pInA + numColsA;
  554. /* Decrement the row loop counter */
  555. row--;
  556. } while (row > 0U);
  557. /* set status as ARM_MATH_SUCCESS */
  558. status = ARM_MATH_SUCCESS;
  559. }
  560. /* Return to application */
  561. return (status);
  562. }
  563. #endif /* defined(ARM_MATH_MVEI) */
  564. /**
  565. @} end of MatrixMult group
  566. */