arm_mat_mult_q7.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_mat_mult_q7.c
  4. * Description: Q7 matrix multiplication
  5. *
  6. * $Date: 23 April 2021
  7. *
  8. * $Revision: V1.9.0
  9. *
  10. * Target Processor: Cortex-M and Cortex-A cores
  11. * -------------------------------------------------------------------- */
  12. /*
  13. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  14. *
  15. * SPDX-License-Identifier: Apache-2.0
  16. *
  17. * Licensed under the Apache License, Version 2.0 (the License); you may
  18. * not use this file except in compliance with the License.
  19. * You may obtain a copy of the License at
  20. *
  21. * www.apache.org/licenses/LICENSE-2.0
  22. *
  23. * Unless required by applicable law or agreed to in writing, software
  24. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  25. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  26. * See the License for the specific language governing permissions and
  27. * limitations under the License.
  28. */
  29. #include "dsp/matrix_functions.h"
  30. /**
  31. @ingroup groupMatrix
  32. */
  33. /**
  34. @addtogroup MatrixMult
  35. @{
  36. */
  37. /**
  38. * @brief Q7 matrix multiplication
  39. * @param[in] *pSrcA points to the first input matrix structure
  40. * @param[in] *pSrcB points to the second input matrix structure
  41. * @param[out] *pDst points to output matrix structure
  42. * @param[in] *pState points to the array for storing intermediate results (Unused in some versions)
  43. * @return The function returns either
  44. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  45. *
  46. * @details
  47. * <b>Scaling and Overflow Behavior:</b>
  48. *
  49. * \par
  50. * The products are summed in a 32-bit internal accumulator; the sum is then shifted right by 7 bits and saturated to 1.7 format.
  51. *
  52. *
  53. */
  54. #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
  55. __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
  56. const arm_matrix_instance_q7 * pSrcA,
  57. const arm_matrix_instance_q7 * pSrcB,
  58. arm_matrix_instance_q7 * pDst)
  59. {
  60. const uint32_t MATRIX_DIM = 2;
  61. q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
  62. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
  63. q7_t *pOut = pDst->pData; /* output data matrix pointer */
  64. uint8x16_t vecColBOffs;
  65. q7_t *pInA0 = pInA;
  66. q7_t *pInA1 = pInA0 + MATRIX_DIM;
  67. q31_t acc0, acc1;
  68. q7x16_t vecB, vecA0, vecA1;
  69. mve_pred16_t p0 = vctp8q(MATRIX_DIM);
  70. vecColBOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
  71. pInB = pSrcB->pData;
  72. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  73. vecA0 = vldrbq_s8(pInA0);
  74. vecA1 = vldrbq_s8(pInA1);
  75. acc0 = vmladavq_s8(vecA0, vecB);
  76. acc1 = vmladavq_s8(vecA1, vecB);
  77. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  78. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  79. pOut++;
  80. /* move to next B column */
  81. pInB = pInB + 1;
  82. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  83. acc0 = vmladavq_s8(vecA0, vecB);
  84. acc1 = vmladavq_s8(vecA1, vecB);
  85. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  86. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  87. /*
  88. * Return to application
  89. */
  90. return (ARM_MATH_SUCCESS);
  91. }
  92. __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
  93. const arm_matrix_instance_q7 * pSrcA,
  94. const arm_matrix_instance_q7 * pSrcB,
  95. arm_matrix_instance_q7 * pDst)
  96. {
  97. const uint8_t MATRIX_DIM = 3;
  98. q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
  99. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
  100. q7_t *pOut = pDst->pData; /* output data matrix pointer */
  101. uint8x16_t vecColBOffs;
  102. q7_t *pInA0 = pInA;
  103. q7_t *pInA1 = pInA0 + MATRIX_DIM;
  104. q7_t *pInA2 = pInA1 + MATRIX_DIM;
  105. q31_t acc0, acc1, acc2;
  106. q7x16_t vecB, vecA0, vecA1, vecA2;
  107. mve_pred16_t p0 = vctp8q(MATRIX_DIM);
  108. vecColBOffs = vidupq_u8((uint32_t)0, 1);
  109. vecColBOffs = vecColBOffs * MATRIX_DIM;
  110. pInB = pSrcB->pData;
  111. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  112. vecA0 = vldrbq_s8(pInA0);
  113. vecA1 = vldrbq_s8(pInA1);
  114. vecA2 = vldrbq_s8(pInA2);
  115. acc0 = vmladavq_s8(vecA0, vecB);
  116. acc1 = vmladavq_s8(vecA1, vecB);
  117. acc2 = vmladavq_s8(vecA2, vecB);
  118. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  119. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  120. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  121. pOut++;
  122. /* move to next B column */
  123. pInB = pInB + 1;
  124. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  125. acc0 = vmladavq_s8(vecA0, vecB);
  126. acc1 = vmladavq_s8(vecA1, vecB);
  127. acc2 = vmladavq_s8(vecA2, vecB);
  128. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  129. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  130. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  131. pOut++;
  132. /* move to next B column */
  133. pInB = pInB + 1;
  134. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  135. acc0 = vmladavq_s8(vecA0, vecB);
  136. acc1 = vmladavq_s8(vecA1, vecB);
  137. acc2 = vmladavq_s8(vecA2, vecB);
  138. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  139. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  140. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  141. /*
  142. * Return to application
  143. */
  144. return (ARM_MATH_SUCCESS);
  145. }
  146. __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
  147. const arm_matrix_instance_q7 * pSrcA,
  148. const arm_matrix_instance_q7 * pSrcB,
  149. arm_matrix_instance_q7 * pDst)
  150. {
  151. const uint32_t MATRIX_DIM = 4;
  152. q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
  153. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
  154. q7_t *pOut = pDst->pData; /* output data matrix pointer */
  155. uint8x16_t vecColBOffs;
  156. q7_t *pInA0 = pInA;
  157. q7_t *pInA1 = pInA0 + MATRIX_DIM;
  158. q7_t *pInA2 = pInA1 + MATRIX_DIM;
  159. q7_t *pInA3 = pInA2 + MATRIX_DIM;
  160. q31_t acc0, acc1, acc2, acc3;
  161. q7x16_t vecB, vecA0, vecA1, vecA2, vecA3;
  162. mve_pred16_t p0 = vctp8q(MATRIX_DIM);
  163. vecColBOffs = vidupq_u8((uint32_t)0, 4);
  164. pInB = pSrcB->pData;
  165. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  166. vecA0 = vldrbq_s8(pInA0);
  167. vecA1 = vldrbq_s8(pInA1);
  168. vecA2 = vldrbq_s8(pInA2);
  169. vecA3 = vldrbq_s8(pInA3);
  170. acc0 = vmladavq_s8(vecA0, vecB);
  171. acc1 = vmladavq_s8(vecA1, vecB);
  172. acc2 = vmladavq_s8(vecA2, vecB);
  173. acc3 = vmladavq_s8(vecA3, vecB);
  174. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  175. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  176. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  177. pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
  178. pOut++;
  179. /* move to next B column */
  180. pInB = pInB + 1;
  181. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  182. acc0 = vmladavq_s8(vecA0, vecB);
  183. acc1 = vmladavq_s8(vecA1, vecB);
  184. acc2 = vmladavq_s8(vecA2, vecB);
  185. acc3 = vmladavq_s8(vecA3, vecB);
  186. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  187. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  188. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  189. pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
  190. pOut++;
  191. /* move to next B column */
  192. pInB = pInB + 1;
  193. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  194. acc0 = vmladavq_s8(vecA0, vecB);
  195. acc1 = vmladavq_s8(vecA1, vecB);
  196. acc2 = vmladavq_s8(vecA2, vecB);
  197. acc3 = vmladavq_s8(vecA3, vecB);
  198. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  199. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  200. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  201. pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
  202. pOut++;
  203. /* move to next B column */
  204. pInB = pInB + 1;
  205. vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
  206. acc0 = vmladavq_s8(vecA0, vecB);
  207. acc1 = vmladavq_s8(vecA1, vecB);
  208. acc2 = vmladavq_s8(vecA2, vecB);
  209. acc3 = vmladavq_s8(vecA3, vecB);
  210. pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
  211. pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
  212. pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
  213. pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
  214. /*
  215. * Return to application
  216. */
  217. return (ARM_MATH_SUCCESS);
  218. }
  219. arm_status arm_mat_mult_q7(
  220. const arm_matrix_instance_q7 * pSrcA,
  221. const arm_matrix_instance_q7 * pSrcB,
  222. arm_matrix_instance_q7 * pDst,
  223. q7_t * pState)
  224. {
  225. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
  226. q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
  227. q7_t *pInA2;
  228. q7_t *pInB2;
  229. q7_t *px; /* Temporary output data matrix pointer */
  230. q7_t *px2; /* Temporary output data matrix pointer */
  231. uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
  232. uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
  233. uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
  234. uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
  235. uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
  236. q7_t *pSrcBT = pState; /* input data matrix pointer for transpose */
  237. uint32_t blkCnt; /* loop counters */
  238. arm_status status; /* status of matrix multiplication */
  239. arm_matrix_instance_q7 BT;
  240. #ifdef ARM_MATH_MATRIX_CHECK
  241. /* Check for matrix mismatch condition */
  242. if ((pSrcA->numCols != pSrcB->numRows) ||
  243. (pSrcA->numRows != pDst->numRows) ||
  244. (pSrcB->numCols != pDst->numCols) )
  245. {
  246. /* Set status as ARM_MATH_SIZE_MISMATCH */
  247. status = ARM_MATH_SIZE_MISMATCH;
  248. }
  249. else
  250. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  251. {
  252. /* small squared matrix specialized routines */
  253. if(numRowsA == numColsB && numColsB == numColsA) {
  254. if(numRowsA == 2)
  255. return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
  256. else if(numRowsA == 3)
  257. return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
  258. else if (numRowsA == 4)
  259. return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
  260. }
  261. /*
  262. * Matrix transpose
  263. */
  264. BT.numRows = numColsB;
  265. BT.numCols = numRowsB;
  266. BT.pData = pSrcBT;
  267. arm_mat_trans_q7(pSrcB, &BT);
  268. /*
  269. * Reset the variables for the usage in the following multiplication process
  270. */
  271. i = 0;
  272. row = numRowsA >> 1;
  273. px = pDst->pData;
  274. px2 = px + numColsB;
  275. /*
  276. * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
  277. */
  278. /*
  279. * row loop
  280. */
  281. while (row > 0u)
  282. {
  283. /*
  284. * For every row wise process, the column loop counter is to be initiated
  285. */
  286. col = numColsB >> 1;
  287. /*
  288. * For every row wise process, the pIn2 pointer is set
  289. * to the starting address of the transposed pSrcB data
  290. */
  291. pInB = pSrcBT;
  292. pInB2 = pInB + numRowsB;
  293. j = 0;
  294. /*
  295. * column loop
  296. */
  297. while (col > 0u)
  298. {
  299. q7_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
  300. q7x16_t vecA, vecA2, vecB, vecB2;
  301. q31_t acc0, acc1, acc2, acc3;
  302. /*
  303. * Initiate the pointer pIn1 to point to the starting address of the column being processed
  304. */
  305. pInA = pSrcA->pData + i;
  306. pInA2 = pInA + numColsA;
  307. pInB = pSrcBT + j;
  308. pInB2 = pInB + numRowsB;
  309. pSrcAVec = (q7_t const *) pInA;
  310. pSrcA2Vec = (q7_t const *)pInA2;
  311. pSrcBVec = (q7_t const *) pInB;
  312. pSrcB2Vec = (q7_t const *)pInB2;
  313. acc0 = 0L;
  314. acc1 = 0L;
  315. acc2 = 0L;
  316. acc3 = 0L;
  317. vecA = vld1q(pSrcAVec);
  318. pSrcAVec += 16;
  319. blkCnt = numColsA >> 4;
  320. while (blkCnt > 0U)
  321. {
  322. vecB = vld1q(pSrcBVec);
  323. pSrcBVec += 16;
  324. acc0 = vmladavaq_s8(acc0, vecA, vecB);
  325. vecA2 = vld1q(pSrcA2Vec);
  326. pSrcA2Vec += 16;
  327. acc1 = vmladavaq_s8(acc1, vecA2, vecB);
  328. vecB2 = vld1q(pSrcB2Vec);
  329. pSrcB2Vec += 16;
  330. acc2 = vmladavaq_s8(acc2, vecA, vecB2);
  331. vecA = vld1q(pSrcAVec);
  332. pSrcAVec += 16;
  333. acc3 = vmladavaq_s8(acc3, vecA2, vecB2);
  334. blkCnt--;
  335. }
  336. /*
  337. * tail
  338. * (will be merged thru tail predication)
  339. */
  340. blkCnt = numColsA & 0xF;
  341. if (blkCnt > 0U)
  342. {
  343. mve_pred16_t p0 = vctp8q(blkCnt);
  344. vecB = vld1q(pSrcBVec);
  345. acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
  346. vecA2 = vld1q(pSrcA2Vec);
  347. acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0);
  348. vecB2 = vld1q(pSrcB2Vec);
  349. acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0);
  350. vecA = vld1q(pSrcAVec);
  351. acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0);
  352. }
  353. *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
  354. *px++ = (q7_t) __SSAT(acc2 >> 7, 8);
  355. *px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
  356. *px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
  357. j += numRowsB * 2;
  358. /*
  359. * Decrement the column loop counter
  360. */
  361. col--;
  362. }
  363. i = i + numColsA * 2;
  364. px = px2 + (numColsB & 1u);
  365. px2 = px + numColsB;
  366. /*
  367. * Decrement the row loop counter
  368. */
  369. row--;
  370. }
  371. /*
  372. * Compute remaining row and/or column below
  373. */
  374. if (numColsB & 1u)
  375. {
  376. row = numRowsA & (~0x1); //avoid redundant computation
  377. px = pDst->pData + numColsB - 1;
  378. i = 0;
  379. /*
  380. * row loop
  381. */
  382. while (row > 0)
  383. {
  384. q7_t const *pSrcAVec, *pSrcBVec;
  385. q7x16_t vecA, vecB;
  386. q63_t acc0;
  387. /*
  388. * point to last column in matrix B
  389. */
  390. pInB = pSrcBT + numRowsB * (numColsB - 1);
  391. pInA = pSrcA->pData + i;
  392. pSrcAVec = (q7_t const *) pInA;
  393. pSrcBVec = (q7_t const *) pInB;
  394. acc0 = 0LL;
  395. blkCnt = (numColsA) >> 4;
  396. while (blkCnt > 0U)
  397. {
  398. vecA = vld1q(pSrcAVec);
  399. pSrcAVec += 16;
  400. vecB = vld1q(pSrcBVec);
  401. pSrcBVec += 16;
  402. acc0 = vmladavaq_s8(acc0, vecA, vecB);
  403. blkCnt--;
  404. }
  405. /*
  406. * tail
  407. * (will be merged thru tail predication)
  408. */
  409. blkCnt = numColsA & 0xF;
  410. if (blkCnt > 0U)
  411. {
  412. mve_pred16_t p0 = vctp8q(blkCnt);
  413. vecA = vld1q(pSrcAVec);
  414. vecB = vld1q(pSrcBVec);
  415. acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
  416. }
  417. *px = (q7_t) __SSAT(acc0 >> 7, 8);
  418. px += numColsB;
  419. i += numColsA;
  420. /*
  421. * Decrement the row loop counter
  422. */
  423. row--;
  424. }
  425. }
  426. if (numRowsA & 1u)
  427. {
  428. col = numColsB;
  429. i = 0u;
  430. /*
  431. * point to last row in output matrix
  432. */
  433. px = pDst->pData + (numColsB) * (numRowsA - 1);
  434. /*
  435. * col loop
  436. */
  437. while (col > 0)
  438. {
  439. q7_t const *pSrcAVec, *pSrcBVec;
  440. q7x16_t vecA, vecB;
  441. q63_t acc0;
  442. /*
  443. * point to last row in matrix A
  444. */
  445. pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
  446. pInB = pSrcBT + i;
  447. /*
  448. * Set the variable sum, that acts as accumulator, to zero
  449. */
  450. pSrcAVec = (q7_t const *) pInA;
  451. pSrcBVec = (q7_t const *) pInB;
  452. acc0 = 0LL;
  453. blkCnt = (numColsA) >> 4;
  454. while (blkCnt > 0U)
  455. {
  456. vecA = vld1q(pSrcAVec);
  457. pSrcAVec += 16;
  458. vecB = vld1q(pSrcBVec);
  459. pSrcBVec += 16;
  460. acc0 = vmladavaq_s8(acc0, vecA, vecB);
  461. blkCnt--;
  462. }
  463. /*
  464. * tail
  465. * (will be merged thru tail predication)
  466. */
  467. blkCnt = numColsA & 0xF;
  468. if (blkCnt > 0U)
  469. {
  470. mve_pred16_t p0 = vctp8q(blkCnt);
  471. vecA = vld1q(pSrcAVec);
  472. vecB = vld1q(pSrcBVec);
  473. acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
  474. }
  475. *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
  476. i += numColsA;
  477. /*
  478. * Decrement the col loop counter
  479. */
  480. col--;
  481. }
  482. }
  483. /*
  484. * Return to application
  485. */
  486. status = ARM_MATH_SUCCESS;
  487. }
  488. return(status);
  489. }
  490. #else
  491. arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
  492. {
  493. q31_t sum; /* accumulator */
  494. q7_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
  495. q7_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
  496. q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
  497. q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
  498. q7_t *pOut = pDst->pData; /* output data matrix pointer */
  499. q7_t *px; /* Temporary output data matrix pointer */
  500. uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
  501. uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
  502. uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
  503. uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
  504. arm_status status; /* status of matrix multiplication */
  505. (void)pState;
  506. #ifdef ARM_MATH_MATRIX_CHECK
  507. /* Check for matrix mismatch condition */
  508. if ((pSrcA->numCols != pSrcB->numRows) ||
  509. (pSrcA->numRows != pDst->numRows) ||
  510. (pSrcB->numCols != pDst->numCols) )
  511. {
  512. /* Set status as ARM_MATH_SIZE_MISMATCH */
  513. status = ARM_MATH_SIZE_MISMATCH;
  514. }
  515. else
  516. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  517. {
  518. /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
  519. /* row loop */
  520. do {
  521. /* Output pointer is set to starting address of the row being processed */
  522. px = pOut + i;
  523. /* For every row wise process, the column loop counter is to be initiated */
  524. col = numColsB;
  525. /* For every row wise process, the pIn2 pointer is set
  526. ** to the starting address of the pSrcB data */
  527. pIn2 = pSrcB->pData;
  528. /* column loop */
  529. do {
  530. /* Set the variable sum, that acts as accumulator, to zero */
  531. sum = 0;
  532. /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
  533. pIn1 = pInA;
  534. /* Matrix A columns number of MAC operations are to be performed */
  535. colCnt = numColsA;
  536. /* matrix multiplication */
  537. while (colCnt > 0U) {
  538. /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
  539. /* Perform the multiply-accumulates */
  540. sum += (q31_t)*pIn1++ * *pIn2;
  541. pIn2 += numColsB;
  542. /* Decrement the loop counter */
  543. colCnt--;
  544. }
  545. /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
  546. /* Saturate and store the result in the destination buffer */
  547. *px++ = (q7_t)__SSAT((sum >> 7), 8);
  548. /* Decrement the column loop counter */
  549. col--;
  550. /* Update the pointer pIn2 to point to the starting address of the next column */
  551. pIn2 = pInB + (numColsB - col);
  552. } while (col > 0U);
  553. /* Update the pointer pSrcA to point to the starting address of the next row */
  554. i = i + numColsB;
  555. pInA = pInA + numColsA;
  556. /* Decrement the row loop counter */
  557. row--;
  558. } while (row > 0U);
  559. /* set status as ARM_MATH_SUCCESS */
  560. status = ARM_MATH_SUCCESS;
  561. }
  562. /* Return to application */
  563. return (status);
  564. }
  565. #endif /* defined(ARM_MATH_MVEI) */
  566. /**
  567. @} end of MatrixMult group
  568. */