arm_mat_trans_q15.c 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_mat_trans_q15.c
  4. * Description: Q15 matrix transpose
  5. *
  6. * $Date: 18. March 2019
  7. * $Revision: V1.6.0
  8. *
  9. * Target Processor: Cortex-M cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "arm_math.h"
  29. /**
  30. @ingroup groupMatrix
  31. */
  32. /**
  33. @addtogroup MatrixTrans
  34. @{
  35. */
  36. /**
  37. @brief Q15 matrix transpose.
  38. @param[in] pSrc points to input matrix
  39. @param[out] pDst points to output matrix
  40. @return execution status
  41. - \ref ARM_MATH_SUCCESS : Operation successful
  42. - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
  43. */
  44. #if defined(ARM_MATH_MVEI)
  45. __STATIC_INLINE arm_status arm_mat_trans_16bit_2x2(uint16_t * pDataSrc, uint16_t * pDataDest)
  46. {
  47. pDataDest[0] = pDataSrc[0];
  48. pDataDest[3] = pDataSrc[3];
  49. pDataDest[2] = pDataSrc[1];
  50. pDataDest[1] = pDataSrc[2];
  51. return (ARM_MATH_SUCCESS);
  52. }
  53. static arm_status arm_mat_trans_16bit_3x3_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
  54. {
  55. static const uint16_t stridesTr33[8] = { 0, 3, 6, 1, 4, 7, 2, 5 };
  56. uint16x8_t vecOffs1;
  57. uint16x8_t vecIn1;
  58. /*
  59. *
  60. * | 0 1 2 | | 0 3 6 | 8 x 16 flattened version | 0 3 6 1 4 7 2 5 |
  61. * | 3 4 5 | => | 1 4 7 | => | 8 . . . . . . . |
  62. * | 6 7 8 | | 2 5 8 | (row major)
  63. *
  64. */
  65. vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr33);
  66. vecIn1 = vldrhq_u16((uint16_t const *) pDataSrc);
  67. vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
  68. pDataDest[8] = pDataSrc[8];
  69. return (ARM_MATH_SUCCESS);
  70. }
  71. static arm_status arm_mat_trans_16bit_4x4_mve(uint16_t * pDataSrc, uint16_t * pDataDest)
  72. {
  73. static const uint16_t stridesTr44_1[8] = { 0, 4, 8, 12, 1, 5, 9, 13 };
  74. static const uint16_t stridesTr44_2[8] = { 2, 6, 10, 14, 3, 7, 11, 15 };
  75. uint16x8_t vecOffs1, vecOffs2;
  76. uint16x8_t vecIn1, vecIn2;
  77. uint16_t const * pDataSrcVec = (uint16_t const *) pDataSrc;
  78. /*
  79. * 4x4 Matrix transposition
  80. *
  81. * | 0 1 2 3 | | 0 4 8 12 | 8 x 16 flattened version
  82. * | 4 5 6 7 | => | 1 5 9 13 | => [0 4 8 12 1 5 9 13]
  83. * | 8 9 10 11 | | 2 6 10 14 | [2 6 10 14 3 7 11 15]
  84. * | 12 13 14 15 | | 3 7 11 15 |
  85. */
  86. vecOffs1 = vldrhq_u16((uint16_t const *) stridesTr44_1);
  87. vecOffs2 = vldrhq_u16((uint16_t const *) stridesTr44_2);
  88. vecIn1 = vldrhq_u16(pDataSrcVec);
  89. pDataSrcVec += 8;
  90. vecIn2 = vldrhq_u16(pDataSrcVec);
  91. vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs1, vecIn1);
  92. vstrhq_scatter_shifted_offset_u16(pDataDest, vecOffs2, vecIn2);
  93. return (ARM_MATH_SUCCESS);
  94. }
  95. static arm_status arm_mat_trans_16bit_generic(
  96. uint16_t srcRows,
  97. uint16_t srcCols,
  98. uint16_t * pDataSrc,
  99. uint16_t * pDataDest)
  100. {
  101. uint16x8_t vecOffs;
  102. uint32_t i;
  103. uint32_t blkCnt;
  104. uint16_t const *pDataC;
  105. uint16_t *pDataDestR;
  106. uint16x8_t vecIn;
  107. vecOffs = vidupq_u16((uint32_t)0, 1);
  108. vecOffs = vecOffs * srcCols;
  109. i = srcCols;
  110. while(i > 0U)
  111. {
  112. pDataC = (uint16_t const *) pDataSrc;
  113. pDataDestR = pDataDest;
  114. blkCnt = srcRows >> 3;
  115. while (blkCnt > 0U)
  116. {
  117. vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
  118. vstrhq_u16(pDataDestR, vecIn);
  119. pDataDestR += 8;
  120. pDataC = pDataC + srcCols * 8;
  121. /*
  122. * Decrement the blockSize loop counter
  123. */
  124. blkCnt--;
  125. }
  126. /*
  127. * tail
  128. */
  129. blkCnt = srcRows & 7;
  130. if (blkCnt > 0U)
  131. {
  132. mve_pred16_t p0 = vctp16q(blkCnt);
  133. vecIn = vldrhq_gather_shifted_offset_u16(pDataC, vecOffs);
  134. vstrhq_p_u16(pDataDestR, vecIn, p0);
  135. }
  136. pDataSrc += 1;
  137. pDataDest += srcRows;
  138. i--;
  139. }
  140. return (ARM_MATH_SUCCESS);
  141. }
  142. arm_status arm_mat_trans_q15(
  143. const arm_matrix_instance_q15 * pSrc,
  144. arm_matrix_instance_q15 * pDst)
  145. {
  146. arm_status status; /* status of matrix transpose */
  147. #ifdef ARM_MATH_MATRIX_CHECK
  148. /* Check for matrix mismatch condition */
  149. if ((pSrc->numRows != pDst->numCols) ||
  150. (pSrc->numCols != pDst->numRows) )
  151. {
  152. /* Set status as ARM_MATH_SIZE_MISMATCH */
  153. status = ARM_MATH_SIZE_MISMATCH;
  154. }
  155. else
  156. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  157. {
  158. if (pDst->numRows == pDst->numCols)
  159. {
  160. if (pDst->numCols == 1)
  161. {
  162. pDst->pData[0] = pSrc->pData[0];
  163. return(ARM_MATH_SUCCESS);
  164. }
  165. if (pDst->numCols == 2)
  166. return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
  167. if (pDst->numCols == 3)
  168. return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
  169. if (pDst->numCols == 4)
  170. return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
  171. }
  172. arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
  173. /* Set status as ARM_MATH_SUCCESS */
  174. status = ARM_MATH_SUCCESS;
  175. }
  176. /* Return to application */
  177. return (status);
  178. }
  179. #else
  180. arm_status arm_mat_trans_q15(
  181. const arm_matrix_instance_q15 * pSrc,
  182. arm_matrix_instance_q15 * pDst)
  183. {
  184. q15_t *pIn = pSrc->pData; /* input data matrix pointer */
  185. q15_t *pOut = pDst->pData; /* output data matrix pointer */
  186. uint16_t nRows = pSrc->numRows; /* number of rows */
  187. uint16_t nCols = pSrc->numCols; /* number of columns */
  188. uint32_t col, row = nRows, i = 0U; /* Loop counters */
  189. arm_status status; /* status of matrix transpose */
  190. #if defined (ARM_MATH_LOOPUNROLL)
  191. q31_t in; /* variable to hold temporary output */
  192. #endif
  193. #ifdef ARM_MATH_MATRIX_CHECK
  194. /* Check for matrix mismatch condition */
  195. if ((pSrc->numRows != pDst->numCols) ||
  196. (pSrc->numCols != pDst->numRows) )
  197. {
  198. /* Set status as ARM_MATH_SIZE_MISMATCH */
  199. status = ARM_MATH_SIZE_MISMATCH;
  200. }
  201. else
  202. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  203. {
  204. /* Matrix transpose by exchanging the rows with columns */
  205. /* row loop */
  206. do
  207. {
  208. /* Pointer pOut is set to starting address of column being processed */
  209. pOut = pDst->pData + i;
  210. #if defined (ARM_MATH_LOOPUNROLL)
  211. /* Loop unrolling: Compute 4 outputs at a time */
  212. col = nCols >> 2U;
  213. while (col > 0U) /* column loop */
  214. {
  215. /* Read two elements from row */
  216. in = read_q15x2_ia ((q15_t **) &pIn);
  217. /* Unpack and store one element in destination */
  218. #ifndef ARM_MATH_BIG_ENDIAN
  219. *pOut = (q15_t) in;
  220. #else
  221. *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  222. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  223. /* Update pointer pOut to point to next row of transposed matrix */
  224. pOut += nRows;
  225. /* Unpack and store second element in destination */
  226. #ifndef ARM_MATH_BIG_ENDIAN
  227. *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  228. #else
  229. *pOut = (q15_t) in;
  230. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  231. /* Update pointer pOut to point to next row of transposed matrix */
  232. pOut += nRows;
  233. /* Read two elements from row */
  234. in = read_q15x2_ia ((q15_t **) &pIn);
  235. /* Unpack and store one element in destination */
  236. #ifndef ARM_MATH_BIG_ENDIAN
  237. *pOut = (q15_t) in;
  238. #else
  239. *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  240. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  241. /* Update pointer pOut to point to next row of transposed matrix */
  242. pOut += nRows;
  243. /* Unpack and store second element in destination */
  244. #ifndef ARM_MATH_BIG_ENDIAN
  245. *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
  246. #else
  247. *pOut = (q15_t) in;
  248. #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
  249. /* Update pointer pOut to point to next row of transposed matrix */
  250. pOut += nRows;
  251. /* Decrement column loop counter */
  252. col--;
  253. }
  254. /* Loop unrolling: Compute remaining outputs */
  255. col = nCols % 0x4U;
  256. #else
  257. /* Initialize col with number of samples */
  258. col = nCols;
  259. #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  260. while (col > 0U)
  261. {
  262. /* Read and store input element in destination */
  263. *pOut = *pIn++;
  264. /* Update pointer pOut to point to next row of transposed matrix */
  265. pOut += nRows;
  266. /* Decrement column loop counter */
  267. col--;
  268. }
  269. i++;
  270. /* Decrement row loop counter */
  271. row--;
  272. } while (row > 0U); /* row loop end */
  273. /* Set status as ARM_MATH_SUCCESS */
  274. status = ARM_MATH_SUCCESS;
  275. }
  276. /* Return to application */
  277. return (status);
  278. }
  279. #endif /* defined(ARM_MATH_MVEI) */
  280. /**
  281. @} end of MatrixTrans group
  282. */