arm_mat_trans_f32.c 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_mat_trans_f32.c
  4. * Description: Floating-point matrix transpose
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/matrix_functions.h"
  29. /**
  30. @ingroup groupMatrix
  31. */
  32. /**
  33. @defgroup MatrixTrans Matrix Transpose
  34. Transposes a matrix.
  35. Transposing an <code>M x N</code> matrix flips it around the center diagonal and results in an <code>N x M</code> matrix.
  36. @par Transpose of a 3 x 3 matrix
  37. \f[
  38. \begin{pmatrix}
  39. a_{1,1} & a_{1,2} & a_{1,3} \\
  40. a_{2,1} & a_{2,2} & a_{2,3} \\
  41. a_{3,1} & a_{3,2} & a_{3,3} \\
  42. \end{pmatrix}^T
  43. =
  44. \begin{pmatrix}
  45. a_{1,1} & a_{2,1} & a_{3,1} \\
  46. a_{1,2} & a_{2,2} & a_{3,2} \\
  47. a_{1,3} & a_{2,3} & a_{3,3} \\
  48. \end{pmatrix}
  49. \f]
  50. */
  51. /**
  52. @addtogroup MatrixTrans
  53. @{
  54. */
  55. /**
  56. @brief Floating-point matrix transpose.
  57. @param[in] pSrc points to input matrix
  58. @param[out] pDst points to output matrix
  59. @return execution status
  60. - \ref ARM_MATH_SUCCESS : Operation successful
  61. - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
  62. */
  63. #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
  64. #include "arm_helium_utils.h"
  65. arm_status arm_mat_trans_f32(
  66. const arm_matrix_instance_f32 * pSrc,
  67. arm_matrix_instance_f32 * pDst)
  68. {
  69. arm_status status; /* status of matrix transpose */
  70. #ifdef ARM_MATH_MATRIX_CHECK
  71. /* Check for matrix mismatch condition */
  72. if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
  73. {
  74. /* Set status as ARM_MATH_SIZE_MISMATCH */
  75. status = ARM_MATH_SIZE_MISMATCH;
  76. }
  77. else
  78. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  79. {
  80. if (pDst->numRows == pDst->numCols)
  81. {
  82. if (pDst->numCols == 2)
  83. return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
  84. if (pDst->numCols == 3)
  85. return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
  86. if (pDst->numCols == 4)
  87. return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
  88. }
  89. arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
  90. /* Set status as ARM_MATH_SUCCESS */
  91. status = ARM_MATH_SUCCESS;
  92. }
  93. /* Return to application */
  94. return (status);
  95. }
  96. #else
  97. #if defined(ARM_MATH_NEON)
  98. arm_status arm_mat_trans_f32(
  99. const arm_matrix_instance_f32 * pSrc,
  100. arm_matrix_instance_f32 * pDst)
  101. {
  102. float32_t *pIn = pSrc->pData; /* input data matrix pointer */
  103. float32_t *pOut = pDst->pData; /* output data matrix pointer */
  104. float32_t *px; /* Temporary output data matrix pointer */
  105. uint16_t nRows = pSrc->numRows; /* number of rows */
  106. uint16_t nColumns = pSrc->numCols; /* number of columns */
  107. uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
  108. arm_status status; /* status of matrix transpose */
  109. #ifdef ARM_MATH_MATRIX_CHECK
  110. /* Check for matrix mismatch condition */
  111. if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
  112. {
  113. /* Set status as ARM_MATH_SIZE_MISMATCH */
  114. status = ARM_MATH_SIZE_MISMATCH;
  115. }
  116. else
  117. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  118. {
  119. /* Matrix transpose by exchanging the rows with columns */
  120. /* Row loop */
  121. rowCnt = row >> 2;
  122. while (rowCnt > 0U)
  123. {
  124. float32x4_t row0V,row1V,row2V,row3V;
  125. float32x4x2_t ra0,ra1,rb0,rb1;
  126. blkCnt = nColumns >> 2;
  127. /* The pointer px is set to starting address of the column being processed */
  128. px = pOut + i;
  129. /* Compute 4 outputs at a time.
  130. ** a second loop below computes the remaining 1 to 3 samples. */
  131. while (blkCnt > 0U) /* Column loop */
  132. {
  133. row0V = vld1q_f32(pIn);
  134. row1V = vld1q_f32(pIn + 1 * nColumns);
  135. row2V = vld1q_f32(pIn + 2 * nColumns);
  136. row3V = vld1q_f32(pIn + 3 * nColumns);
  137. pIn += 4;
  138. ra0 = vzipq_f32(row0V,row2V);
  139. ra1 = vzipq_f32(row1V,row3V);
  140. rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
  141. rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
  142. vst1q_f32(px,rb0.val[0]);
  143. px += nRows;
  144. vst1q_f32(px,rb0.val[1]);
  145. px += nRows;
  146. vst1q_f32(px,rb1.val[0]);
  147. px += nRows;
  148. vst1q_f32(px,rb1.val[1]);
  149. px += nRows;
  150. /* Decrement the column loop counter */
  151. blkCnt--;
  152. }
  153. /* Perform matrix transpose for last 3 samples here. */
  154. blkCnt = nColumns % 0x4U;
  155. while (blkCnt > 0U)
  156. {
  157. /* Read and store the input element in the destination */
  158. *px++ = *pIn;
  159. *px++ = *(pIn + 1 * nColumns);
  160. *px++ = *(pIn + 2 * nColumns);
  161. *px++ = *(pIn + 3 * nColumns);
  162. px += (nRows - 4);
  163. pIn++;
  164. /* Decrement the column loop counter */
  165. blkCnt--;
  166. }
  167. i += 4;
  168. pIn += 3 * nColumns;
  169. /* Decrement the row loop counter */
  170. rowCnt--;
  171. } /* Row loop end */
  172. rowCnt = row & 3;
  173. while (rowCnt > 0U)
  174. {
  175. blkCnt = nColumns ;
  176. /* The pointer px is set to starting address of the column being processed */
  177. px = pOut + i;
  178. while (blkCnt > 0U)
  179. {
  180. /* Read and store the input element in the destination */
  181. *px = *pIn++;
  182. /* Update the pointer px to point to the next row of the transposed matrix */
  183. px += nRows;
  184. /* Decrement the column loop counter */
  185. blkCnt--;
  186. }
  187. i++;
  188. rowCnt -- ;
  189. }
  190. /* Set status as ARM_MATH_SUCCESS */
  191. status = ARM_MATH_SUCCESS;
  192. }
  193. /* Return to application */
  194. return (status);
  195. }
  196. #else
  197. arm_status arm_mat_trans_f32(
  198. const arm_matrix_instance_f32 * pSrc,
  199. arm_matrix_instance_f32 * pDst)
  200. {
  201. float32_t *pIn = pSrc->pData; /* input data matrix pointer */
  202. float32_t *pOut = pDst->pData; /* output data matrix pointer */
  203. float32_t *px; /* Temporary output data matrix pointer */
  204. uint16_t nRows = pSrc->numRows; /* number of rows */
  205. uint16_t nCols = pSrc->numCols; /* number of columns */
  206. uint32_t col, row = nRows, i = 0U; /* Loop counters */
  207. arm_status status; /* status of matrix transpose */
  208. #ifdef ARM_MATH_MATRIX_CHECK
  209. /* Check for matrix mismatch condition */
  210. if ((pSrc->numRows != pDst->numCols) ||
  211. (pSrc->numCols != pDst->numRows) )
  212. {
  213. /* Set status as ARM_MATH_SIZE_MISMATCH */
  214. status = ARM_MATH_SIZE_MISMATCH;
  215. }
  216. else
  217. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  218. {
  219. /* Matrix transpose by exchanging the rows with columns */
  220. /* row loop */
  221. do
  222. {
  223. /* Pointer px is set to starting address of column being processed */
  224. px = pOut + i;
  225. #if defined (ARM_MATH_LOOPUNROLL)
  226. /* Loop unrolling: Compute 4 outputs at a time */
  227. col = nCols >> 2U;
  228. while (col > 0U) /* column loop */
  229. {
  230. /* Read and store input element in destination */
  231. *px = *pIn++;
  232. /* Update pointer px to point to next row of transposed matrix */
  233. px += nRows;
  234. *px = *pIn++;
  235. px += nRows;
  236. *px = *pIn++;
  237. px += nRows;
  238. *px = *pIn++;
  239. px += nRows;
  240. /* Decrement column loop counter */
  241. col--;
  242. }
  243. /* Loop unrolling: Compute remaining outputs */
  244. col = nCols % 0x4U;
  245. #else
  246. /* Initialize col with number of samples */
  247. col = nCols;
  248. #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  249. while (col > 0U)
  250. {
  251. /* Read and store input element in destination */
  252. *px = *pIn++;
  253. /* Update pointer px to point to next row of transposed matrix */
  254. px += nRows;
  255. /* Decrement column loop counter */
  256. col--;
  257. }
  258. i++;
  259. /* Decrement row loop counter */
  260. row--;
  261. } while (row > 0U); /* row loop end */
  262. /* Set status as ARM_MATH_SUCCESS */
  263. status = ARM_MATH_SUCCESS;
  264. }
  265. /* Return to application */
  266. return (status);
  267. }
  268. #endif /* #if defined(ARM_MATH_NEON) */
  269. #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
  270. /**
  271. * @} end of MatrixTrans group
  272. */