arm_mat_add_f32.c 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_mat_add_f32.c
  4. * Description: Floating-point matrix addition
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/matrix_functions.h"
  29. /**
  30. @ingroup groupMatrix
  31. */
  32. /**
  33. @defgroup MatrixAdd Matrix Addition
  34. Adds two matrices.
  35. @par Addition of two 3 x 3 matrices
  36. \f[
  37. \begin{pmatrix}
  38. a_{1,1} & a_{1,2} & a_{1,3} \\
  39. a_{2,1} & a_{2,2} & a_{2,3} \\
  40. a_{3,1} & a_{3,2} & a_{3,3} \\
  41. \end{pmatrix}
  42. +
  43. \begin{pmatrix}
  44. b_{1,1} & b_{1,2} & b_{1,3} \\
  45. b_{2,1} & b_{2,2} & b_{2,3} \\
  46. b_{3,1} & b_{3,2} & b_{3,3} \\
  47. \end{pmatrix}
  48. =
  49. \begin{pmatrix}
  50. a_{1,1}+b_{1,1} & a_{1,2}+b_{1,2} & a_{1,3}+b_{1,3} \\
  51. a_{2,1}+b_{2,1} & a_{2,2}+b_{2,2} & a_{2,3}+b_{2,3} \\
  52. a_{3,1}+b_{3,1} & a_{3,2}+b_{3,2} & a_{3,3}+b_{3,3} \\
  53. \end{pmatrix}
  54. \f]
When <code>ARM_MATH_MATRIX_CHECK</code> is defined, the functions check that
<code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
number of rows and columns; otherwise no size check is performed.
  58. */
  59. /**
  60. @addtogroup MatrixAdd
  61. @{
  62. */
  63. /**
  64. @brief Floating-point matrix addition.
  65. @param[in] pSrcA points to first input matrix structure
  66. @param[in] pSrcB points to second input matrix structure
  67. @param[out] pDst points to output matrix structure
  68. @return execution status
  69. - \ref ARM_MATH_SUCCESS : Operation successful
  70. - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
  71. */
  72. #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
  73. arm_status arm_mat_add_f32(
  74. const arm_matrix_instance_f32 * pSrcA,
  75. const arm_matrix_instance_f32 * pSrcB,
  76. arm_matrix_instance_f32 * pDst)
  77. {
  78. arm_status status;
  79. uint32_t numSamples; /* total number of elements in the matrix */
  80. float32_t *pDataA, *pDataB, *pDataDst;
  81. f32x4_t vecA, vecB, vecDst = { 0 };
  82. float32_t const *pSrcAVec;
  83. float32_t const *pSrcBVec;
  84. uint32_t blkCnt; /* loop counters */
  85. pDataA = pSrcA->pData;
  86. pDataB = pSrcB->pData;
  87. pDataDst = pDst->pData;
  88. pSrcAVec = (float32_t const *) pDataA;
  89. pSrcBVec = (float32_t const *) pDataB;
  90. #ifdef ARM_MATH_MATRIX_CHECK
  91. /* Check for matrix mismatch condition */
  92. if ((pSrcA->numRows != pSrcB->numRows) ||
  93. (pSrcA->numCols != pSrcB->numCols) ||
  94. (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
  95. {
  96. /* Set status as ARM_MATH_SIZE_MISMATCH */
  97. status = ARM_MATH_SIZE_MISMATCH;
  98. }
  99. else
  100. #endif
  101. {
  102. /*
  103. * Total number of samples in the input matrix
  104. */
  105. numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
  106. blkCnt = numSamples >> 2;
  107. while (blkCnt > 0U)
  108. {
  109. /* C(m,n) = A(m,n) + B(m,n) */
  110. /* Add and then store the results in the destination buffer. */
  111. vecA = vld1q(pSrcAVec);
  112. pSrcAVec += 4;
  113. vecB = vld1q(pSrcBVec);
  114. pSrcBVec += 4;
  115. vecDst = vaddq(vecA, vecB);
  116. vst1q(pDataDst, vecDst);
  117. pDataDst += 4;
  118. /*
  119. * Decrement the blockSize loop counter
  120. */
  121. blkCnt--;
  122. }
  123. /*
  124. * tail
  125. */
  126. blkCnt = numSamples & 3;
  127. if (blkCnt > 0U)
  128. {
  129. mve_pred16_t p0 = vctp32q(blkCnt);
  130. vecA = vld1q(pSrcAVec);
  131. vecB = vld1q(pSrcBVec);
  132. vecDst = vaddq_m(vecDst, vecA, vecB, p0);
  133. vstrwq_p(pDataDst, vecDst, p0);
  134. }
  135. /* set status as ARM_MATH_SUCCESS */
  136. status = ARM_MATH_SUCCESS;
  137. }
  138. return (status);
  139. }
  140. #else
  141. #if defined(ARM_MATH_NEON)
  142. /*
  143. Neon version is assuming the matrix is small enough.
  144. So no blocking is used for taking into account cache effects.
  145. For big matrix, there exist better libraries for Neon.
  146. */
  147. arm_status arm_mat_add_f32(
  148. const arm_matrix_instance_f32 * pSrcA,
  149. const arm_matrix_instance_f32 * pSrcB,
  150. arm_matrix_instance_f32 * pDst)
  151. {
  152. float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
  153. float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
  154. float32_t *pOut = pDst->pData; /* output data matrix pointer */
  155. uint32_t numSamples; /* total number of elements in the matrix */
  156. uint32_t blkCnt; /* loop counters */
  157. arm_status status; /* status of matrix addition */
  158. #ifdef ARM_MATH_MATRIX_CHECK
  159. /* Check for matrix mismatch condition */
  160. if ((pSrcA->numRows != pSrcB->numRows) ||
  161. (pSrcA->numCols != pSrcB->numCols) ||
  162. (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
  163. {
  164. /* Set status as ARM_MATH_SIZE_MISMATCH */
  165. status = ARM_MATH_SIZE_MISMATCH;
  166. }
  167. else
  168. #endif
  169. {
  170. float32x4_t vec1;
  171. float32x4_t vec2;
  172. float32x4_t res;
  173. /* Total number of samples in the input matrix */
  174. numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
  175. blkCnt = numSamples >> 2U;
  176. /* Compute 4 outputs at a time.
  177. ** a second loop below computes the remaining 1 to 3 samples. */
  178. while (blkCnt > 0U)
  179. {
  180. /* C(m,n) = A(m,n) + B(m,n) */
  181. /* Add and then store the results in the destination buffer. */
  182. vec1 = vld1q_f32(pIn1);
  183. vec2 = vld1q_f32(pIn2);
  184. res = vaddq_f32(vec1, vec2);
  185. vst1q_f32(pOut, res);
  186. /* update pointers to process next samples */
  187. pIn1 += 4U;
  188. pIn2 += 4U;
  189. pOut += 4U;
  190. /* Decrement the loop counter */
  191. blkCnt--;
  192. }
  193. /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
  194. ** No loop unrolling is used. */
  195. blkCnt = numSamples % 0x4U;
  196. while (blkCnt > 0U)
  197. {
  198. /* C(m,n) = A(m,n) + B(m,n) */
  199. /* Add and then store the results in the destination buffer. */
  200. *pOut++ = (*pIn1++) + (*pIn2++);
  201. /* Decrement the loop counter */
  202. blkCnt--;
  203. }
  204. /* set status as ARM_MATH_SUCCESS */
  205. status = ARM_MATH_SUCCESS;
  206. }
  207. /* Return to application */
  208. return (status);
  209. }
  210. #else
  211. arm_status arm_mat_add_f32(
  212. const arm_matrix_instance_f32 * pSrcA,
  213. const arm_matrix_instance_f32 * pSrcB,
  214. arm_matrix_instance_f32 * pDst)
  215. {
  216. float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
  217. float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
  218. float32_t *pOut = pDst->pData; /* output data matrix pointer */
  219. uint32_t numSamples; /* total number of elements in the matrix */
  220. uint32_t blkCnt; /* loop counters */
  221. arm_status status; /* status of matrix addition */
  222. #ifdef ARM_MATH_MATRIX_CHECK
  223. /* Check for matrix mismatch condition */
  224. if ((pSrcA->numRows != pSrcB->numRows) ||
  225. (pSrcA->numCols != pSrcB->numCols) ||
  226. (pSrcA->numRows != pDst->numRows) ||
  227. (pSrcA->numCols != pDst->numCols) )
  228. {
  229. /* Set status as ARM_MATH_SIZE_MISMATCH */
  230. status = ARM_MATH_SIZE_MISMATCH;
  231. }
  232. else
  233. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  234. {
  235. /* Total number of samples in input matrix */
  236. numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
  237. #if defined (ARM_MATH_LOOPUNROLL)
  238. /* Loop unrolling: Compute 4 outputs at a time */
  239. blkCnt = numSamples >> 2U;
  240. while (blkCnt > 0U)
  241. {
  242. /* C(m,n) = A(m,n) + B(m,n) */
  243. /* Add and store result in destination buffer. */
  244. *pOut++ = *pInA++ + *pInB++;
  245. *pOut++ = *pInA++ + *pInB++;
  246. *pOut++ = *pInA++ + *pInB++;
  247. *pOut++ = *pInA++ + *pInB++;
  248. /* Decrement loop counter */
  249. blkCnt--;
  250. }
  251. /* Loop unrolling: Compute remaining outputs */
  252. blkCnt = numSamples % 0x4U;
  253. #else
  254. /* Initialize blkCnt with number of samples */
  255. blkCnt = numSamples;
  256. #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  257. while (blkCnt > 0U)
  258. {
  259. /* C(m,n) = A(m,n) + B(m,n) */
  260. /* Add and store result in destination buffer. */
  261. *pOut++ = *pInA++ + *pInB++;
  262. /* Decrement loop counter */
  263. blkCnt--;
  264. }
  265. /* Set status as ARM_MATH_SUCCESS */
  266. status = ARM_MATH_SUCCESS;
  267. }
  268. /* Return to application */
  269. return (status);
  270. }
  271. #endif /* #if defined(ARM_MATH_NEON) */
  272. #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
  273. /**
  274. @} end of MatrixAdd group
  275. */