arm_mat_sub_f32.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_mat_sub_f32.c
  4. * Description: Floating-point matrix subtraction
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/matrix_functions.h"
  29. /**
  30. @ingroup groupMatrix
  31. */
  32. /**
  33. @defgroup MatrixSub Matrix Subtraction
  34. Subtract two matrices.
  35. \image html MatrixSubtraction.gif "Subtraction of two 3 x 3 matrices"
  36. The functions check to make sure that
  37. <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
  38. number of rows and columns.
  39. */
  40. /**
  41. @addtogroup MatrixSub
  42. @{
  43. */
  44. /**
  45. @brief Floating-point matrix subtraction.
  46. @param[in] pSrcA points to the first input matrix structure
  47. @param[in] pSrcB points to the second input matrix structure
  48. @param[out] pDst points to output matrix structure
  49. @return execution status
  50. - \ref ARM_MATH_SUCCESS : Operation successful
  51. - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
  52. */
  53. #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
  54. arm_status arm_mat_sub_f32(
  55. const arm_matrix_instance_f32 * pSrcA,
  56. const arm_matrix_instance_f32 * pSrcB,
  57. arm_matrix_instance_f32 * pDst)
  58. {
  59. arm_status status; /* status of matrix subtraction */
  60. uint32_t numSamples; /* total number of elements in the matrix */
  61. float32_t *pDataA, *pDataB, *pDataDst;
  62. f32x4_t vecA, vecB, vecDst;
  63. float32_t const *pSrcAVec;
  64. float32_t const *pSrcBVec;
  65. uint32_t blkCnt; /* loop counters */
  66. pDataA = pSrcA->pData;
  67. pDataB = pSrcB->pData;
  68. pDataDst = pDst->pData;
  69. pSrcAVec = (float32_t const *) pDataA;
  70. pSrcBVec = (float32_t const *) pDataB;
  71. #ifdef ARM_MATH_MATRIX_CHECK
  72. /* Check for matrix mismatch condition */
  73. if ((pSrcA->numRows != pSrcB->numRows) ||
  74. (pSrcA->numCols != pSrcB->numCols) ||
  75. (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
  76. {
  77. /* Set status as ARM_MATH_SIZE_MISMATCH */
  78. status = ARM_MATH_SIZE_MISMATCH;
  79. }
  80. else
  81. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  82. {
  83. /*
  84. * Total number of samples in the input matrix
  85. */
  86. numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
  87. blkCnt = numSamples >> 2;
  88. while (blkCnt > 0U)
  89. {
  90. /* C(m,n) = A(m,n) + B(m,n) */
  91. /* sub and then store the results in the destination buffer. */
  92. vecA = vld1q(pSrcAVec);
  93. pSrcAVec += 4;
  94. vecB = vld1q(pSrcBVec);
  95. pSrcBVec += 4;
  96. vecDst = vsubq(vecA, vecB);
  97. vst1q(pDataDst, vecDst);
  98. pDataDst += 4;
  99. /*
  100. * Decrement the blockSize loop counter
  101. */
  102. blkCnt--;
  103. }
  104. /*
  105. * tail
  106. * (will be merged thru tail predication)
  107. */
  108. blkCnt = numSamples & 3;
  109. if (blkCnt > 0U)
  110. {
  111. mve_pred16_t p0 = vctp32q(blkCnt);
  112. vecA = vld1q(pSrcAVec);
  113. vecB = vld1q(pSrcBVec);
  114. vecDst = vsubq_m(vecDst, vecA, vecB, p0);
  115. vstrwq_p(pDataDst, vecDst, p0);
  116. }
  117. status = ARM_MATH_SUCCESS;
  118. }
  119. /* Return to application */
  120. return (status);
  121. }
  122. #else
  123. #if defined(ARM_MATH_NEON)
  124. arm_status arm_mat_sub_f32(
  125. const arm_matrix_instance_f32 * pSrcA,
  126. const arm_matrix_instance_f32 * pSrcB,
  127. arm_matrix_instance_f32 * pDst)
  128. {
  129. float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
  130. float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
  131. float32_t *pOut = pDst->pData; /* output data matrix pointer */
  132. uint32_t numSamples; /* total number of elements in the matrix */
  133. uint32_t blkCnt; /* loop counters */
  134. arm_status status; /* status of matrix subtraction */
  135. #ifdef ARM_MATH_MATRIX_CHECK
  136. /* Check for matrix mismatch condition */
  137. if ((pSrcA->numRows != pSrcB->numRows) ||
  138. (pSrcA->numCols != pSrcB->numCols) ||
  139. (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
  140. {
  141. /* Set status as ARM_MATH_SIZE_MISMATCH */
  142. status = ARM_MATH_SIZE_MISMATCH;
  143. }
  144. else
  145. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  146. {
  147. float32x4_t vec1;
  148. float32x4_t vec2;
  149. float32x4_t res;
  150. /* Total number of samples in the input matrix */
  151. numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
  152. blkCnt = numSamples >> 2U;
  153. /* Compute 4 outputs at a time.
  154. ** a second loop below computes the remaining 1 to 3 samples. */
  155. while (blkCnt > 0U)
  156. {
  157. /* C(m,n) = A(m,n) - B(m,n) */
  158. /* Subtract and then store the results in the destination buffer. */
  159. /* Read values from source A */
  160. vec1 = vld1q_f32(pIn1);
  161. vec2 = vld1q_f32(pIn2);
  162. res = vsubq_f32(vec1, vec2);
  163. vst1q_f32(pOut, res);
  164. /* Update pointers to process next samples */
  165. pIn1 += 4U;
  166. pIn2 += 4U;
  167. pOut += 4U;
  168. /* Decrement the loop counter */
  169. blkCnt--;
  170. }
  171. /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
  172. ** No loop unrolling is used. */
  173. blkCnt = numSamples % 0x4U;
  174. while (blkCnt > 0U)
  175. {
  176. /* C(m,n) = A(m,n) - B(m,n) */
  177. /* Subtract and then store the results in the destination buffer. */
  178. *pOut++ = (*pIn1++) - (*pIn2++);
  179. /* Decrement the loop counter */
  180. blkCnt--;
  181. }
  182. /* Set status as ARM_MATH_SUCCESS */
  183. status = ARM_MATH_SUCCESS;
  184. }
  185. /* Return to application */
  186. return (status);
  187. }
  188. #else
  189. arm_status arm_mat_sub_f32(
  190. const arm_matrix_instance_f32 * pSrcA,
  191. const arm_matrix_instance_f32 * pSrcB,
  192. arm_matrix_instance_f32 * pDst)
  193. {
  194. float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
  195. float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
  196. float32_t *pOut = pDst->pData; /* output data matrix pointer */
  197. uint32_t numSamples; /* total number of elements in the matrix */
  198. uint32_t blkCnt; /* loop counters */
  199. arm_status status; /* status of matrix subtraction */
  200. #ifdef ARM_MATH_MATRIX_CHECK
  201. /* Check for matrix mismatch condition */
  202. if ((pSrcA->numRows != pSrcB->numRows) ||
  203. (pSrcA->numCols != pSrcB->numCols) ||
  204. (pSrcA->numRows != pDst->numRows) ||
  205. (pSrcA->numCols != pDst->numCols) )
  206. {
  207. /* Set status as ARM_MATH_SIZE_MISMATCH */
  208. status = ARM_MATH_SIZE_MISMATCH;
  209. }
  210. else
  211. #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  212. {
  213. /* Total number of samples in input matrix */
  214. numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
  215. #if defined (ARM_MATH_LOOPUNROLL)
  216. /* Loop unrolling: Compute 4 outputs at a time */
  217. blkCnt = numSamples >> 2U;
  218. while (blkCnt > 0U)
  219. {
  220. /* C(m,n) = A(m,n) - B(m,n) */
  221. /* Subtract and store result in destination buffer. */
  222. *pOut++ = (*pInA++) - (*pInB++);
  223. *pOut++ = (*pInA++) - (*pInB++);
  224. *pOut++ = (*pInA++) - (*pInB++);
  225. *pOut++ = (*pInA++) - (*pInB++);
  226. /* Decrement loop counter */
  227. blkCnt--;
  228. }
  229. /* Loop unrolling: Compute remaining outputs */
  230. blkCnt = numSamples % 0x4U;
  231. #else
  232. /* Initialize blkCnt with number of samples */
  233. blkCnt = numSamples;
  234. #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
  235. while (blkCnt > 0U)
  236. {
  237. /* C(m,n) = A(m,n) - B(m,n) */
  238. /* Subtract and store result in destination buffer. */
  239. *pOut++ = (*pInA++) - (*pInB++);
  240. /* Decrement loop counter */
  241. blkCnt--;
  242. }
  243. /* Set status as ARM_MATH_SUCCESS */
  244. status = ARM_MATH_SUCCESS;
  245. }
  246. /* Return to application */
  247. return (status);
  248. }
  249. #endif /* #if defined(ARM_MATH_NEON) */
  250. #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
  251. /**
  252. @} end of MatrixSub group
  253. */