arm_barycenter_f32.c 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_barycenter_f32.c
  4. * Description: Barycenter
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/support_functions.h"
  29. #include <limits.h>
  30. #include <math.h>
  31. /**
  32. @ingroup barycenter
  33. */
  34. /**
  35. * @brief Barycenter
  36. *
  37. *
  38. * @param[in] *in List of vectors
  39. * @param[in] *weights Weights of the vectors
  40. * @param[out] *out Barycenter
  41. * @param[in] nbVectors Number of vectors
  42. * @param[in] vecDim Dimension of space (vector dimension)
  43. * @return None
  44. *
  45. */
  46. #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
  47. void arm_barycenter_f32(const float32_t *in,
  48. const float32_t *weights,
  49. float32_t *out,
  50. uint32_t nbVectors,
  51. uint32_t vecDim)
  52. {
  53. const float32_t *pIn, *pW;
  54. const float32_t *pIn1, *pIn2, *pIn3, *pIn4;
  55. float32_t *pOut;
  56. uint32_t blkCntVector, blkCntSample;
  57. float32_t accum, w;
  58. blkCntVector = nbVectors;
  59. blkCntSample = vecDim;
  60. accum = 0.0f;
  61. pW = weights;
  62. pIn = in;
  63. arm_fill_f32(0.0f, out, vecDim);
  64. /* Sum */
  65. pIn1 = pIn;
  66. pIn2 = pIn1 + vecDim;
  67. pIn3 = pIn2 + vecDim;
  68. pIn4 = pIn3 + vecDim;
  69. blkCntVector = nbVectors >> 2;
  70. while (blkCntVector > 0)
  71. {
  72. f32x4_t outV, inV1, inV2, inV3, inV4;
  73. float32_t w1, w2, w3, w4;
  74. pOut = out;
  75. w1 = *pW++;
  76. w2 = *pW++;
  77. w3 = *pW++;
  78. w4 = *pW++;
  79. accum += w1 + w2 + w3 + w4;
  80. blkCntSample = vecDim >> 2;
  81. while (blkCntSample > 0) {
  82. outV = vld1q((const float32_t *) pOut);
  83. inV1 = vld1q(pIn1);
  84. inV2 = vld1q(pIn2);
  85. inV3 = vld1q(pIn3);
  86. inV4 = vld1q(pIn4);
  87. outV = vfmaq(outV, inV1, w1);
  88. outV = vfmaq(outV, inV2, w2);
  89. outV = vfmaq(outV, inV3, w3);
  90. outV = vfmaq(outV, inV4, w4);
  91. vst1q(pOut, outV);
  92. pOut += 4;
  93. pIn1 += 4;
  94. pIn2 += 4;
  95. pIn3 += 4;
  96. pIn4 += 4;
  97. blkCntSample--;
  98. }
  99. blkCntSample = vecDim & 3;
  100. while (blkCntSample > 0) {
  101. *pOut = *pOut + *pIn1++ * w1;
  102. *pOut = *pOut + *pIn2++ * w2;
  103. *pOut = *pOut + *pIn3++ * w3;
  104. *pOut = *pOut + *pIn4++ * w4;
  105. pOut++;
  106. blkCntSample--;
  107. }
  108. pIn1 += 3 * vecDim;
  109. pIn2 += 3 * vecDim;
  110. pIn3 += 3 * vecDim;
  111. pIn4 += 3 * vecDim;
  112. blkCntVector--;
  113. }
  114. pIn = pIn1;
  115. blkCntVector = nbVectors & 3;
  116. while (blkCntVector > 0)
  117. {
  118. f32x4_t inV, outV;
  119. pOut = out;
  120. w = *pW++;
  121. accum += w;
  122. blkCntSample = vecDim >> 2;
  123. while (blkCntSample > 0)
  124. {
  125. outV = vld1q_f32(pOut);
  126. inV = vld1q_f32(pIn);
  127. outV = vfmaq(outV, inV, w);
  128. vst1q_f32(pOut, outV);
  129. pOut += 4;
  130. pIn += 4;
  131. blkCntSample--;
  132. }
  133. blkCntSample = vecDim & 3;
  134. while (blkCntSample > 0)
  135. {
  136. *pOut = *pOut + *pIn++ * w;
  137. pOut++;
  138. blkCntSample--;
  139. }
  140. blkCntVector--;
  141. }
  142. /* Normalize */
  143. pOut = out;
  144. accum = 1.0f / accum;
  145. blkCntSample = vecDim >> 2;
  146. while (blkCntSample > 0)
  147. {
  148. f32x4_t tmp;
  149. tmp = vld1q((const float32_t *) pOut);
  150. tmp = vmulq(tmp, accum);
  151. vst1q(pOut, tmp);
  152. pOut += 4;
  153. blkCntSample--;
  154. }
  155. blkCntSample = vecDim & 3;
  156. while (blkCntSample > 0)
  157. {
  158. *pOut = *pOut * accum;
  159. pOut++;
  160. blkCntSample--;
  161. }
  162. }
  163. #else
  164. #if defined(ARM_MATH_NEON)
  165. #include "NEMath.h"
  166. void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
  167. {
  168. const float32_t *pIn,*pW, *pIn1, *pIn2, *pIn3, *pIn4;
  169. float32_t *pOut;
  170. uint32_t blkCntVector,blkCntSample;
  171. float32_t accum, w,w1,w2,w3,w4;
  172. float32x4_t tmp, inV,outV, inV1, inV2, inV3, inV4;
  173. blkCntVector = nbVectors;
  174. blkCntSample = vecDim;
  175. accum = 0.0f;
  176. pW = weights;
  177. pIn = in;
  178. /* Set counters to 0 */
  179. tmp = vdupq_n_f32(0.0f);
  180. pOut = out;
  181. blkCntSample = vecDim >> 2;
  182. while(blkCntSample > 0)
  183. {
  184. vst1q_f32(pOut, tmp);
  185. pOut += 4;
  186. blkCntSample--;
  187. }
  188. blkCntSample = vecDim & 3;
  189. while(blkCntSample > 0)
  190. {
  191. *pOut = 0.0f;
  192. pOut++;
  193. blkCntSample--;
  194. }
  195. /* Sum */
  196. pIn1 = pIn;
  197. pIn2 = pIn1 + vecDim;
  198. pIn3 = pIn2 + vecDim;
  199. pIn4 = pIn3 + vecDim;
  200. blkCntVector = nbVectors >> 2;
  201. while(blkCntVector > 0)
  202. {
  203. pOut = out;
  204. w1 = *pW++;
  205. w2 = *pW++;
  206. w3 = *pW++;
  207. w4 = *pW++;
  208. accum += w1 + w2 + w3 + w4;
  209. blkCntSample = vecDim >> 2;
  210. while(blkCntSample > 0)
  211. {
  212. outV = vld1q_f32(pOut);
  213. inV1 = vld1q_f32(pIn1);
  214. inV2 = vld1q_f32(pIn2);
  215. inV3 = vld1q_f32(pIn3);
  216. inV4 = vld1q_f32(pIn4);
  217. outV = vmlaq_n_f32(outV,inV1,w1);
  218. outV = vmlaq_n_f32(outV,inV2,w2);
  219. outV = vmlaq_n_f32(outV,inV3,w3);
  220. outV = vmlaq_n_f32(outV,inV4,w4);
  221. vst1q_f32(pOut, outV);
  222. pOut += 4;
  223. pIn1 += 4;
  224. pIn2 += 4;
  225. pIn3 += 4;
  226. pIn4 += 4;
  227. blkCntSample--;
  228. }
  229. blkCntSample = vecDim & 3;
  230. while(blkCntSample > 0)
  231. {
  232. *pOut = *pOut + *pIn1++ * w1;
  233. *pOut = *pOut + *pIn2++ * w2;
  234. *pOut = *pOut + *pIn3++ * w3;
  235. *pOut = *pOut + *pIn4++ * w4;
  236. pOut++;
  237. blkCntSample--;
  238. }
  239. pIn1 += 3*vecDim;
  240. pIn2 += 3*vecDim;
  241. pIn3 += 3*vecDim;
  242. pIn4 += 3*vecDim;
  243. blkCntVector--;
  244. }
  245. pIn = pIn1;
  246. blkCntVector = nbVectors & 3;
  247. while(blkCntVector > 0)
  248. {
  249. pOut = out;
  250. w = *pW++;
  251. accum += w;
  252. blkCntSample = vecDim >> 2;
  253. while(blkCntSample > 0)
  254. {
  255. outV = vld1q_f32(pOut);
  256. inV = vld1q_f32(pIn);
  257. outV = vmlaq_n_f32(outV,inV,w);
  258. vst1q_f32(pOut, outV);
  259. pOut += 4;
  260. pIn += 4;
  261. blkCntSample--;
  262. }
  263. blkCntSample = vecDim & 3;
  264. while(blkCntSample > 0)
  265. {
  266. *pOut = *pOut + *pIn++ * w;
  267. pOut++;
  268. blkCntSample--;
  269. }
  270. blkCntVector--;
  271. }
  272. /* Normalize */
  273. pOut = out;
  274. accum = 1.0f / accum;
  275. blkCntSample = vecDim >> 2;
  276. while(blkCntSample > 0)
  277. {
  278. tmp = vld1q_f32(pOut);
  279. tmp = vmulq_n_f32(tmp,accum);
  280. vst1q_f32(pOut, tmp);
  281. pOut += 4;
  282. blkCntSample--;
  283. }
  284. blkCntSample = vecDim & 3;
  285. while(blkCntSample > 0)
  286. {
  287. *pOut = *pOut * accum;
  288. pOut++;
  289. blkCntSample--;
  290. }
  291. }
  292. #else
  293. void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
  294. {
  295. const float32_t *pIn,*pW;
  296. float32_t *pOut;
  297. uint32_t blkCntVector,blkCntSample;
  298. float32_t accum, w;
  299. blkCntVector = nbVectors;
  300. blkCntSample = vecDim;
  301. accum = 0.0f;
  302. pW = weights;
  303. pIn = in;
  304. /* Set counters to 0 */
  305. blkCntSample = vecDim;
  306. pOut = out;
  307. while(blkCntSample > 0)
  308. {
  309. *pOut = 0.0f;
  310. pOut++;
  311. blkCntSample--;
  312. }
  313. /* Sum */
  314. while(blkCntVector > 0)
  315. {
  316. pOut = out;
  317. w = *pW++;
  318. accum += w;
  319. blkCntSample = vecDim;
  320. while(blkCntSample > 0)
  321. {
  322. *pOut = *pOut + *pIn++ * w;
  323. pOut++;
  324. blkCntSample--;
  325. }
  326. blkCntVector--;
  327. }
  328. /* Normalize */
  329. blkCntSample = vecDim;
  330. pOut = out;
  331. while(blkCntSample > 0)
  332. {
  333. *pOut = *pOut / accum;
  334. pOut++;
  335. blkCntSample--;
  336. }
  337. }
  338. #endif
  339. #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
  340. /**
  341. * @} end of barycenter group
  342. */