arm_barycenter_f32.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_barycenter_f32.c
  4. * Description: Barycenter
  5. *
  6. *
  7. * Target Processor: Cortex-M and Cortex-A cores
  8. * -------------------------------------------------------------------- */
  9. /*
  10. * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  11. *
  12. * SPDX-License-Identifier: Apache-2.0
  13. *
  14. * Licensed under the Apache License, Version 2.0 (the License); you may
  15. * not use this file except in compliance with the License.
  16. * You may obtain a copy of the License at
  17. *
  18. * www.apache.org/licenses/LICENSE-2.0
  19. *
  20. * Unless required by applicable law or agreed to in writing, software
  21. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  22. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  23. * See the License for the specific language governing permissions and
  24. * limitations under the License.
  25. */
  26. #include "arm_math.h"
  27. #include <limits.h>
  28. #include <math.h>
  29. /**
  30. @ingroup groupSupport
  31. */
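/**
 * @brief Barycenter
 *
 * Computes the weighted mean of a list of vectors:
 * out = (sum over i of weights[i] * vector i) / (sum over i of weights[i]).
 * The accumulated result is normalised by the sum of the weights, so that
 * sum should be non-zero.
 *
 * @param[in]  *in        List of vectors, stored one after another
 *                        (nbVectors vectors of vecDim values each)
 * @param[in]  *weights   Weights of the vectors (one weight per vector)
 * @param[out] *out       Barycenter (vecDim values)
 * @param[in]  nbVectors  Number of vectors
 * @param[in]  vecDim     Dimension of space (vector dimension)
 * @return None
 *
 */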
  44. #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
  45. void arm_barycenter_f32(const float32_t *in,
  46. const float32_t *weights,
  47. float32_t *out,
  48. uint32_t nbVectors,
  49. uint32_t vecDim)
  50. {
  51. const float32_t *pIn, *pW;
  52. const float32_t *pIn1, *pIn2, *pIn3, *pIn4;
  53. float32_t *pOut;
  54. uint32_t blkCntVector, blkCntSample;
  55. float32_t accum, w;
  56. blkCntVector = nbVectors;
  57. blkCntSample = vecDim;
  58. accum = 0.0f;
  59. pW = weights;
  60. pIn = in;
  61. arm_fill_f32(0.0f, out, vecDim);
  62. /* Sum */
  63. pIn1 = pIn;
  64. pIn2 = pIn1 + vecDim;
  65. pIn3 = pIn2 + vecDim;
  66. pIn4 = pIn3 + vecDim;
  67. blkCntVector = nbVectors >> 2;
  68. while (blkCntVector > 0)
  69. {
  70. f32x4_t outV, inV1, inV2, inV3, inV4;
  71. float32_t w1, w2, w3, w4;
  72. pOut = out;
  73. w1 = *pW++;
  74. w2 = *pW++;
  75. w3 = *pW++;
  76. w4 = *pW++;
  77. accum += w1 + w2 + w3 + w4;
  78. blkCntSample = vecDim >> 2;
  79. while (blkCntSample > 0) {
  80. outV = vld1q((const float32_t *) pOut);
  81. inV1 = vld1q(pIn1);
  82. inV2 = vld1q(pIn2);
  83. inV3 = vld1q(pIn3);
  84. inV4 = vld1q(pIn4);
  85. outV = vfmaq(outV, inV1, w1);
  86. outV = vfmaq(outV, inV2, w2);
  87. outV = vfmaq(outV, inV3, w3);
  88. outV = vfmaq(outV, inV4, w4);
  89. vst1q(pOut, outV);
  90. pOut += 4;
  91. pIn1 += 4;
  92. pIn2 += 4;
  93. pIn3 += 4;
  94. pIn4 += 4;
  95. blkCntSample--;
  96. }
  97. blkCntSample = vecDim & 3;
  98. while (blkCntSample > 0) {
  99. *pOut = *pOut + *pIn1++ * w1;
  100. *pOut = *pOut + *pIn2++ * w2;
  101. *pOut = *pOut + *pIn3++ * w3;
  102. *pOut = *pOut + *pIn4++ * w4;
  103. pOut++;
  104. blkCntSample--;
  105. }
  106. pIn1 += 3 * vecDim;
  107. pIn2 += 3 * vecDim;
  108. pIn3 += 3 * vecDim;
  109. pIn4 += 3 * vecDim;
  110. blkCntVector--;
  111. }
  112. pIn = pIn1;
  113. blkCntVector = nbVectors & 3;
  114. while (blkCntVector > 0)
  115. {
  116. f32x4_t inV, outV;
  117. pOut = out;
  118. w = *pW++;
  119. accum += w;
  120. blkCntSample = vecDim >> 2;
  121. while (blkCntSample > 0)
  122. {
  123. outV = vld1q_f32(pOut);
  124. inV = vld1q_f32(pIn);
  125. outV = vfmaq(outV, inV, w);
  126. vst1q_f32(pOut, outV);
  127. pOut += 4;
  128. pIn += 4;
  129. blkCntSample--;
  130. }
  131. blkCntSample = vecDim & 3;
  132. while (blkCntSample > 0)
  133. {
  134. *pOut = *pOut + *pIn++ * w;
  135. pOut++;
  136. blkCntSample--;
  137. }
  138. blkCntVector--;
  139. }
  140. /* Normalize */
  141. pOut = out;
  142. accum = 1.0f / accum;
  143. blkCntSample = vecDim >> 2;
  144. while (blkCntSample > 0)
  145. {
  146. f32x4_t tmp;
  147. tmp = vld1q((const float32_t *) pOut);
  148. tmp = vmulq(tmp, accum);
  149. vst1q(pOut, tmp);
  150. pOut += 4;
  151. blkCntSample--;
  152. }
  153. blkCntSample = vecDim & 3;
  154. while (blkCntSample > 0)
  155. {
  156. *pOut = *pOut * accum;
  157. pOut++;
  158. blkCntSample--;
  159. }
  160. }
  161. #else
  162. #if defined(ARM_MATH_NEON)
  163. #include "NEMath.h"
  164. void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
  165. {
  166. const float32_t *pIn,*pW, *pIn1, *pIn2, *pIn3, *pIn4;
  167. float32_t *pOut;
  168. uint32_t blkCntVector,blkCntSample;
  169. float32_t accum, w,w1,w2,w3,w4;
  170. float32x4_t tmp, inV,outV, inV1, inV2, inV3, inV4;
  171. blkCntVector = nbVectors;
  172. blkCntSample = vecDim;
  173. accum = 0.0f;
  174. pW = weights;
  175. pIn = in;
  176. /* Set counters to 0 */
  177. tmp = vdupq_n_f32(0.0f);
  178. pOut = out;
  179. blkCntSample = vecDim >> 2;
  180. while(blkCntSample > 0)
  181. {
  182. vst1q_f32(pOut, tmp);
  183. pOut += 4;
  184. blkCntSample--;
  185. }
  186. blkCntSample = vecDim & 3;
  187. while(blkCntSample > 0)
  188. {
  189. *pOut = 0.0f;
  190. pOut++;
  191. blkCntSample--;
  192. }
  193. /* Sum */
  194. pIn1 = pIn;
  195. pIn2 = pIn1 + vecDim;
  196. pIn3 = pIn2 + vecDim;
  197. pIn4 = pIn3 + vecDim;
  198. blkCntVector = nbVectors >> 2;
  199. while(blkCntVector > 0)
  200. {
  201. pOut = out;
  202. w1 = *pW++;
  203. w2 = *pW++;
  204. w3 = *pW++;
  205. w4 = *pW++;
  206. accum += w1 + w2 + w3 + w4;
  207. blkCntSample = vecDim >> 2;
  208. while(blkCntSample > 0)
  209. {
  210. outV = vld1q_f32(pOut);
  211. inV1 = vld1q_f32(pIn1);
  212. inV2 = vld1q_f32(pIn2);
  213. inV3 = vld1q_f32(pIn3);
  214. inV4 = vld1q_f32(pIn4);
  215. outV = vmlaq_n_f32(outV,inV1,w1);
  216. outV = vmlaq_n_f32(outV,inV2,w2);
  217. outV = vmlaq_n_f32(outV,inV3,w3);
  218. outV = vmlaq_n_f32(outV,inV4,w4);
  219. vst1q_f32(pOut, outV);
  220. pOut += 4;
  221. pIn1 += 4;
  222. pIn2 += 4;
  223. pIn3 += 4;
  224. pIn4 += 4;
  225. blkCntSample--;
  226. }
  227. blkCntSample = vecDim & 3;
  228. while(blkCntSample > 0)
  229. {
  230. *pOut = *pOut + *pIn1++ * w1;
  231. *pOut = *pOut + *pIn2++ * w2;
  232. *pOut = *pOut + *pIn3++ * w3;
  233. *pOut = *pOut + *pIn4++ * w4;
  234. pOut++;
  235. blkCntSample--;
  236. }
  237. pIn1 += 3*vecDim;
  238. pIn2 += 3*vecDim;
  239. pIn3 += 3*vecDim;
  240. pIn4 += 3*vecDim;
  241. blkCntVector--;
  242. }
  243. pIn = pIn1;
  244. blkCntVector = nbVectors & 3;
  245. while(blkCntVector > 0)
  246. {
  247. pOut = out;
  248. w = *pW++;
  249. accum += w;
  250. blkCntSample = vecDim >> 2;
  251. while(blkCntSample > 0)
  252. {
  253. outV = vld1q_f32(pOut);
  254. inV = vld1q_f32(pIn);
  255. outV = vmlaq_n_f32(outV,inV,w);
  256. vst1q_f32(pOut, outV);
  257. pOut += 4;
  258. pIn += 4;
  259. blkCntSample--;
  260. }
  261. blkCntSample = vecDim & 3;
  262. while(blkCntSample > 0)
  263. {
  264. *pOut = *pOut + *pIn++ * w;
  265. pOut++;
  266. blkCntSample--;
  267. }
  268. blkCntVector--;
  269. }
  270. /* Normalize */
  271. pOut = out;
  272. accum = 1.0f / accum;
  273. blkCntSample = vecDim >> 2;
  274. while(blkCntSample > 0)
  275. {
  276. tmp = vld1q_f32(pOut);
  277. tmp = vmulq_n_f32(tmp,accum);
  278. vst1q_f32(pOut, tmp);
  279. pOut += 4;
  280. blkCntSample--;
  281. }
  282. blkCntSample = vecDim & 3;
  283. while(blkCntSample > 0)
  284. {
  285. *pOut = *pOut * accum;
  286. pOut++;
  287. blkCntSample--;
  288. }
  289. }
  290. #else
  291. void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
  292. {
  293. const float32_t *pIn,*pW;
  294. float32_t *pOut;
  295. uint32_t blkCntVector,blkCntSample;
  296. float32_t accum, w;
  297. blkCntVector = nbVectors;
  298. blkCntSample = vecDim;
  299. accum = 0.0f;
  300. pW = weights;
  301. pIn = in;
  302. /* Set counters to 0 */
  303. blkCntSample = vecDim;
  304. pOut = out;
  305. while(blkCntSample > 0)
  306. {
  307. *pOut = 0.0f;
  308. pOut++;
  309. blkCntSample--;
  310. }
  311. /* Sum */
  312. while(blkCntVector > 0)
  313. {
  314. pOut = out;
  315. w = *pW++;
  316. accum += w;
  317. blkCntSample = vecDim;
  318. while(blkCntSample > 0)
  319. {
  320. *pOut = *pOut + *pIn++ * w;
  321. pOut++;
  322. blkCntSample--;
  323. }
  324. blkCntVector--;
  325. }
  326. /* Normalize */
  327. blkCntSample = vecDim;
  328. pOut = out;
  329. while(blkCntSample > 0)
  330. {
  331. *pOut = *pOut / accum;
  332. pOut++;
  333. blkCntSample--;
  334. }
  335. }
  336. #endif
  337. #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
  338. /**
  339. * @} end of groupSupport group
  340. */