arm_svm_sigmoid_predict_f32.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_svm_sigmoid_predict_f32.c
  4. * Description: SVM Sigmoid Classifier
  5. *
  6. *
  7. * Target Processor: Cortex-M and Cortex-A cores
  8. * -------------------------------------------------------------------- */
  9. /*
  10. * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  11. *
  12. * SPDX-License-Identifier: Apache-2.0
  13. *
  14. * Licensed under the Apache License, Version 2.0 (the License); you may
  15. * not use this file except in compliance with the License.
  16. * You may obtain a copy of the License at
  17. *
  18. * www.apache.org/licenses/LICENSE-2.0
  19. *
  20. * Unless required by applicable law or agreed to in writing, software
  21. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  22. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  23. * See the License for the specific language governing permissions and
  24. * limitations under the License.
  25. */
  26. #include "arm_math.h"
  27. #include <limits.h>
  28. #include <math.h>
  29. /**
  30. * @addtogroup groupSVM
  31. * @{
  32. */
  33. /**
  34. * @brief SVM sigmoid prediction
  35. * @param[in] S Pointer to an instance of the rbf SVM structure.
  36. * @param[in] in Pointer to input vector
  37. * @param[out] pResult Decision value
  38. * @return none.
  39. *
  40. */
  41. #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
  42. #include "arm_helium_utils.h"
  43. #include "arm_vec_math.h"
  44. void arm_svm_sigmoid_predict_f32(
  45. const arm_svm_sigmoid_instance_f32 *S,
  46. const float32_t * in,
  47. int32_t * pResult)
  48. {
  49. /* inlined Matrix x Vector function interleaved with dot prod */
  50. uint32_t numRows = S->nbOfSupportVectors;
  51. uint32_t numCols = S->vectorDimension;
  52. const float32_t *pSupport = S->supportVectors;
  53. const float32_t *pSrcA = pSupport;
  54. const float32_t *pInA0;
  55. const float32_t *pInA1;
  56. uint32_t row;
  57. uint32_t blkCnt; /* loop counters */
  58. const float32_t *pDualCoef = S->dualCoefficients;
  59. float32_t sum = S->intercept;
  60. f32x4_t vSum = vdupq_n_f32(0.0f);
  61. row = numRows;
  62. /*
  63. * compute 4 rows in parrallel
  64. */
  65. while (row >= 4) {
  66. const float32_t *pInA2, *pInA3;
  67. float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
  68. f32x4_t vecIn, acc0, acc1, acc2, acc3;
  69. float32_t const *pSrcVecPtr = in;
  70. /*
  71. * Initialize the pointers to 4 consecutive MatrixA rows
  72. */
  73. pInA0 = pSrcA;
  74. pInA1 = pInA0 + numCols;
  75. pInA2 = pInA1 + numCols;
  76. pInA3 = pInA2 + numCols;
  77. /*
  78. * Initialize the vector pointer
  79. */
  80. pInVec = pSrcVecPtr;
  81. /*
  82. * reset accumulators
  83. */
  84. acc0 = vdupq_n_f32(0.0f);
  85. acc1 = vdupq_n_f32(0.0f);
  86. acc2 = vdupq_n_f32(0.0f);
  87. acc3 = vdupq_n_f32(0.0f);
  88. pSrcA0Vec = pInA0;
  89. pSrcA1Vec = pInA1;
  90. pSrcA2Vec = pInA2;
  91. pSrcA3Vec = pInA3;
  92. blkCnt = numCols >> 2;
  93. while (blkCnt > 0U) {
  94. f32x4_t vecA;
  95. vecIn = vld1q(pInVec);
  96. pInVec += 4;
  97. vecA = vld1q(pSrcA0Vec);
  98. pSrcA0Vec += 4;
  99. acc0 = vfmaq(acc0, vecIn, vecA);
  100. vecA = vld1q(pSrcA1Vec);
  101. pSrcA1Vec += 4;
  102. acc1 = vfmaq(acc1, vecIn, vecA);
  103. vecA = vld1q(pSrcA2Vec);
  104. pSrcA2Vec += 4;
  105. acc2 = vfmaq(acc2, vecIn, vecA);
  106. vecA = vld1q(pSrcA3Vec);
  107. pSrcA3Vec += 4;
  108. acc3 = vfmaq(acc3, vecIn, vecA);
  109. blkCnt--;
  110. }
  111. /*
  112. * tail
  113. * (will be merged thru tail predication)
  114. */
  115. blkCnt = numCols & 3;
  116. if (blkCnt > 0U) {
  117. mve_pred16_t p0 = vctp32q(blkCnt);
  118. f32x4_t vecA;
  119. vecIn = vldrwq_z_f32(pInVec, p0);
  120. vecA = vldrwq_z_f32(pSrcA0Vec, p0);
  121. acc0 = vfmaq(acc0, vecIn, vecA);
  122. vecA = vldrwq_z_f32(pSrcA1Vec, p0);
  123. acc1 = vfmaq(acc1, vecIn, vecA);
  124. vecA = vldrwq_z_f32(pSrcA2Vec, p0);
  125. acc2 = vfmaq(acc2, vecIn, vecA);
  126. vecA = vldrwq_z_f32(pSrcA3Vec, p0);
  127. acc3 = vfmaq(acc3, vecIn, vecA);
  128. }
  129. /*
  130. * Sum the partial parts
  131. */
  132. f32x4_t vtmp = vuninitializedq_f32();
  133. vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
  134. vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
  135. vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
  136. vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
  137. vSum =
  138. vfmaq_f32(vSum, vld1q(pDualCoef),
  139. vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)));
  140. pDualCoef += 4;
  141. pSrcA += numCols * 4;
  142. /*
  143. * Decrement the row loop counter
  144. */
  145. row -= 4;
  146. }
  147. /*
  148. * compute 2 rows in parrallel
  149. */
  150. if (row >= 2) {
  151. float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
  152. f32x4_t vecIn, acc0, acc1;
  153. float32_t const *pSrcVecPtr = in;
  154. /*
  155. * Initialize the pointers to 2 consecutive MatrixA rows
  156. */
  157. pInA0 = pSrcA;
  158. pInA1 = pInA0 + numCols;
  159. /*
  160. * Initialize the vector pointer
  161. */
  162. pInVec = pSrcVecPtr;
  163. /*
  164. * reset accumulators
  165. */
  166. acc0 = vdupq_n_f32(0.0f);
  167. acc1 = vdupq_n_f32(0.0f);
  168. pSrcA0Vec = pInA0;
  169. pSrcA1Vec = pInA1;
  170. blkCnt = numCols >> 2;
  171. while (blkCnt > 0U) {
  172. f32x4_t vecA;
  173. vecIn = vld1q(pInVec);
  174. pInVec += 4;
  175. vecA = vld1q(pSrcA0Vec);
  176. pSrcA0Vec += 4;
  177. acc0 = vfmaq(acc0, vecIn, vecA);
  178. vecA = vld1q(pSrcA1Vec);
  179. pSrcA1Vec += 4;
  180. acc1 = vfmaq(acc1, vecIn, vecA);
  181. blkCnt--;
  182. }
  183. /*
  184. * tail
  185. * (will be merged thru tail predication)
  186. */
  187. blkCnt = numCols & 3;
  188. if (blkCnt > 0U) {
  189. mve_pred16_t p0 = vctp32q(blkCnt);
  190. f32x4_t vecA;
  191. vecIn = vldrwq_z_f32(pInVec, p0);
  192. vecA = vldrwq_z_f32(pSrcA0Vec, p0);
  193. acc0 = vfmaq(acc0, vecIn, vecA);
  194. vecA = vldrwq_z_f32(pSrcA1Vec, p0);
  195. acc1 = vfmaq(acc1, vecIn, vecA);
  196. }
  197. /*
  198. * Sum the partial parts
  199. */
  200. f32x4_t vtmp = vuninitializedq_f32();
  201. vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
  202. vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
  203. vSum =
  204. vfmaq_m_f32(vSum, vld1q(pDualCoef),
  205. vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
  206. vctp32q(2));
  207. pSrcA += numCols * 2;
  208. row -= 2;
  209. }
  210. if (row >= 1) {
  211. f32x4_t vecIn, acc0;
  212. float32_t const *pSrcA0Vec, *pInVec;
  213. float32_t const *pSrcVecPtr = in;
  214. /*
  215. * Initialize the pointers to last MatrixA row
  216. */
  217. pInA0 = pSrcA;
  218. /*
  219. * Initialize the vector pointer
  220. */
  221. pInVec = pSrcVecPtr;
  222. /*
  223. * reset accumulators
  224. */
  225. acc0 = vdupq_n_f32(0.0f);
  226. pSrcA0Vec = pInA0;
  227. blkCnt = numCols >> 2;
  228. while (blkCnt > 0U) {
  229. f32x4_t vecA;
  230. vecIn = vld1q(pInVec);
  231. pInVec += 4;
  232. vecA = vld1q(pSrcA0Vec);
  233. pSrcA0Vec += 4;
  234. acc0 = vfmaq(acc0, vecIn, vecA);
  235. blkCnt--;
  236. }
  237. /*
  238. * tail
  239. * (will be merged thru tail predication)
  240. */
  241. blkCnt = numCols & 3;
  242. if (blkCnt > 0U) {
  243. mve_pred16_t p0 = vctp32q(blkCnt);
  244. f32x4_t vecA;
  245. vecIn = vldrwq_z_f32(pInVec, p0);
  246. vecA = vldrwq_z_f32(pSrcA0Vec, p0);
  247. acc0 = vfmaq(acc0, vecIn, vecA);
  248. }
  249. /*
  250. * Sum the partial parts
  251. */
  252. f32x4_t vtmp = vuninitializedq_f32();
  253. vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
  254. vSum =
  255. vfmaq_m_f32(vSum, vld1q(pDualCoef),
  256. vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
  257. vctp32q(1));
  258. }
  259. sum += vecAddAcrossF32Mve(vSum);
  260. *pResult = S->classes[STEP(sum)];
  261. }
  262. #else
  263. #if defined(ARM_MATH_NEON)
  264. #include "NEMath.h"
  265. void arm_svm_sigmoid_predict_f32(
  266. const arm_svm_sigmoid_instance_f32 *S,
  267. const float32_t * in,
  268. int32_t * pResult)
  269. {
  270. float32_t sum = S->intercept;
  271. float32_t dot;
  272. float32x4_t dotV;
  273. float32x4_t accuma,accumb,accumc,accumd,accum;
  274. float32x2_t accum2;
  275. float32x4_t vec1;
  276. float32x4_t coef0 = vdupq_n_f32(S->coef0);
  277. float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
  278. uint32_t blkCnt;
  279. uint32_t vectorBlkCnt;
  280. const float32_t *pIn = in;
  281. const float32_t *pSupport = S->supportVectors;
  282. const float32_t *pSupporta = S->supportVectors;
  283. const float32_t *pSupportb;
  284. const float32_t *pSupportc;
  285. const float32_t *pSupportd;
  286. pSupportb = pSupporta + S->vectorDimension;
  287. pSupportc = pSupportb + S->vectorDimension;
  288. pSupportd = pSupportc + S->vectorDimension;
  289. const float32_t *pDualCoefs = S->dualCoefficients;
  290. vectorBlkCnt = S->nbOfSupportVectors >> 2;
  291. while (vectorBlkCnt > 0U)
  292. {
  293. accuma = vdupq_n_f32(0);
  294. accumb = vdupq_n_f32(0);
  295. accumc = vdupq_n_f32(0);
  296. accumd = vdupq_n_f32(0);
  297. pIn = in;
  298. blkCnt = S->vectorDimension >> 2;
  299. while (blkCnt > 0U)
  300. {
  301. vec1 = vld1q_f32(pIn);
  302. vec2a = vld1q_f32(pSupporta);
  303. vec2b = vld1q_f32(pSupportb);
  304. vec2c = vld1q_f32(pSupportc);
  305. vec2d = vld1q_f32(pSupportd);
  306. pIn += 4;
  307. pSupporta += 4;
  308. pSupportb += 4;
  309. pSupportc += 4;
  310. pSupportd += 4;
  311. accuma = vmlaq_f32(accuma, vec1,vec2a);
  312. accumb = vmlaq_f32(accumb, vec1,vec2b);
  313. accumc = vmlaq_f32(accumc, vec1,vec2c);
  314. accumd = vmlaq_f32(accumd, vec1,vec2d);
  315. blkCnt -- ;
  316. }
  317. accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
  318. dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
  319. accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
  320. dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
  321. accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
  322. dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
  323. accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
  324. dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
  325. blkCnt = S->vectorDimension & 3;
  326. while (blkCnt > 0U)
  327. {
  328. dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
  329. dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
  330. dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
  331. dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
  332. pIn++;
  333. blkCnt -- ;
  334. }
  335. vec1 = vld1q_f32(pDualCoefs);
  336. pDualCoefs += 4;
  337. // To vectorize later
  338. dotV = vmulq_n_f32(dotV, S->gamma);
  339. dotV = vaddq_f32(dotV, coef0);
  340. dotV = vtanhq_f32(dotV);
  341. accum = vmulq_f32(vec1,dotV);
  342. accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
  343. sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
  344. pSupporta += 3*S->vectorDimension;
  345. pSupportb += 3*S->vectorDimension;
  346. pSupportc += 3*S->vectorDimension;
  347. pSupportd += 3*S->vectorDimension;
  348. vectorBlkCnt -- ;
  349. }
  350. pSupport = pSupporta;
  351. vectorBlkCnt = S->nbOfSupportVectors & 3;
  352. while (vectorBlkCnt > 0U)
  353. {
  354. accum = vdupq_n_f32(0);
  355. dot = 0.0f;
  356. pIn = in;
  357. blkCnt = S->vectorDimension >> 2;
  358. while (blkCnt > 0U)
  359. {
  360. vec1 = vld1q_f32(pIn);
  361. vec2 = vld1q_f32(pSupport);
  362. pIn += 4;
  363. pSupport += 4;
  364. accum = vmlaq_f32(accum, vec1,vec2);
  365. blkCnt -- ;
  366. }
  367. accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
  368. dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
  369. blkCnt = S->vectorDimension & 3;
  370. while (blkCnt > 0U)
  371. {
  372. dot = dot + *pIn++ * *pSupport++;
  373. blkCnt -- ;
  374. }
  375. sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0);
  376. vectorBlkCnt -- ;
  377. }
  378. *pResult=S->classes[STEP(sum)];
  379. }
  380. #else
  381. void arm_svm_sigmoid_predict_f32(
  382. const arm_svm_sigmoid_instance_f32 *S,
  383. const float32_t * in,
  384. int32_t * pResult)
  385. {
  386. float32_t sum=S->intercept;
  387. float32_t dot=0;
  388. uint32_t i,j;
  389. const float32_t *pSupport = S->supportVectors;
  390. for(i=0; i < S->nbOfSupportVectors; i++)
  391. {
  392. dot=0;
  393. for(j=0; j < S->vectorDimension; j++)
  394. {
  395. dot = dot + in[j]* *pSupport++;
  396. }
  397. sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
  398. }
  399. *pResult=S->classes[STEP(sum)];
  400. }
  401. #endif
  402. #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
  403. /**
  404. * @} end of groupSVM group
  405. */