arm_svm_polynomial_predict_f16.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_svm_polynomial_predict_f16.c
  4. * Description: SVM Polynomial Classifier
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/svm_functions_f16.h"
  29. #if defined(ARM_FLOAT16_SUPPORTED)
  30. #include <limits.h>
  31. #include <math.h>
  32. #if !defined(ARM_MATH_MVE_FLOAT16) || defined(ARM_MATH_AUTOVECTORIZE)
  33. /*
  34. _Float16 is not supported in g++ so we avoid putting _Float16 definitions
  35. in the public headers.
  36. This function should at some point be moved into FastMath.
  37. */
  38. __STATIC_INLINE float16_t arm_exponent_f16(float16_t x, int32_t nb)
  39. {
  40. float16_t r = x;
  41. nb --;
  42. while(nb > 0)
  43. {
  44. r = (_Float16)r * (_Float16)x;
  45. nb--;
  46. }
  47. return(r);
  48. }
  49. #endif
  50. /**
  51. * @addtogroup polysvm
  52. * @{
  53. */
  54. #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
  55. #include "arm_helium_utils.h"
  56. #include "arm_vec_math_f16.h"
  57. /**
  58. * @brief SVM polynomial prediction
  59. * @param[in] S Pointer to an instance of the polynomial SVM structure.
  60. * @param[in] in Pointer to input vector
  61. * @param[out] pResult Decision value
  62. * @return none.
  63. *
  64. */
  65. void arm_svm_polynomial_predict_f16(
  66. const arm_svm_polynomial_instance_f16 *S,
  67. const float16_t * in,
  68. int32_t * pResult)
  69. {
  70. /* inlined Matrix x Vector function interleaved with dot prod */
  71. uint32_t numRows = S->nbOfSupportVectors;
  72. uint32_t numCols = S->vectorDimension;
  73. const float16_t *pSupport = S->supportVectors;
  74. const float16_t *pSrcA = pSupport;
  75. const float16_t *pInA0;
  76. const float16_t *pInA1;
  77. uint32_t row;
  78. uint32_t blkCnt; /* loop counters */
  79. const float16_t *pDualCoef = S->dualCoefficients;
  80. _Float16 sum = S->intercept;
  81. f16x8_t vSum = vdupq_n_f16(0.0f);
  82. row = numRows;
  83. /*
  84. * compute 4 rows in parrallel
  85. */
  86. while (row >= 4) {
  87. const float16_t *pInA2, *pInA3;
  88. float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
  89. f16x8_t vecIn, acc0, acc1, acc2, acc3;
  90. float16_t const *pSrcVecPtr = in;
  91. /*
  92. * Initialize the pointers to 4 consecutive MatrixA rows
  93. */
  94. pInA0 = pSrcA;
  95. pInA1 = pInA0 + numCols;
  96. pInA2 = pInA1 + numCols;
  97. pInA3 = pInA2 + numCols;
  98. /*
  99. * Initialize the vector pointer
  100. */
  101. pInVec = pSrcVecPtr;
  102. /*
  103. * reset accumulators
  104. */
  105. acc0 = vdupq_n_f16(0.0f);
  106. acc1 = vdupq_n_f16(0.0f);
  107. acc2 = vdupq_n_f16(0.0f);
  108. acc3 = vdupq_n_f16(0.0f);
  109. pSrcA0Vec = pInA0;
  110. pSrcA1Vec = pInA1;
  111. pSrcA2Vec = pInA2;
  112. pSrcA3Vec = pInA3;
  113. blkCnt = numCols >> 3;
  114. while (blkCnt > 0U) {
  115. f16x8_t vecA;
  116. vecIn = vld1q(pInVec);
  117. pInVec += 8;
  118. vecA = vld1q(pSrcA0Vec);
  119. pSrcA0Vec += 8;
  120. acc0 = vfmaq(acc0, vecIn, vecA);
  121. vecA = vld1q(pSrcA1Vec);
  122. pSrcA1Vec += 8;
  123. acc1 = vfmaq(acc1, vecIn, vecA);
  124. vecA = vld1q(pSrcA2Vec);
  125. pSrcA2Vec += 8;
  126. acc2 = vfmaq(acc2, vecIn, vecA);
  127. vecA = vld1q(pSrcA3Vec);
  128. pSrcA3Vec += 8;
  129. acc3 = vfmaq(acc3, vecIn, vecA);
  130. blkCnt--;
  131. }
  132. /*
  133. * tail
  134. * (will be merged thru tail predication)
  135. */
  136. blkCnt = numCols & 7;
  137. if (blkCnt > 0U) {
  138. mve_pred16_t p0 = vctp16q(blkCnt);
  139. f16x8_t vecA;
  140. vecIn = vldrhq_z_f16(pInVec, p0);
  141. vecA = vldrhq_z_f16(pSrcA0Vec, p0);
  142. acc0 = vfmaq(acc0, vecIn, vecA);
  143. vecA = vldrhq_z_f16(pSrcA1Vec, p0);
  144. acc1 = vfmaq(acc1, vecIn, vecA);
  145. vecA = vldrhq_z_f16(pSrcA2Vec, p0);
  146. acc2 = vfmaq(acc2, vecIn, vecA);
  147. vecA = vldrhq_z_f16(pSrcA3Vec, p0);
  148. acc3 = vfmaq(acc3, vecIn, vecA);
  149. }
  150. /*
  151. * Sum the partial parts
  152. */
  153. f16x8_t vtmp = vuninitializedq_f16();
  154. vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
  155. vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
  156. vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
  157. vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
  158. vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
  159. arm_vec_exponent_f16
  160. (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0),
  161. S->degree),vctp16q(4));
  162. pDualCoef += 4;
  163. pSrcA += numCols * 4;
  164. /*
  165. * Decrement the row loop counter
  166. */
  167. row -= 4;
  168. }
  169. /*
  170. * compute 2 rows in parrallel
  171. */
  172. if (row >= 2) {
  173. float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
  174. f16x8_t vecIn, acc0, acc1;
  175. float16_t const *pSrcVecPtr = in;
  176. /*
  177. * Initialize the pointers to 2 consecutive MatrixA rows
  178. */
  179. pInA0 = pSrcA;
  180. pInA1 = pInA0 + numCols;
  181. /*
  182. * Initialize the vector pointer
  183. */
  184. pInVec = pSrcVecPtr;
  185. /*
  186. * reset accumulators
  187. */
  188. acc0 = vdupq_n_f16(0.0f);
  189. acc1 = vdupq_n_f16(0.0f);
  190. pSrcA0Vec = pInA0;
  191. pSrcA1Vec = pInA1;
  192. blkCnt = numCols >> 3;
  193. while (blkCnt > 0U) {
  194. f16x8_t vecA;
  195. vecIn = vld1q(pInVec);
  196. pInVec += 8;
  197. vecA = vld1q(pSrcA0Vec);
  198. pSrcA0Vec += 8;
  199. acc0 = vfmaq(acc0, vecIn, vecA);
  200. vecA = vld1q(pSrcA1Vec);
  201. pSrcA1Vec += 8;
  202. acc1 = vfmaq(acc1, vecIn, vecA);
  203. blkCnt--;
  204. }
  205. /*
  206. * tail
  207. * (will be merged thru tail predication)
  208. */
  209. blkCnt = numCols & 7;
  210. if (blkCnt > 0U) {
  211. mve_pred16_t p0 = vctp16q(blkCnt);
  212. f16x8_t vecA;
  213. vecIn = vldrhq_z_f16(pInVec, p0);
  214. vecA = vldrhq_z_f16(pSrcA0Vec, p0);
  215. acc0 = vfmaq(acc0, vecIn, vecA);
  216. vecA = vldrhq_z_f16(pSrcA1Vec, p0);
  217. acc1 = vfmaq(acc1, vecIn, vecA);
  218. }
  219. /*
  220. * Sum the partial parts
  221. */
  222. f16x8_t vtmp = vuninitializedq_f16();
  223. vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
  224. vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
  225. vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
  226. arm_vec_exponent_f16
  227. (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
  228. vctp16q(2));
  229. pDualCoef += 2;
  230. pSrcA += numCols * 2;
  231. row -= 2;
  232. }
  233. if (row >= 1) {
  234. f16x8_t vecIn, acc0;
  235. float16_t const *pSrcA0Vec, *pInVec;
  236. float16_t const *pSrcVecPtr = in;
  237. /*
  238. * Initialize the pointers to last MatrixA row
  239. */
  240. pInA0 = pSrcA;
  241. /*
  242. * Initialize the vector pointer
  243. */
  244. pInVec = pSrcVecPtr;
  245. /*
  246. * reset accumulators
  247. */
  248. acc0 = vdupq_n_f16(0.0f);
  249. pSrcA0Vec = pInA0;
  250. blkCnt = numCols >> 3;
  251. while (blkCnt > 0U) {
  252. f16x8_t vecA;
  253. vecIn = vld1q(pInVec);
  254. pInVec += 8;
  255. vecA = vld1q(pSrcA0Vec);
  256. pSrcA0Vec += 8;
  257. acc0 = vfmaq(acc0, vecIn, vecA);
  258. blkCnt--;
  259. }
  260. /*
  261. * tail
  262. * (will be merged thru tail predication)
  263. */
  264. blkCnt = numCols & 7;
  265. if (blkCnt > 0U) {
  266. mve_pred16_t p0 = vctp16q(blkCnt);
  267. f16x8_t vecA;
  268. vecIn = vldrhq_z_f16(pInVec, p0);
  269. vecA = vldrhq_z_f16(pSrcA0Vec, p0);
  270. acc0 = vfmaq(acc0, vecIn, vecA);
  271. }
  272. /*
  273. * Sum the partial parts
  274. */
  275. f16x8_t vtmp = vuninitializedq_f16();
  276. vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
  277. vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
  278. arm_vec_exponent_f16
  279. (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
  280. vctp16q(1));
  281. }
  282. sum += (_Float16)vecAddAcrossF16Mve(vSum);
  283. *pResult = S->classes[STEP(sum)];
  284. }
  285. #else
  286. /**
  287. * @brief SVM polynomial prediction
  288. * @param[in] S Pointer to an instance of the polynomial SVM structure.
  289. * @param[in] in Pointer to input vector
  290. * @param[out] pResult Decision value
  291. * @return none.
  292. *
  293. */
  294. void arm_svm_polynomial_predict_f16(
  295. const arm_svm_polynomial_instance_f16 *S,
  296. const float16_t * in,
  297. int32_t * pResult)
  298. {
  299. _Float16 sum=S->intercept;
  300. _Float16 dot=0;
  301. uint32_t i,j;
  302. const float16_t *pSupport = S->supportVectors;
  303. for(i=0; i < S->nbOfSupportVectors; i++)
  304. {
  305. dot=0;
  306. for(j=0; j < S->vectorDimension; j++)
  307. {
  308. dot = (_Float16)dot + (_Float16)in[j]* (_Float16)*pSupport++;
  309. }
  310. sum += (_Float16)S->dualCoefficients[i] * (_Float16)arm_exponent_f16((_Float16)S->gamma * (_Float16)dot + (_Float16)S->coef0, S->degree);
  311. }
  312. *pResult=S->classes[STEP(sum)];
  313. }
  314. #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
  315. /**
  316. * @} end of polysvm group
  317. */
  318. #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */