/******************************************************************************
 * @file     arm_vec_math.h
 * @brief    Public header file for CMSIS DSP Library
 * @version  V1.10.0
 * @date     08 July 2021
 * Target Processor: Cortex-M and Cortex-A cores
 ******************************************************************************/
/*
 * Copyright (c) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef _ARM_VEC_MATH_H
#define _ARM_VEC_MATH_H

#include "arm_math_types.h"
#include "arm_common_tables.h"
#include "arm_helium_utils.h"

#ifdef __cplusplus
extern "C"
{
#endif

#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)

#define INV_NEWTON_INIT_F32  0x7EF127EA

static const float32_t __logf_rng_f32 = 0.693147180f;
/* fast inverse approximation (3x Newton) */
__STATIC_INLINE f32x4_t vrecip_medprec_f32(
    f32x4_t x)
{
    q31x4_t   m;
    f32x4_t   b;
    any32x4_t xinv;
    f32x4_t   ax = vabsq(x);

    xinv.f = ax;
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}
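/*
 * Explanatory note on the reciprocal helpers above and below: adding
 * m = 0x3F800000 - (bits & 0x7F800000) once normalises |x| to its mantissa
 * a in [1, 2); the linear seed 1.41176471f - 0.47058824f * a (i.e.
 * 24/17 - (8/17)*a) approximates 1/a, and adding m a second time re-applies
 * the exponent so the estimate approximates 1/|x|. Each Newton-Raphson step
 * b = 2 - est * ax; est = est * b roughly doubles the number of correct
 * bits: vrecip_medprec_f32 uses three steps, vrecip_hiprec_f32 uses four.
 */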
/* fast inverse approximation (4x Newton) */
__STATIC_INLINE f32x4_t vrecip_hiprec_f32(
    f32x4_t x)
{
    q31x4_t   m;
    f32x4_t   b;
    any32x4_t xinv;
    f32x4_t   ax = vabsq(x);

    xinv.f = ax;
    m = 0x3F800000 - (xinv.i & 0x7F800000);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f - 0.47058824f * xinv.f;
    xinv.i = xinv.i + m;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f - xinv.f * ax;
    xinv.f = xinv.f * b;

    xinv.f = vdupq_m(xinv.f, INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}
__STATIC_INLINE f32x4_t vdiv_f32(
    f32x4_t num, f32x4_t den)
{
    return vmulq(num, vrecip_hiprec_f32(den));
}
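/*
 * Usage sketch (illustrative only; the pointer names below are hypothetical,
 * not part of this header):
 *
 *     f32x4_t vnum = vld1q(pNum);           // load 4 numerators
 *     f32x4_t vden = vld1q(pDen);           // load 4 denominators
 *     vst1q(pDst, vdiv_f32(vnum, vden));    // approximate element-wise division
 *
 * Note that the quotient is approximate: the reciprocal comes from
 * vrecip_hiprec_f32 rather than a hardware divide.
 */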
/**
  @brief         Single-precision Taylor polynomial evaluation.
  @param[in]     x        f32 quad vector input
  @param[in]     coeffs   f32 quad vector coeffs
  @return        destination f32 quad vector
 */
__STATIC_INLINE f32x4_t vtaylor_polyq_f32(
    f32x4_t x,
    const float32_t * coeffs)
{
    f32x4_t A   = vfmasq(vdupq_n_f32(coeffs[4]), x, coeffs[0]);
    f32x4_t B   = vfmasq(vdupq_n_f32(coeffs[6]), x, coeffs[2]);
    f32x4_t C   = vfmasq(vdupq_n_f32(coeffs[5]), x, coeffs[1]);
    f32x4_t D   = vfmasq(vdupq_n_f32(coeffs[7]), x, coeffs[3]);
    f32x4_t x2  = vmulq(x, x);
    f32x4_t x4  = vmulq(x2, x2);
    f32x4_t res = vfmaq(vfmaq_f32(A, B, x2), vfmaq_f32(C, D, x2), x4);

    return res;
}
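/*
 * Evaluation note: this is an Estrin-style split of a degree-7 polynomial.
 * With A = coeffs[0] + coeffs[4]*x, B = coeffs[2] + coeffs[6]*x,
 * C = coeffs[1] + coeffs[5]*x and D = coeffs[3] + coeffs[7]*x the result is
 *
 *     res = (A + B*x^2) + (C + D*x^2) * x^4
 *
 * which shortens the FMA dependency chain compared with a plain Horner
 * scheme. The coefficient table is therefore interleaved rather than stored
 * in ascending power order.
 */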
__STATIC_INLINE f32x4_t vmant_exp_f32(
    f32x4_t x,
    int32x4_t * e)
{
    any32x4_t r;
    int32x4_t n;

    r.f = x;
    n = r.i >> 23;
    n = n - 127;
    r.i = r.i - (n << 23);

    *e = n;
    return r.f;
}
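/*
 * Worked example: for x = 6.0f the exponent field is 129, so n = 129 - 127 = 2
 * and subtracting (2 << 23) from the bit pattern leaves the mantissa 1.5f,
 * i.e. x = 1.5 * 2^2. Denormals and non-positive inputs are not handled here;
 * vlogq_f32 below only patches the exact-zero case.
 */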
__STATIC_INLINE f32x4_t vlogq_f32(f32x4_t vecIn)
{
    q31x4_t vecExpUnBiased;
    f32x4_t vecTmpFlt0, vecTmpFlt1;
    f32x4_t vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    f32x4_t vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    vecTmpFlt1 = vmant_exp_f32(vecIn, &vecExpUnBiased);

    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f32[4] * r.f) + (__logf_lut_f32[0]);
     */
    vecAcc0 = vdupq_n_f32(__logf_lut_f32[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f32[4]);
    /*
     * b = (__logf_lut_f32[6] * r.f) + (__logf_lut_f32[2]);
     */
    vecAcc1 = vdupq_n_f32(__logf_lut_f32[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f32[6]);
    /*
     * c = (__logf_lut_f32[5] * r.f) + (__logf_lut_f32[1]);
     */
    vecAcc2 = vdupq_n_f32(__logf_lut_f32[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f32[5]);
    /*
     * d = (__logf_lut_f32[7] * r.f) + (__logf_lut_f32[3]);
     */
    vecAcc3 = vdupq_n_f32(__logf_lut_f32[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f32[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f32_s32(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add exponent
     * r.f = r.f + ((float32_t) m) * __logf_rng_f32;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f32);

    /* set log(0) to -inf */
    vecAcc0 = vdupq_m(vecAcc0, -INFINITY, vcmpeqq(vecIn, 0.0f));
    return vecAcc0;
}
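/*
 * Overall scheme: with vecIn = a * 2^n (a in [1, 2)), the __logf_lut_f32
 * polynomial approximates log(a) and the exponent is folded back in as
 * log(vecIn) = log(a) + n * log(2), where log(2) is __logf_rng_f32.
 * Negative inputs are not special-cased.
 */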
__STATIC_INLINE f32x4_t vexpq_f32(
    f32x4_t x)
{
    // Perform range reduction [-log(2), log(2)]
    int32x4_t m = vcvtq_s32_f32(vmulq_n_f32(x, 1.4426950408f));
    f32x4_t val = vfmsq_f32(x, vcvtq_f32_s32(m), vdupq_n_f32(0.6931471805f));

    // Polynomial Approximation
    f32x4_t poly = vtaylor_polyq_f32(val, exp_tab);

    // Reconstruct
    poly = (f32x4_t) (vqaddq_s32((q31x4_t) (poly), vqshlq_n_s32(m, 23)));
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s32(m, -126));

    return poly;
}
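/*
 * Reconstruction detail: with m close to x / log(2) and val = x - m * log(2),
 * exp(x) = exp(val) * 2^m. The polynomial supplies exp(val) on the reduced
 * range and the 2^m factor is applied by adding (m << 23) directly to the
 * exponent bits of the float result. Results for m < -126 (inputs below
 * roughly -87) are flushed to zero instead of producing denormals.
 */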
__STATIC_INLINE f32x4_t arm_vec_exponent_f32(f32x4_t x, int32_t nb)
{
    f32x4_t r = x;
    nb--;
    while (nb > 0) {
        r = vmulq(r, x);
        nb--;
    }
    return (r);
}
__STATIC_INLINE f32x4_t vrecip_f32(f32x4_t vecIn)
{
    f32x4_t   vecSx, vecW, vecTmp;
    any32x4_t v;

    vecSx = vabsq(vecIn);

    v.f = vecIn;

    v.i = vsubq(vdupq_n_s32(INV_NEWTON_INIT_F32), v.i);

    vecW = vmulq(vecSx, v.f);

    // v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w * (56 + w * (-28 + w * (8 - w)))))));
    vecTmp = vsubq(vdupq_n_f32(8.0f), vecW);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
    v.f = vmulq(v.f, vecTmp);

    v.f = vdupq_m(v.f, INFINITY, vcmpeqq(vecIn, 0.0f));
    /*
     * restore sign
     */
    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));
    return v.f;
}
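/*
 * Refinement note: subtracting the bits from the magic constant
 * INV_NEWTON_INIT_F32 yields a crude estimate est of 1/|x|. With
 * w = |x| * est, the degree-7 polynomial above is
 *
 *     P(w) = (1 - (1 - w)^8) / w
 *
 * (its coefficients are the binomial coefficients of (1 - w)^8), so
 * est * P(w) = (1/|x|) * (1 - (1 - |x|*est)^8): the relative error of the
 * seed is raised to the 8th power, equivalent to three fused Newton steps.
 */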
__STATIC_INLINE f32x4_t vtanhq_f32(
    f32x4_t val)
{
    f32x4_t x =
        vminnmq_f32(vmaxnmq_f32(val, vdupq_n_f32(-10.f)), vdupq_n_f32(10.0f));
    f32x4_t exp2x = vexpq_f32(vmulq_n_f32(x, 2.f));
    f32x4_t num = vsubq_n_f32(exp2x, 1.f);
    f32x4_t den = vaddq_n_f32(exp2x, 1.f);
    f32x4_t tanh = vmulq_f32(num, vrecip_f32(den));
    return tanh;
}
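/*
 * Identity used: tanh(x) = (exp(2x) - 1) / (exp(2x) + 1). The input is
 * clamped to [-10, 10]; tanh already saturates to within single-precision
 * accuracy well before |x| = 10, so the clamp only bounds exp(2x).
 */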
__STATIC_INLINE f32x4_t vpowq_f32(
    f32x4_t val,
    f32x4_t n)
{
    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
}
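/*
 * Identity used: val^n = exp(n * log(val)). Because the computation goes
 * through vlogq_f32, meaningful results require val > 0.
 */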
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */

#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)
#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */

#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "NEMath.h"
/**
 * @brief Vectorized integer exponentiation
 * @param[in]  x    value
 * @param[in]  nb   integer exponent >= 1
 * @return x^nb
 *
 */
__STATIC_INLINE float32x4_t arm_vec_exponent_f32(float32x4_t x, int32_t nb)
{
    float32x4_t r = x;
    nb--;
    while (nb > 0)
    {
        r = vmulq_f32(r, x);
        nb--;
    }
    return (r);
}
__STATIC_INLINE float32x4_t __arm_vec_sqrt_f32_neon(float32x4_t x)
{
    float32x4_t x1 = vmaxq_f32(x, vdupq_n_f32(FLT_MIN));
    float32x4_t e = vrsqrteq_f32(x1);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    e = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x1, e), e), e);
    return vmulq_f32(x, e);
}
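/*
 * Scheme: vrsqrteq_f32 seeds an estimate e of 1/sqrt(x) and each
 * vrsqrtsq_f32(x*e, e) step returns the Newton-Raphson correction factor
 * (3 - x*e*e) / 2, so two refinement steps follow the seed; the final
 * multiply by x turns 1/sqrt(x) into sqrt(x). Clamping to FLT_MIN keeps the
 * estimate finite for x == 0, and multiplying by the original x still
 * returns 0 in that case.
 */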
__STATIC_INLINE int16x8_t __arm_vec_sqrt_q15_neon(int16x8_t vec)
{
    float32x4_t tempF;
    int32x4_t tempHI, tempLO;

    tempLO = vmovl_s16(vget_low_s16(vec));
    tempF = vcvtq_n_f32_s32(tempLO, 15);
    tempF = __arm_vec_sqrt_f32_neon(tempF);
    tempLO = vcvtq_n_s32_f32(tempF, 15);

    tempHI = vmovl_s16(vget_high_s16(vec));
    tempF = vcvtq_n_f32_s32(tempHI, 15);
    tempF = __arm_vec_sqrt_f32_neon(tempF);
    tempHI = vcvtq_n_s32_f32(tempF, 15);

    return (vcombine_s16(vqmovn_s32(tempLO), vqmovn_s32(tempHI)));
}
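/*
 * Fixed-point handling: each int16x4 half is widened to int32, converted to
 * float with 15 fractional bits (vcvtq_n_f32_s32(.., 15)), square-rooted in
 * float, then converted back and narrowed with saturation. Worked example:
 * 0x2000 (0.25 in Q15) -> 0.25f -> 0.5f -> 0x4000 (0.5 in Q15).
 */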
__STATIC_INLINE int32x4_t __arm_vec_sqrt_q31_neon(int32x4_t vec)
{
    float32x4_t temp;

    temp = vcvtq_n_f32_s32(vec, 31);
    temp = __arm_vec_sqrt_f32_neon(temp);
    return (vcvtq_n_s32_f32(temp, 31));
}

#endif /* (defined(ARM_MATH_NEON) || defined(ARM_MATH_NEON_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
#ifdef __cplusplus
}
#endif

#endif /* _ARM_VEC_MATH_H */

/**
 *
 * End of file.
 */