/******************************************************************************
 * @file     arm_vec_math_f16.h
 * @brief    Public header file for CMSIS DSP Library
 ******************************************************************************/
/*
 * Copyright (c) 2010-2020 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_VEC_MATH_F16_H
#define _ARM_VEC_MATH_F16_H

#include "arm_math_types_f16.h"
#include "arm_common_tables_f16.h"
#include "arm_helium_utils.h"

#ifdef   __cplusplus
extern "C"
{
#endif

#if defined(ARM_FLOAT16_SUPPORTED)

#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)

static const float16_t __logf_rng_f16 = 0.693147180f16;   /* ln(2) */

/* Fast reciprocal approximation: bit-trick seed refined by 3 Newton-Raphson iterations */
__STATIC_INLINE f16x8_t vrecip_medprec_f16(
    f16x8_t x)
{
    q15x8_t   m;
    f16x8_t   b;
    any16x8_t xinv;
    f16x8_t   ax = vabsq(x);

    xinv.f = ax;
    /* 0x03c00 is the binary16 exponent field at bias 15; m measures how far
       the exponent of |x| sits from it */
    m = 0x03c00 - (xinv.i & 0x07c00);
    /* rescale |x| into roughly [1, 2) ... */
    xinv.i = xinv.i + m;
    /* ... take a linear seed for the reciprocal of the rescaled value ... */
    xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
    /* ... and re-apply the exponent offset, turning 2^e into 2^-e */
    xinv.i = xinv.i + m;

    /* Newton-Raphson: y' = y * (2 - x*y); the error squares on each pass */
    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 = +inf */
    xinv.f = vdupq_m(xinv.f, F16INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}
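
/*
 * A scalar sketch of the same scheme, assuming hypothetical helpers
 * f16_bits()/bits_f16() that reinterpret binary16 values as raw 16-bit
 * integers (the vector code gets the same effect through any16x8_t):
 *
 *   uint16_t xb = f16_bits(ax);
 *   int16_t  m  = 0x3c00 - (xb & 0x7c00);   // exponent offset from bias 15
 *   _Float16 y  = bits_f16(xb + m);         // |x| rescaled to roughly [1, 2)
 *   y = 1.41176471f16 - 0.47058824f16 * y;  // linear seed for the reciprocal
 *   y = bits_f16(f16_bits(y) + m);          // fold the 2^-e scaling back in
 *   for (int k = 0; k < 3; k++)             // Newton-Raphson refinement
 *       y = y * (2.0f16 - ax * y);
 */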

/* Fast reciprocal approximation: same seed as above, refined by 4 Newton-Raphson iterations */
__STATIC_INLINE f16x8_t vrecip_hiprec_f16(
    f16x8_t x)
{
    q15x8_t   m;
    f16x8_t   b;
    any16x8_t xinv;
    f16x8_t   ax = vabsq(x);

    xinv.f = ax;
    m = 0x03c00 - (xinv.i & 0x07c00);
    xinv.i = xinv.i + m;
    xinv.f = 1.41176471f16 - 0.47058824f16 * xinv.f;
    xinv.i = xinv.i + m;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    b = 2.0f16 - xinv.f * ax;
    xinv.f = xinv.f * b;

    /* 1/0 = +inf */
    xinv.f = vdupq_m(xinv.f, F16INFINITY, vcmpeqq(x, 0.0f));
    /*
     * restore sign
     */
    xinv.f = vnegq_m(xinv.f, xinv.f, vcmpltq(x, 0.0f));

    return xinv.f;
}

/* Approximate division: multiply by the high-precision reciprocal of the denominator */
__STATIC_INLINE f16x8_t vdiv_f16(
    f16x8_t num, f16x8_t den)
{
    return vmulq(num, vrecip_hiprec_f16(den));
}
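
/*
 * Illustrative usage, not part of the upstream header (buffer names are
 * hypothetical; vld1q_f16()/vst1q_f16() are standard MVE intrinsics):
 *
 *   float16_t a[8], b[8], quot[8];
 *   ...
 *   vst1q_f16(quot, vdiv_f16(vld1q_f16(a), vld1q_f16(b)));
 *
 * Each lane of quot then holds a[i] / b[i] to the accuracy of the reciprocal
 * approximation, not a correctly rounded IEEE quotient.
 */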

/**
  @brief         Half-precision Taylor series evaluation (Estrin's scheme).
  @param[in]     x       f16 vector input
  @param[in]     coeffs  f16 polynomial coefficients
  @return        destination f16 vector
 */
__STATIC_INLINE float16x8_t vtaylor_polyq_f16(
    float16x8_t x,
    const float16_t * coeffs)
{
    float16x8_t A   = vfmasq(vdupq_n_f16(coeffs[4]), x, coeffs[0]);
    float16x8_t B   = vfmasq(vdupq_n_f16(coeffs[6]), x, coeffs[2]);
    float16x8_t C   = vfmasq(vdupq_n_f16(coeffs[5]), x, coeffs[1]);
    float16x8_t D   = vfmasq(vdupq_n_f16(coeffs[7]), x, coeffs[3]);
    float16x8_t x2  = vmulq(x, x);
    float16x8_t x4  = vmulq(x2, x2);
    float16x8_t res = vfmaq(vfmaq_f16(A, B, x2), vfmaq_f16(C, D, x2), x4);

    return res;
}
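
/*
 * Worked expansion of the evaluation above: the coefficients are consumed in
 * a permuted order, so that with A = c[0] + c[4]*x, B = c[2] + c[6]*x,
 * C = c[1] + c[5]*x and D = c[3] + c[7]*x, Estrin's scheme
 *
 *   res = (A + B*x^2) + (C + D*x^2) * x^4
 *
 * expands to c[0] + c[4]*x + c[2]*x^2 + c[6]*x^3 + c[1]*x^4 + c[5]*x^5
 * + c[3]*x^6 + c[7]*x^7. Compared with Horner's rule, this shortens the
 * dependency chain and exposes instruction-level parallelism.
 */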

/* Split a positive f16 vector into a mantissa in [1, 2) and an integer
   exponent, so that x = mant * 2^e (a frexp-style decomposition) */
__STATIC_INLINE float16x8_t vmant_exp_f16(
    float16x8_t x,
    int16x8_t * e)
{
    any16x8_t r;
    int16x8_t n;

    r.f = x;
    n = r.i >> 10;          /* isolate the biased exponent field */
    n = n - 15;             /* remove the binary16 bias */
    r.i = r.i - (n << 10);  /* force the exponent back to 0 */
    *e = n;

    return r.f;
}
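
/*
 * Scalar sketch (hypothetical helper names): this mirrors frexpf(), except
 * that the fraction lands in [1, 2) rather than [0.5, 1), and only positive
 * normal inputs are meaningful:
 *
 *   uint16_t bits = f16_bits(x);
 *   int16_t  e    = (int16_t)(bits >> 10) - 15;
 *   _Float16 mant = bits_f16(bits - (uint16_t)(e << 10));
 *   // now x == mant * 2^e, with mant in [1, 2)
 */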

__STATIC_INLINE float16x8_t vlogq_f16(float16x8_t vecIn)
{
    q15x8_t     vecExpUnBiased;
    float16x8_t vecTmpFlt0, vecTmpFlt1;
    float16x8_t vecAcc0, vecAcc1, vecAcc2, vecAcc3;
    float16x8_t vecExpUnBiasedFlt;

    /*
     * extract exponent
     */
    vecTmpFlt1 = vmant_exp_f16(vecIn, &vecExpUnBiased);

    vecTmpFlt0 = vecTmpFlt1 * vecTmpFlt1;
    /*
     * a = (__logf_lut_f16[4] * r.f) + (__logf_lut_f16[0]);
     */
    vecAcc0 = vdupq_n_f16(__logf_lut_f16[0]);
    vecAcc0 = vfmaq(vecAcc0, vecTmpFlt1, __logf_lut_f16[4]);
    /*
     * b = (__logf_lut_f16[6] * r.f) + (__logf_lut_f16[2]);
     */
    vecAcc1 = vdupq_n_f16(__logf_lut_f16[2]);
    vecAcc1 = vfmaq(vecAcc1, vecTmpFlt1, __logf_lut_f16[6]);
    /*
     * c = (__logf_lut_f16[5] * r.f) + (__logf_lut_f16[1]);
     */
    vecAcc2 = vdupq_n_f16(__logf_lut_f16[1]);
    vecAcc2 = vfmaq(vecAcc2, vecTmpFlt1, __logf_lut_f16[5]);
    /*
     * d = (__logf_lut_f16[7] * r.f) + (__logf_lut_f16[3]);
     */
    vecAcc3 = vdupq_n_f16(__logf_lut_f16[3]);
    vecAcc3 = vfmaq(vecAcc3, vecTmpFlt1, __logf_lut_f16[7]);
    /*
     * a = a + b * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc1, vecTmpFlt0);
    /*
     * c = c + d * xx;
     */
    vecAcc2 = vfmaq(vecAcc2, vecAcc3, vecTmpFlt0);
    /*
     * xx = xx * xx;
     */
    vecTmpFlt0 = vecTmpFlt0 * vecTmpFlt0;
    vecExpUnBiasedFlt = vcvtq_f16_s16(vecExpUnBiased);
    /*
     * r.f = a + c * xx;
     */
    vecAcc0 = vfmaq(vecAcc0, vecAcc2, vecTmpFlt0);
    /*
     * add the exponent contribution:
     * r.f = r.f + ((float16_t) m) * __logf_rng_f16;
     */
    vecAcc0 = vfmaq(vecAcc0, vecExpUnBiasedFlt, __logf_rng_f16);
    /* force log(0) down to -inf */
    vecAcc0 = vdupq_m(vecAcc0, -F16INFINITY, vcmpeqq(vecIn, 0.0f));

    return vecAcc0;
}
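
/*
 * Illustrative usage, not part of the upstream header (buffer names are
 * hypothetical):
 *
 *   float16_t in[8], out[8];
 *   ...
 *   vst1q_f16(out, vlogq_f16(vld1q_f16(in)));   // out[i] ~= ln(in[i])
 *
 * Inputs are expected to be positive: exact zeros are mapped to -inf by the
 * predicated vdupq_m() above, but negative inputs are not handled.
 */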

__STATIC_INLINE float16x8_t vexpq_f16(
    float16x8_t x)
{
    /* Range reduction: x = m * ln(2) + val, with val in [-ln(2), ln(2)];
       1.4426950408 is 1/ln(2) */
    int16x8_t   m   = vcvtq_s16_f16(vmulq_n_f16(x, 1.4426950408f16));
    float16x8_t val = vfmsq_f16(x, vcvtq_f16_s16(m), vdupq_n_f16(0.6931471805f16));

    /* Polynomial approximation of exp(val) */
    float16x8_t poly = vtaylor_polyq_f16(val, exp_tab_f16);

    /* Reconstruction: poly * 2^m, by adding m to the exponent bits with saturation */
    poly = (float16x8_t) (vqaddq_s16((int16x8_t) (poly), vqshlq_n_s16(m, 10)));
    /* flush results that underflow binary16 to zero */
    poly = vdupq_m(poly, 0.0f, vcmpltq_n_s16(m, -14));

    return poly;
}
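
/*
 * Scalar sketch of the reconstruction step (hypothetical helpers): since
 * exp(x) = 2^m * exp(val), multiplying by 2^m amounts to adding m to the
 * binary16 exponent field, which vqaddq_s16() does with saturation:
 *
 *   int16_t  m    = (int16_t)(x * 1.4426950408f);        // x / ln(2)
 *   _Float16 val  = x - (_Float16)m * 0.6931471805f16;   // |val| < ln(2)
 *   _Float16 p    = poly_eval(val);                      // ~ exp(val)
 *   uint16_t bits = f16_bits(p) + (uint16_t)(m << 10);   // p * 2^m
 *   _Float16 r    = (m < -14) ? 0.0f16 : bits_f16(bits); // flush underflow
 */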

/* Integer power x^nb (nb >= 1), by nb - 1 successive multiplies */
__STATIC_INLINE float16x8_t arm_vec_exponent_f16(float16x8_t x, int16_t nb)
{
    float16x8_t r = x;
    nb--;
    while (nb > 0) {
        r = vmulq(r, x);
        nb--;
    }
    return (r);
}
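
/*
 * Note: this costs nb - 1 vector multiplies. For larger fixed exponents, a
 * square-and-multiply ladder needs only O(log nb) multiplies, e.g. for x^7
 * (illustrative, not part of the upstream header):
 *
 *   f16x8_t x2 = vmulq(x, x);                // x^2
 *   f16x8_t x4 = vmulq(x2, x2);              // x^4
 *   f16x8_t x7 = vmulq(vmulq(x4, x2), x);    // 4 multiplies instead of 6
 */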

/* pow(val, n) computed as exp(n * ln(val)); meaningful for positive val only */
__STATIC_INLINE f16x8_t vpowq_f16(
    f16x8_t val,
    f16x8_t n)
{
    return vexpq_f16(vmulq_f16(n, vlogq_f16(val)));
}
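
/*
 * An illustrative call:
 *
 *   f16x8_t r = vpowq_f16(vdupq_n_f16(2.0f16), vdupq_n_f16(0.5f16));
 *   // each lane of r ~= sqrt(2)
 */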

/* Seed for the Newton reciprocal below: 0x7773 - bits(|x|) approximates bits(1/|x|) */
#define INV_NEWTON_INIT_F16  0x7773

__STATIC_INLINE f16x8_t vrecip_f16(f16x8_t vecIn)
{
    f16x8_t   vecSx, vecW, vecTmp;
    any16x8_t v;

    vecSx = vabsq(vecIn);

    /* seed from |x|; the sign is restored after refinement */
    v.f = vecSx;
    v.i = vsubq(vdupq_n_s16(INV_NEWTON_INIT_F16), v.i);

    vecW = vmulq(vecSx, v.f);

    /* v.f = v.f * (8 + w * (-28 + w * (56 + w * (-70 + w * (56 + w * (-28 + w * (8 - w))))))) */
    vecTmp = vsubq(vdupq_n_f16(8.0f), vecW);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -70.0f);
    vecTmp = vfmasq(vecW, vecTmp, 56.0f);
    vecTmp = vfmasq(vecW, vecTmp, -28.0f);
    vecTmp = vfmasq(vecW, vecTmp, 8.0f);
    v.f = vmulq(v.f, vecTmp);

    /* 1/0 = +inf */
    v.f = vdupq_m(v.f, F16INFINITY, vcmpeqq(vecIn, 0.0f));
    /*
     * restore sign
     */
    v.f = vnegq_m(v.f, v.f, vcmpltq(vecIn, 0.0f));

    return v.f;
}
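
/*
 * Derivation note: with w = |x| * v0, the degree-7 polynomial above equals
 * (1 - (1 - w)^8) / w, so the refined value is
 *
 *   v0 * (1 - (1 - w)^8) / w  =  (1 - (1 - |x|*v0)^8) / |x|,
 *
 * whose relative error is (1 - |x|*v0)^8: one fused step with the same
 * contraction as three quadratic Newton-Raphson iterations.
 */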

/* tanh(x) = (exp(2x) - 1) / (exp(2x) + 1), with x clamped to [-10, 10] */
__STATIC_INLINE f16x8_t vtanhq_f16(
    f16x8_t val)
{
    f16x8_t x =
        vminnmq_f16(vmaxnmq_f16(val, vdupq_n_f16(-10.f)), vdupq_n_f16(10.0f));
    f16x8_t exp2x = vexpq_f16(vmulq_n_f16(x, 2.f));
    f16x8_t num   = vsubq_n_f16(exp2x, 1.f);
    f16x8_t den   = vaddq_n_f16(exp2x, 1.f);
    f16x8_t tanh  = vmulq_f16(num, vrecip_f16(den));

    return tanh;
}
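
/*
 * Illustrative usage, e.g. as an activation function over one vector of 8
 * lanes (buffer name hypothetical):
 *
 *   float16_t act[8];
 *   ...
 *   vst1q_f16(act, vtanhq_f16(vld1q_f16(act)));   // act[i] ~= tanh(act[i])
 *
 * The clamp to [-10, 10] keeps vexpq_f16(2*x) from overflowing in binary16,
 * while tanh() is already saturated at +/-1 well inside that range.
 */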

#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */

#endif /* ARM FLOAT16 SUPPORTED */

#ifdef   __cplusplus
}
#endif

#endif /* _ARM_VEC_MATH_F16_H */

/**
 *
 * End of file.
 */