arm_rfft_fast_f16.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_rfft_fast_f16.c
  4. * Description: RFFT & RIFFT Floating point process function
  5. *
  6. * $Date: 23 April 2021
  7. * $Revision: V1.9.0
  8. *
  9. * Target Processor: Cortex-M and Cortex-A cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/transform_functions_f16.h"
  29. #include "arm_common_tables_f16.h"
  30. #if defined(ARM_FLOAT16_SUPPORTED)
  31. #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
  32. void stage_rfft_f16(
  33. const arm_rfft_fast_instance_f16 * S,
  34. float16_t * p,
  35. float16_t * pOut)
  36. {
  37. int32_t k; /* Loop Counter */
  38. float16_t twR, twI; /* RFFT Twiddle coefficients */
  39. const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
  40. float16_t *pA = p; /* increasing pointer */
  41. float16_t *pB = p; /* decreasing pointer */
  42. float16_t xAR, xAI, xBR, xBI; /* temporary variables */
  43. float16_t t1a, t1b; /* temporary variables */
  44. float16_t p0, p1, p2, p3; /* temporary variables */
  45. float16x8x2_t tw,xA,xB;
  46. float16x8x2_t tmp1, tmp2, res;
  47. uint16x8_t vecStridesBkwd;
  48. vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
  49. int blockCnt;
  50. k = (S->Sint).fftLen - 1;
  51. /* Pack first and last sample of the frequency domain together */
  52. xBR = pB[0];
  53. xBI = pB[1];
  54. xAR = pA[0];
  55. xAI = pA[1];
  56. twR = *pCoeff++ ;
  57. twI = *pCoeff++ ;
  58. // U1 = XA(1) + XB(1); % It is real
  59. t1a = (_Float16)xBR + (_Float16)xAR ;
  60. // U2 = XB(1) - XA(1); % It is imaginary
  61. t1b = (_Float16)xBI + (_Float16)xAI ;
  62. // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
  63. // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
  64. *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b );
  65. *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b );
  66. // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
  67. pB = p + 2*k - 14;
  68. pA += 2;
  69. blockCnt = k >> 3;
  70. while (blockCnt > 0)
  71. {
  72. /*
  73. function X = my_split_rfft(X, ifftFlag)
  74. % X is a series of real numbers
  75. L = length(X);
  76. XC = X(1:2:end) +i*X(2:2:end);
  77. XA = fft(XC);
  78. XB = conj(XA([1 end:-1:2]));
  79. TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
  80. for l = 2:L/2
  81. XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
  82. end
  83. XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
  84. X = XA;
  85. */
  86. xA = vld2q_f16(pA);
  87. pA += 16;
  88. xB = vld2q_f16(pB);
  89. xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
  90. xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
  91. xB.val[1] = vnegq_f16(xB.val[1]);
  92. pB -= 16;
  93. tw = vld2q_f16(pCoeff);
  94. pCoeff += 16;
  95. tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
  96. tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
  97. tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
  98. tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
  99. res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
  100. res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
  101. res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
  102. res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
  103. res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
  104. res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
  105. res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
  106. res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
  107. vst2q_f16(pOut, res);
  108. pOut += 16;
  109. blockCnt--;
  110. }
  111. pB += 14;
  112. blockCnt = k & 7;
  113. while (blockCnt > 0)
  114. {
  115. /*
  116. function X = my_split_rfft(X, ifftFlag)
  117. % X is a series of real numbers
  118. L = length(X);
  119. XC = X(1:2:end) +i*X(2:2:end);
  120. XA = fft(XC);
  121. XB = conj(XA([1 end:-1:2]));
  122. TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
  123. for l = 2:L/2
  124. XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
  125. end
  126. XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
  127. X = XA;
  128. */
  129. xBI = pB[1];
  130. xBR = pB[0];
  131. xAR = pA[0];
  132. xAI = pA[1];
  133. twR = *pCoeff++;
  134. twI = *pCoeff++;
  135. t1a = (_Float16)xBR - (_Float16)xAR ;
  136. t1b = (_Float16)xBI + (_Float16)xAI ;
  137. // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
  138. // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
  139. p0 = (_Float16)twR * (_Float16)t1a;
  140. p1 = (_Float16)twI * (_Float16)t1a;
  141. p2 = (_Float16)twR * (_Float16)t1b;
  142. p3 = (_Float16)twI * (_Float16)t1b;
  143. *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR
  144. *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI
  145. pA += 2;
  146. pB -= 2;
  147. blockCnt--;
  148. }
  149. }
  150. /* Prepares data for inverse cfft */
  151. void merge_rfft_f16(
  152. const arm_rfft_fast_instance_f16 * S,
  153. float16_t * p,
  154. float16_t * pOut)
  155. {
  156. int32_t k; /* Loop Counter */
  157. float16_t twR, twI; /* RFFT Twiddle coefficients */
  158. const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
  159. float16_t *pA = p; /* increasing pointer */
  160. float16_t *pB = p; /* decreasing pointer */
  161. float16_t xAR, xAI, xBR, xBI; /* temporary variables */
  162. float16_t t1a, t1b, r, s, t, u; /* temporary variables */
  163. float16x8x2_t tw,xA,xB;
  164. float16x8x2_t tmp1, tmp2, res;
  165. uint16x8_t vecStridesBkwd;
  166. vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
  167. int blockCnt;
  168. k = (S->Sint).fftLen - 1;
  169. xAR = pA[0];
  170. xAI = pA[1];
  171. pCoeff += 2 ;
  172. *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI );
  173. *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI );
  174. pB = p + 2*k - 14;
  175. pA += 2 ;
  176. blockCnt = k >> 3;
  177. while (blockCnt > 0)
  178. {
  179. /* G is half of the frequency complex spectrum */
  180. //for k = 2:N
  181. // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
  182. xA = vld2q_f16(pA);
  183. pA += 16;
  184. xB = vld2q_f16(pB);
  185. xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
  186. xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
  187. xB.val[1] = vnegq_f16(xB.val[1]);
  188. pB -= 16;
  189. tw = vld2q_f16(pCoeff);
  190. tw.val[1] = vnegq_f16(tw.val[1]);
  191. pCoeff += 16;
  192. tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
  193. tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
  194. tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
  195. tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
  196. res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
  197. res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
  198. res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
  199. res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
  200. res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
  201. res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
  202. res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
  203. res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
  204. vst2q_f16(pOut, res);
  205. pOut += 16;
  206. blockCnt--;
  207. }
  208. pB += 14;
  209. blockCnt = k & 7;
  210. while (blockCnt > 0)
  211. {
  212. /* G is half of the frequency complex spectrum */
  213. //for k = 2:N
  214. // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
  215. xBI = pB[1] ;
  216. xBR = pB[0] ;
  217. xAR = pA[0];
  218. xAI = pA[1];
  219. twR = *pCoeff++;
  220. twI = *pCoeff++;
  221. t1a = (_Float16)xAR - (_Float16)xBR ;
  222. t1b = (_Float16)xAI + (_Float16)xBI ;
  223. r = (_Float16)twR * (_Float16)t1a;
  224. s = (_Float16)twI * (_Float16)t1b;
  225. t = (_Float16)twI * (_Float16)t1a;
  226. u = (_Float16)twR * (_Float16)t1b;
  227. // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
  228. // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
  229. *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR
  230. *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI
  231. pA += 2;
  232. pB -= 2;
  233. blockCnt--;
  234. }
  235. }
  236. #else
  237. void stage_rfft_f16(
  238. const arm_rfft_fast_instance_f16 * S,
  239. float16_t * p,
  240. float16_t * pOut)
  241. {
  242. int32_t k; /* Loop Counter */
  243. float16_t twR, twI; /* RFFT Twiddle coefficients */
  244. const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
  245. float16_t *pA = p; /* increasing pointer */
  246. float16_t *pB = p; /* decreasing pointer */
  247. float16_t xAR, xAI, xBR, xBI; /* temporary variables */
  248. float16_t t1a, t1b; /* temporary variables */
  249. float16_t p0, p1, p2, p3; /* temporary variables */
  250. k = (S->Sint).fftLen - 1;
  251. /* Pack first and last sample of the frequency domain together */
  252. xBR = pB[0];
  253. xBI = pB[1];
  254. xAR = pA[0];
  255. xAI = pA[1];
  256. twR = *pCoeff++ ;
  257. twI = *pCoeff++ ;
  258. // U1 = XA(1) + XB(1); % It is real
  259. t1a = (_Float16)xBR + (_Float16)xAR ;
  260. // U2 = XB(1) - XA(1); % It is imaginary
  261. t1b = (_Float16)xBI + (_Float16)xAI ;
  262. // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
  263. // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
  264. *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b );
  265. *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b );
  266. // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
  267. pB = p + 2*k;
  268. pA += 2;
  269. do
  270. {
  271. /*
  272. function X = my_split_rfft(X, ifftFlag)
  273. % X is a series of real numbers
  274. L = length(X);
  275. XC = X(1:2:end) +i*X(2:2:end);
  276. XA = fft(XC);
  277. XB = conj(XA([1 end:-1:2]));
  278. TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
  279. for l = 2:L/2
  280. XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
  281. end
  282. XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
  283. X = XA;
  284. */
  285. xBI = pB[1];
  286. xBR = pB[0];
  287. xAR = pA[0];
  288. xAI = pA[1];
  289. twR = *pCoeff++;
  290. twI = *pCoeff++;
  291. t1a = (_Float16)xBR - (_Float16)xAR ;
  292. t1b = (_Float16)xBI + (_Float16)xAI ;
  293. // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
  294. // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
  295. p0 = (_Float16)twR * (_Float16)t1a;
  296. p1 = (_Float16)twI * (_Float16)t1a;
  297. p2 = (_Float16)twR * (_Float16)t1b;
  298. p3 = (_Float16)twI * (_Float16)t1b;
  299. *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR
  300. *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI
  301. pA += 2;
  302. pB -= 2;
  303. k--;
  304. } while (k > 0);
  305. }
  306. /* Prepares data for inverse cfft */
  307. void merge_rfft_f16(
  308. const arm_rfft_fast_instance_f16 * S,
  309. float16_t * p,
  310. float16_t * pOut)
  311. {
  312. int32_t k; /* Loop Counter */
  313. float16_t twR, twI; /* RFFT Twiddle coefficients */
  314. const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
  315. float16_t *pA = p; /* increasing pointer */
  316. float16_t *pB = p; /* decreasing pointer */
  317. float16_t xAR, xAI, xBR, xBI; /* temporary variables */
  318. float16_t t1a, t1b, r, s, t, u; /* temporary variables */
  319. k = (S->Sint).fftLen - 1;
  320. xAR = pA[0];
  321. xAI = pA[1];
  322. pCoeff += 2 ;
  323. *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI );
  324. *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI );
  325. pB = p + 2*k ;
  326. pA += 2 ;
  327. while (k > 0)
  328. {
  329. /* G is half of the frequency complex spectrum */
  330. //for k = 2:N
  331. // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
  332. xBI = pB[1] ;
  333. xBR = pB[0] ;
  334. xAR = pA[0];
  335. xAI = pA[1];
  336. twR = *pCoeff++;
  337. twI = *pCoeff++;
  338. t1a = (_Float16)xAR - (_Float16)xBR ;
  339. t1b = (_Float16)xAI + (_Float16)xBI ;
  340. r = (_Float16)twR * (_Float16)t1a;
  341. s = (_Float16)twI * (_Float16)t1b;
  342. t = (_Float16)twI * (_Float16)t1a;
  343. u = (_Float16)twR * (_Float16)t1b;
  344. // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
  345. // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
  346. *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR
  347. *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI
  348. pA += 2;
  349. pB -= 2;
  350. k--;
  351. }
  352. }
  353. #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
  354. /**
  355. @ingroup RealFFT
  356. */
  357. /**
  358. @defgroup RealFFTF16 Real FFT F16 Functions
  359. */
  360. /**
  361. @addtogroup RealFFTF16
  362. @{
  363. */
  364. /**
  365. @brief Processing function for the floating-point real FFT.
  366. @param[in] S points to an arm_rfft_fast_instance_f16 structure
  367. @param[in] p points to input buffer (Source buffer is modified by this function.)
  368. @param[in] pOut points to output buffer
  369. @param[in] ifftFlag
  370. - value = 0: RFFT
  371. - value = 1: RIFFT
  372. */
  373. void arm_rfft_fast_f16(
  374. const arm_rfft_fast_instance_f16 * S,
  375. float16_t * p,
  376. float16_t * pOut,
  377. uint8_t ifftFlag)
  378. {
  379. const arm_cfft_instance_f16 * Sint = &(S->Sint);
  380. /* Calculation of Real FFT */
  381. if (ifftFlag)
  382. {
  383. /* Real FFT compression */
  384. merge_rfft_f16(S, p, pOut);
  385. /* Complex radix-4 IFFT process */
  386. arm_cfft_f16( Sint, pOut, ifftFlag, 1);
  387. }
  388. else
  389. {
  390. /* Calculation of RFFT of input */
  391. arm_cfft_f16( Sint, p, ifftFlag, 1);
  392. /* Real FFT extraction */
  393. stage_rfft_f16(S, p, pOut);
  394. }
  395. }
  396. /**
  397. * @} end of RealFFTF16 group
  398. */
  399. #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */