/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cfft_f32.c
 * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/transform_functions.h"
  29. #include "arm_common_tables.h"
  30. #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
  31. #include "arm_helium_utils.h"
  32. #include "arm_vec_fft.h"
  33. #include "arm_mve_tables.h"
  34. static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
  35. {
  36. float32_t retValue=1.0;
  37. switch (fftLen)
  38. {
  39. case 4096U:
  40. retValue = 0.000244140625;
  41. break;
  42. case 2048U:
  43. retValue = 0.00048828125;
  44. break;
  45. case 1024U:
  46. retValue = 0.0009765625f;
  47. break;
  48. case 512U:
  49. retValue = 0.001953125;
  50. break;
  51. case 256U:
  52. retValue = 0.00390625f;
  53. break;
  54. case 128U:
  55. retValue = 0.0078125;
  56. break;
  57. case 64U:
  58. retValue = 0.015625f;
  59. break;
  60. case 32U:
  61. retValue = 0.03125;
  62. break;
  63. case 16U:
  64. retValue = 0.0625f;
  65. break;
  66. default:
  67. break;
  68. }
  69. return(retValue);
  70. }
  71. static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
  72. {
  73. f32x4_t vecTmp0, vecTmp1;
  74. f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
  75. f32x4_t vecA, vecB, vecC, vecD;
  76. uint32_t blkCnt;
  77. uint32_t n1, n2;
  78. uint32_t stage = 0;
  79. int32_t iter = 1;
  80. static const int32_t strides[4] = {
  81. (0 - 16) * (int32_t)sizeof(q31_t *),
  82. (1 - 16) * (int32_t)sizeof(q31_t *),
  83. (8 - 16) * (int32_t)sizeof(q31_t *),
  84. (9 - 16) * (int32_t)sizeof(q31_t *)
  85. };
  86. n2 = fftLen;
  87. n1 = n2;
  88. n2 >>= 2u;
  89. for (int k = fftLen / 4u; k > 1; k >>= 2)
  90. {
  91. float32_t const *p_rearranged_twiddle_tab_stride1 =
  92. &S->rearranged_twiddle_stride1[
  93. S->rearranged_twiddle_tab_stride1_arr[stage]];
  94. float32_t const *p_rearranged_twiddle_tab_stride2 =
  95. &S->rearranged_twiddle_stride2[
  96. S->rearranged_twiddle_tab_stride2_arr[stage]];
  97. float32_t const *p_rearranged_twiddle_tab_stride3 =
  98. &S->rearranged_twiddle_stride3[
  99. S->rearranged_twiddle_tab_stride3_arr[stage]];
  100. float32_t * pBase = pSrc;
  101. for (int i = 0; i < iter; i++)
  102. {
  103. float32_t *inA = pBase;
  104. float32_t *inB = inA + n2 * CMPLX_DIM;
  105. float32_t *inC = inB + n2 * CMPLX_DIM;
  106. float32_t *inD = inC + n2 * CMPLX_DIM;
  107. float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
  108. float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
  109. float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
  110. f32x4_t vecW;
  111. blkCnt = n2 / 2;
  112. /*
  113. * load 2 f32 complex pair
  114. */
  115. vecA = vldrwq_f32(inA);
  116. vecC = vldrwq_f32(inC);
  117. while (blkCnt > 0U)
  118. {
  119. vecB = vldrwq_f32(inB);
  120. vecD = vldrwq_f32(inD);
  121. vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
  122. vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
  123. vecSum1 = vecB + vecD;
  124. vecDiff1 = vecB - vecD;
  125. /*
  126. * [ 1 1 1 1 ] * [ A B C D ]' .* 1
  127. */
  128. vecTmp0 = vecSum0 + vecSum1;
  129. vst1q(inA, vecTmp0);
  130. inA += 4;
  131. /*
  132. * [ 1 -1 1 -1 ] * [ A B C D ]'
  133. */
  134. vecTmp0 = vecSum0 - vecSum1;
  135. /*
  136. * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
  137. */
  138. vecW = vld1q(pW2);
  139. pW2 += 4;
  140. vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
  141. vst1q(inB, vecTmp1);
  142. inB += 4;
  143. /*
  144. * [ 1 -i -1 +i ] * [ A B C D ]'
  145. */
  146. vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
  147. /*
  148. * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
  149. */
  150. vecW = vld1q(pW1);
  151. pW1 +=4;
  152. vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
  153. vst1q(inC, vecTmp1);
  154. inC += 4;
  155. /*
  156. * [ 1 +i -1 -i ] * [ A B C D ]'
  157. */
  158. vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
  159. /*
  160. * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
  161. */
  162. vecW = vld1q(pW3);
  163. pW3 += 4;
  164. vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
  165. vst1q(inD, vecTmp1);
  166. inD += 4;
  167. vecA = vldrwq_f32(inA);
  168. vecC = vldrwq_f32(inC);
  169. blkCnt--;
  170. }
  171. pBase += CMPLX_DIM * n1;
  172. }
  173. n1 = n2;
  174. n2 >>= 2u;
  175. iter = iter << 2;
  176. stage++;
  177. }
  178. /*
  179. * start of Last stage process
  180. */
  181. uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
  182. vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
  183. /* load scheduling */
  184. vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
  185. vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
  186. blkCnt = (fftLen >> 3);
  187. while (blkCnt > 0U)
  188. {
  189. vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
  190. vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
  191. vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
  192. vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);
  193. vecSum1 = vecB + vecD;
  194. vecDiff1 = vecB - vecD;
  195. /* pre-load for next iteration */
  196. vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
  197. vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
  198. vecTmp0 = vecSum0 + vecSum1;
  199. vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);
  200. vecTmp0 = vecSum0 - vecSum1;
  201. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
  202. vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
  203. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);
  204. vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
  205. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);
  206. blkCnt--;
  207. }
  208. /*
  209. * End of last stage process
  210. */
  211. }
  212. static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
  213. {
  214. float32_t const *pCoefVec;
  215. float32_t const *pCoef = S->pTwiddle;
  216. float32_t *pIn0, *pIn1;
  217. uint32_t n2;
  218. uint32_t blkCnt;
  219. f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
  220. f32x4_t vecCmplxTmp, vecTw;
  221. n2 = fftLen >> 1;
  222. pIn0 = pSrc;
  223. pIn1 = pSrc + fftLen;
  224. pCoefVec = pCoef;
  225. blkCnt = n2 / 2;
  226. while (blkCnt > 0U)
  227. {
  228. vecIn0 = *(f32x4_t *) pIn0;
  229. vecIn1 = *(f32x4_t *) pIn1;
  230. vecTw = vld1q(pCoefVec);
  231. pCoefVec += 4;
  232. vecSum = vecIn0 + vecIn1;
  233. vecDiff = vecIn0 - vecIn1;
  234. vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
  235. vst1q(pIn0, vecSum);
  236. pIn0 += 4;
  237. vst1q(pIn1, vecCmplxTmp);
  238. pIn1 += 4;
  239. blkCnt--;
  240. }
  241. _arm_radix4_butterfly_f32_mve(S, pSrc, n2);
  242. _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
  243. pIn0 = pSrc;
  244. }
  245. static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
  246. {
  247. f32x4_t vecTmp0, vecTmp1;
  248. f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
  249. f32x4_t vecA, vecB, vecC, vecD;
  250. uint32_t blkCnt;
  251. uint32_t n1, n2;
  252. uint32_t stage = 0;
  253. int32_t iter = 1;
  254. static const int32_t strides[4] = {
  255. (0 - 16) * (int32_t)sizeof(q31_t *),
  256. (1 - 16) * (int32_t)sizeof(q31_t *),
  257. (8 - 16) * (int32_t)sizeof(q31_t *),
  258. (9 - 16) * (int32_t)sizeof(q31_t *)
  259. };
  260. n2 = fftLen;
  261. n1 = n2;
  262. n2 >>= 2u;
  263. for (int k = fftLen / 4; k > 1; k >>= 2)
  264. {
  265. float32_t const *p_rearranged_twiddle_tab_stride1 =
  266. &S->rearranged_twiddle_stride1[
  267. S->rearranged_twiddle_tab_stride1_arr[stage]];
  268. float32_t const *p_rearranged_twiddle_tab_stride2 =
  269. &S->rearranged_twiddle_stride2[
  270. S->rearranged_twiddle_tab_stride2_arr[stage]];
  271. float32_t const *p_rearranged_twiddle_tab_stride3 =
  272. &S->rearranged_twiddle_stride3[
  273. S->rearranged_twiddle_tab_stride3_arr[stage]];
  274. float32_t * pBase = pSrc;
  275. for (int i = 0; i < iter; i++)
  276. {
  277. float32_t *inA = pBase;
  278. float32_t *inB = inA + n2 * CMPLX_DIM;
  279. float32_t *inC = inB + n2 * CMPLX_DIM;
  280. float32_t *inD = inC + n2 * CMPLX_DIM;
  281. float32_t const *pW1 = p_rearranged_twiddle_tab_stride1;
  282. float32_t const *pW2 = p_rearranged_twiddle_tab_stride2;
  283. float32_t const *pW3 = p_rearranged_twiddle_tab_stride3;
  284. f32x4_t vecW;
  285. blkCnt = n2 / 2;
  286. /*
  287. * load 2 f32 complex pair
  288. */
  289. vecA = vldrwq_f32(inA);
  290. vecC = vldrwq_f32(inC);
  291. while (blkCnt > 0U)
  292. {
  293. vecB = vldrwq_f32(inB);
  294. vecD = vldrwq_f32(inD);
  295. vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
  296. vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
  297. vecSum1 = vecB + vecD;
  298. vecDiff1 = vecB - vecD;
  299. /*
  300. * [ 1 1 1 1 ] * [ A B C D ]' .* 1
  301. */
  302. vecTmp0 = vecSum0 + vecSum1;
  303. vst1q(inA, vecTmp0);
  304. inA += 4;
  305. /*
  306. * [ 1 -1 1 -1 ] * [ A B C D ]'
  307. */
  308. vecTmp0 = vecSum0 - vecSum1;
  309. /*
  310. * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
  311. */
  312. vecW = vld1q(pW2);
  313. pW2 += 4;
  314. vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
  315. vst1q(inB, vecTmp1);
  316. inB += 4;
  317. /*
  318. * [ 1 -i -1 +i ] * [ A B C D ]'
  319. */
  320. vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
  321. /*
  322. * [ 1 -i -1 +i ] * [ A B C D ]'.* W2
  323. */
  324. vecW = vld1q(pW1);
  325. pW1 += 4;
  326. vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
  327. vst1q(inC, vecTmp1);
  328. inC += 4;
  329. /*
  330. * [ 1 +i -1 -i ] * [ A B C D ]'
  331. */
  332. vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
  333. /*
  334. * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
  335. */
  336. vecW = vld1q(pW3);
  337. pW3 += 4;
  338. vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
  339. vst1q(inD, vecTmp1);
  340. inD += 4;
  341. vecA = vldrwq_f32(inA);
  342. vecC = vldrwq_f32(inC);
  343. blkCnt--;
  344. }
  345. pBase += CMPLX_DIM * n1;
  346. }
  347. n1 = n2;
  348. n2 >>= 2u;
  349. iter = iter << 2;
  350. stage++;
  351. }
  352. /*
  353. * start of Last stage process
  354. */
  355. uint32x4_t vecScGathAddr = vld1q_u32 ((uint32_t*)strides);
  356. vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
  357. /*
  358. * load scheduling
  359. */
  360. vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
  361. vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
  362. blkCnt = (fftLen >> 3);
  363. while (blkCnt > 0U)
  364. {
  365. vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
  366. vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
  367. vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
  368. vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);
  369. vecSum1 = vecB + vecD;
  370. vecDiff1 = vecB - vecD;
  371. vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
  372. vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
  373. vecTmp0 = vecSum0 + vecSum1;
  374. vecTmp0 = vecTmp0 * onebyfftLen;
  375. vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);
  376. vecTmp0 = vecSum0 - vecSum1;
  377. vecTmp0 = vecTmp0 * onebyfftLen;
  378. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
  379. vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
  380. vecTmp0 = vecTmp0 * onebyfftLen;
  381. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);
  382. vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
  383. vecTmp0 = vecTmp0 * onebyfftLen;
  384. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);
  385. blkCnt--;
  386. }
  387. /*
  388. * End of last stage process
  389. */
  390. }
  391. static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
  392. {
  393. float32_t const *pCoefVec;
  394. float32_t const *pCoef = S->pTwiddle;
  395. float32_t *pIn0, *pIn1;
  396. uint32_t n2;
  397. float32_t onebyfftLen = arm_inverse_fft_length_f32(fftLen);
  398. uint32_t blkCnt;
  399. f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
  400. f32x4_t vecCmplxTmp, vecTw;
  401. n2 = fftLen >> 1;
  402. pIn0 = pSrc;
  403. pIn1 = pSrc + fftLen;
  404. pCoefVec = pCoef;
  405. blkCnt = n2 / 2;
  406. while (blkCnt > 0U)
  407. {
  408. vecIn0 = *(f32x4_t *) pIn0;
  409. vecIn1 = *(f32x4_t *) pIn1;
  410. vecTw = vld1q(pCoefVec);
  411. pCoefVec += 4;
  412. vecSum = vecIn0 + vecIn1;
  413. vecDiff = vecIn0 - vecIn1;
  414. vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
  415. vst1q(pIn0, vecSum);
  416. pIn0 += 4;
  417. vst1q(pIn1, vecCmplxTmp);
  418. pIn1 += 4;
  419. blkCnt--;
  420. }
  421. _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen);
  422. _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen);
  423. }
  424. /**
  425. @addtogroup ComplexFFT
  426. @{
  427. */
  428. /**
  429. @brief Processing function for the floating-point complex FFT.
  430. @param[in] S points to an instance of the floating-point CFFT structure
  431. @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  432. @param[in] ifftFlag flag that selects transform direction
  433. - value = 0: forward transform
  434. - value = 1: inverse transform
  435. @param[in] bitReverseFlag flag that enables / disables bit reversal of output
  436. - value = 0: disables bit reversal of output
  437. - value = 1: enables bit reversal of output
  438. @return none
  439. */
  440. void arm_cfft_f32(
  441. const arm_cfft_instance_f32 * S,
  442. float32_t * pSrc,
  443. uint8_t ifftFlag,
  444. uint8_t bitReverseFlag)
  445. {
  446. uint32_t fftLen = S->fftLen;
  447. if (ifftFlag == 1U) {
  448. switch (fftLen) {
  449. case 16:
  450. case 64:
  451. case 256:
  452. case 1024:
  453. case 4096:
  454. _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
  455. break;
  456. case 32:
  457. case 128:
  458. case 512:
  459. case 2048:
  460. arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
  461. break;
  462. }
  463. } else {
  464. switch (fftLen) {
  465. case 16:
  466. case 64:
  467. case 256:
  468. case 1024:
  469. case 4096:
  470. _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
  471. break;
  472. case 32:
  473. case 128:
  474. case 512:
  475. case 2048:
  476. arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
  477. break;
  478. }
  479. }
  480. if (bitReverseFlag)
  481. {
  482. arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
  483. }
  484. }
  485. #else
  486. extern void arm_radix8_butterfly_f32(
  487. float32_t * pSrc,
  488. uint16_t fftLen,
  489. const float32_t * pCoef,
  490. uint16_t twidCoefModifier);
  491. extern void arm_bitreversal_32(
  492. uint32_t * pSrc,
  493. const uint16_t bitRevLen,
  494. const uint16_t * pBitRevTable);
  495. /**
  496. @ingroup groupTransforms
  497. */
  498. /**
  499. @defgroup ComplexFFT Complex FFT Functions
  500. @par
  501. The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
  502. Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
  503. than the DFT, especially for long lengths.
  504. The algorithms described in this section
  505. operate on complex data. A separate set of functions is devoted to handling
  506. of real sequences.
  507. @par
  508. There are separate algorithms for handling floating-point, Q15, and Q31 data
  509. types. The algorithms available for each data type are described next.
  510. @par
  511. The FFT functions operate in-place. That is, the array holding the input data
  512. will also be used to hold the corresponding result. The input data is complex
  513. and contains <code>2*fftLen</code> interleaved values as shown below.
  514. <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
  515. The FFT result will be contained in the same array and the frequency domain
  516. values will have the same interleaving.
  517. @par Floating-point
  518. The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
  519. stages are performed along with a single radix-2 or radix-4 stage, as needed.
  520. The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
  521. a different twiddle factor table.
  522. @par
  523. The function uses the standard FFT definition and output values may grow by a
  524. factor of <code>fftLen</code> when computing the forward transform. The
  525. inverse transform includes a scale of <code>1/fftLen</code> as part of the
  526. calculation and this matches the textbook definition of the inverse FFT.
  527. @par
  528. For the MVE version, the new arm_cfft_init_f32 initialization function is
  529. <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
  530. needed FFTs.</b> Other FFT versions can continue to be initialized as
  531. explained below.
  532. @par
                   For non-MVE versions, pre-initialized data structures containing twiddle factors
                   and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
                   this header in your function and then pass the address of one of the constant structures as
                   an argument to arm_cfft_f32. For example:
  @par
                   <code>arm_cfft_f32(&arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
  539. @par
  540. computes a 64-point inverse complex FFT including bit reversal.
  541. The data structures are treated as constant data and not modified during the
  542. calculation. The same data structure can be reused for multiple transforms
  543. including mixing forward and inverse transforms.
  544. @par
  545. Earlier releases of the library provided separate radix-2 and radix-4
  546. algorithms that operated on floating-point data. These functions are still
  547. provided but are deprecated. The older functions are slower and less general
  548. than the new functions.
  549. @par
  550. An example of initialization of the constants for the arm_cfft_f32 function follows:
  551. @code
  552. const static arm_cfft_instance_f32 *S;
  553. ...
  554. switch (length) {
  555. case 16:
  556. S = &arm_cfft_sR_f32_len16;
  557. break;
  558. case 32:
  559. S = &arm_cfft_sR_f32_len32;
  560. break;
  561. case 64:
  562. S = &arm_cfft_sR_f32_len64;
  563. break;
  564. case 128:
  565. S = &arm_cfft_sR_f32_len128;
  566. break;
  567. case 256:
  568. S = &arm_cfft_sR_f32_len256;
  569. break;
  570. case 512:
  571. S = &arm_cfft_sR_f32_len512;
  572. break;
  573. case 1024:
  574. S = &arm_cfft_sR_f32_len1024;
  575. break;
  576. case 2048:
  577. S = &arm_cfft_sR_f32_len2048;
  578. break;
  579. case 4096:
  580. S = &arm_cfft_sR_f32_len4096;
  581. break;
  582. }
  583. @endcode
  584. @par
  585. The new arm_cfft_init_f32 can also be used.
  586. @par Q15 and Q31
                   The fixed-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
                   stages are performed along with a single radix-2 stage, as needed.
                   The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
                   a different twiddle factor table.
  591. @par
  592. The function uses the standard FFT definition and output values may grow by a
  593. factor of <code>fftLen</code> when computing the forward transform. The
  594. inverse transform includes a scale of <code>1/fftLen</code> as part of the
  595. calculation and this matches the textbook definition of the inverse FFT.
  596. @par
                   Pre-initialized data structures containing twiddle factors and bit reversal
                   tables are provided and defined in <code>arm_const_structs.h</code>. Include
                   this header in your function and then pass the address of one of the constant structures as
                   an argument to arm_cfft_q31. For example:
  @par
                   <code>arm_cfft_q31(&arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
  603. @par
  604. computes a 64-point inverse complex FFT including bit reversal.
  605. The data structures are treated as constant data and not modified during the
  606. calculation. The same data structure can be reused for multiple transforms
  607. including mixing forward and inverse transforms.
  608. @par
                   Earlier releases of the library provided separate radix-2 and radix-4
                   algorithms that operated on fixed-point data. These functions are still
                   provided but are deprecated. The older functions are slower and less general
                   than the new functions.
  613. @par
  614. An example of initialization of the constants for the arm_cfft_q31 function follows:
  615. @code
  616. const static arm_cfft_instance_q31 *S;
  617. ...
  618. switch (length) {
  619. case 16:
  620. S = &arm_cfft_sR_q31_len16;
  621. break;
  622. case 32:
  623. S = &arm_cfft_sR_q31_len32;
  624. break;
  625. case 64:
  626. S = &arm_cfft_sR_q31_len64;
  627. break;
  628. case 128:
  629. S = &arm_cfft_sR_q31_len128;
  630. break;
  631. case 256:
  632. S = &arm_cfft_sR_q31_len256;
  633. break;
  634. case 512:
  635. S = &arm_cfft_sR_q31_len512;
  636. break;
  637. case 1024:
  638. S = &arm_cfft_sR_q31_len1024;
  639. break;
  640. case 2048:
  641. S = &arm_cfft_sR_q31_len2048;
  642. break;
  643. case 4096:
  644. S = &arm_cfft_sR_q31_len4096;
  645. break;
  646. }
  647. @endcode
  648. */
  649. void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
  650. {
  651. uint32_t L = S->fftLen;
  652. float32_t * pCol1, * pCol2, * pMid1, * pMid2;
  653. float32_t * p2 = p1 + L;
  654. const float32_t * tw = (float32_t *) S->pTwiddle;
  655. float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
  656. float32_t m0, m1, m2, m3;
  657. uint32_t l;
  658. pCol1 = p1;
  659. pCol2 = p2;
  660. /* Define new length */
  661. L >>= 1;
  662. /* Initialize mid pointers */
  663. pMid1 = p1 + L;
  664. pMid2 = p2 + L;
  665. /* do two dot Fourier transform */
  666. for (l = L >> 2; l > 0; l-- )
  667. {
  668. t1[0] = p1[0];
  669. t1[1] = p1[1];
  670. t1[2] = p1[2];
  671. t1[3] = p1[3];
  672. t2[0] = p2[0];
  673. t2[1] = p2[1];
  674. t2[2] = p2[2];
  675. t2[3] = p2[3];
  676. t3[0] = pMid1[0];
  677. t3[1] = pMid1[1];
  678. t3[2] = pMid1[2];
  679. t3[3] = pMid1[3];
  680. t4[0] = pMid2[0];
  681. t4[1] = pMid2[1];
  682. t4[2] = pMid2[2];
  683. t4[3] = pMid2[3];
  684. *p1++ = t1[0] + t2[0];
  685. *p1++ = t1[1] + t2[1];
  686. *p1++ = t1[2] + t2[2];
  687. *p1++ = t1[3] + t2[3]; /* col 1 */
  688. t2[0] = t1[0] - t2[0];
  689. t2[1] = t1[1] - t2[1];
  690. t2[2] = t1[2] - t2[2];
  691. t2[3] = t1[3] - t2[3]; /* for col 2 */
  692. *pMid1++ = t3[0] + t4[0];
  693. *pMid1++ = t3[1] + t4[1];
  694. *pMid1++ = t3[2] + t4[2];
  695. *pMid1++ = t3[3] + t4[3]; /* col 1 */
  696. t4[0] = t4[0] - t3[0];
  697. t4[1] = t4[1] - t3[1];
  698. t4[2] = t4[2] - t3[2];
  699. t4[3] = t4[3] - t3[3]; /* for col 2 */
  700. twR = *tw++;
  701. twI = *tw++;
  702. /* multiply by twiddle factors */
  703. m0 = t2[0] * twR;
  704. m1 = t2[1] * twI;
  705. m2 = t2[1] * twR;
  706. m3 = t2[0] * twI;
  707. /* R = R * Tr - I * Ti */
  708. *p2++ = m0 + m1;
  709. /* I = I * Tr + R * Ti */
  710. *p2++ = m2 - m3;
  711. /* use vertical symmetry */
  712. /* 0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
  713. m0 = t4[0] * twI;
  714. m1 = t4[1] * twR;
  715. m2 = t4[1] * twI;
  716. m3 = t4[0] * twR;
  717. *pMid2++ = m0 - m1;
  718. *pMid2++ = m2 + m3;
  719. twR = *tw++;
  720. twI = *tw++;
  721. m0 = t2[2] * twR;
  722. m1 = t2[3] * twI;
  723. m2 = t2[3] * twR;
  724. m3 = t2[2] * twI;
  725. *p2++ = m0 + m1;
  726. *p2++ = m2 - m3;
  727. m0 = t4[2] * twI;
  728. m1 = t4[3] * twR;
  729. m2 = t4[3] * twI;
  730. m3 = t4[2] * twR;
  731. *pMid2++ = m0 - m1;
  732. *pMid2++ = m2 + m3;
  733. }
  734. /* first col */
  735. arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);
  736. /* second col */
  737. arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
  738. }
  739. void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
  740. {
  741. uint32_t L = S->fftLen >> 1;
  742. float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
  743. const float32_t *tw2, *tw3, *tw4;
  744. float32_t * p2 = p1 + L;
  745. float32_t * p3 = p2 + L;
  746. float32_t * p4 = p3 + L;
  747. float32_t t2[4], t3[4], t4[4], twR, twI;
  748. float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
  749. float32_t m0, m1, m2, m3;
  750. uint32_t l, twMod2, twMod3, twMod4;
  751. pCol1 = p1; /* points to real values by default */
  752. pCol2 = p2;
  753. pCol3 = p3;
  754. pCol4 = p4;
  755. pEnd1 = p2 - 1; /* points to imaginary values by default */
  756. pEnd2 = p3 - 1;
  757. pEnd3 = p4 - 1;
  758. pEnd4 = pEnd3 + L;
  759. tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;
  760. L >>= 1;
  761. /* do four dot Fourier transform */
  762. twMod2 = 2;
  763. twMod3 = 4;
  764. twMod4 = 6;
  765. /* TOP */
  766. p1ap3_0 = p1[0] + p3[0];
  767. p1sp3_0 = p1[0] - p3[0];
  768. p1ap3_1 = p1[1] + p3[1];
  769. p1sp3_1 = p1[1] - p3[1];
  770. /* col 2 */
  771. t2[0] = p1sp3_0 + p2[1] - p4[1];
  772. t2[1] = p1sp3_1 - p2[0] + p4[0];
  773. /* col 3 */
  774. t3[0] = p1ap3_0 - p2[0] - p4[0];
  775. t3[1] = p1ap3_1 - p2[1] - p4[1];
  776. /* col 4 */
  777. t4[0] = p1sp3_0 - p2[1] + p4[1];
  778. t4[1] = p1sp3_1 + p2[0] - p4[0];
  779. /* col 1 */
  780. *p1++ = p1ap3_0 + p2[0] + p4[0];
  781. *p1++ = p1ap3_1 + p2[1] + p4[1];
  782. /* Twiddle factors are ones */
  783. *p2++ = t2[0];
  784. *p2++ = t2[1];
  785. *p3++ = t3[0];
  786. *p3++ = t3[1];
  787. *p4++ = t4[0];
  788. *p4++ = t4[1];
  789. tw2 += twMod2;
  790. tw3 += twMod3;
  791. tw4 += twMod4;
  792. for (l = (L - 2) >> 1; l > 0; l-- )
  793. {
  794. /* TOP */
  795. p1ap3_0 = p1[0] + p3[0];
  796. p1sp3_0 = p1[0] - p3[0];
  797. p1ap3_1 = p1[1] + p3[1];
  798. p1sp3_1 = p1[1] - p3[1];
  799. /* col 2 */
  800. t2[0] = p1sp3_0 + p2[1] - p4[1];
  801. t2[1] = p1sp3_1 - p2[0] + p4[0];
  802. /* col 3 */
  803. t3[0] = p1ap3_0 - p2[0] - p4[0];
  804. t3[1] = p1ap3_1 - p2[1] - p4[1];
  805. /* col 4 */
  806. t4[0] = p1sp3_0 - p2[1] + p4[1];
  807. t4[1] = p1sp3_1 + p2[0] - p4[0];
  808. /* col 1 - top */
  809. *p1++ = p1ap3_0 + p2[0] + p4[0];
  810. *p1++ = p1ap3_1 + p2[1] + p4[1];
  811. /* BOTTOM */
  812. p1ap3_1 = pEnd1[-1] + pEnd3[-1];
  813. p1sp3_1 = pEnd1[-1] - pEnd3[-1];
  814. p1ap3_0 = pEnd1[ 0] + pEnd3[0];
  815. p1sp3_0 = pEnd1[ 0] - pEnd3[0];
  816. /* col 2 */
  817. t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
  818. t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
  819. /* col 3 */
  820. t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
  821. t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
  822. /* col 4 */
  823. t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
  824. t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
  825. /* col 1 - Bottom */
  826. *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
  827. *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];
  828. /* COL 2 */
  829. /* read twiddle factors */
  830. twR = *tw2++;
  831. twI = *tw2++;
  832. /* multiply by twiddle factors */
  833. /* let Z1 = a + i(b), Z2 = c + i(d) */
  834. /* => Z1 * Z2 = (a*c - b*d) + i(b*c + a*d) */
  835. /* Top */
  836. m0 = t2[0] * twR;
  837. m1 = t2[1] * twI;
  838. m2 = t2[1] * twR;
  839. m3 = t2[0] * twI;
  840. *p2++ = m0 + m1;
  841. *p2++ = m2 - m3;
  842. /* use vertical symmetry col 2 */
  843. /* 0.9997 - 0.0245i <==> 0.0245 - 0.9997i */
  844. /* Bottom */
  845. m0 = t2[3] * twI;
  846. m1 = t2[2] * twR;
  847. m2 = t2[2] * twI;
  848. m3 = t2[3] * twR;
  849. *pEnd2-- = m0 - m1;
  850. *pEnd2-- = m2 + m3;
  851. /* COL 3 */
  852. twR = tw3[0];
  853. twI = tw3[1];
  854. tw3 += twMod3;
  855. /* Top */
  856. m0 = t3[0] * twR;
  857. m1 = t3[1] * twI;
  858. m2 = t3[1] * twR;
  859. m3 = t3[0] * twI;
  860. *p3++ = m0 + m1;
  861. *p3++ = m2 - m3;
  862. /* use vertical symmetry col 3 */
  863. /* 0.9988 - 0.0491i <==> -0.9988 - 0.0491i */
  864. /* Bottom */
  865. m0 = -t3[3] * twR;
  866. m1 = t3[2] * twI;
  867. m2 = t3[2] * twR;
  868. m3 = t3[3] * twI;
  869. *pEnd3-- = m0 - m1;
  870. *pEnd3-- = m3 - m2;
  871. /* COL 4 */
  872. twR = tw4[0];
  873. twI = tw4[1];
  874. tw4 += twMod4;
  875. /* Top */
  876. m0 = t4[0] * twR;
  877. m1 = t4[1] * twI;
  878. m2 = t4[1] * twR;
  879. m3 = t4[0] * twI;
  880. *p4++ = m0 + m1;
  881. *p4++ = m2 - m3;
  882. /* use vertical symmetry col 4 */
  883. /* 0.9973 - 0.0736i <==> -0.0736 + 0.9973i */
  884. /* Bottom */
  885. m0 = t4[3] * twI;
  886. m1 = t4[2] * twR;
  887. m2 = t4[2] * twI;
  888. m3 = t4[3] * twR;
  889. *pEnd4-- = m0 - m1;
  890. *pEnd4-- = m2 + m3;
  891. }
  892. /* MIDDLE */
  893. /* Twiddle factors are */
  894. /* 1.0000 0.7071-0.7071i -1.0000i -0.7071-0.7071i */
  895. p1ap3_0 = p1[0] + p3[0];
  896. p1sp3_0 = p1[0] - p3[0];
  897. p1ap3_1 = p1[1] + p3[1];
  898. p1sp3_1 = p1[1] - p3[1];
  899. /* col 2 */
  900. t2[0] = p1sp3_0 + p2[1] - p4[1];
  901. t2[1] = p1sp3_1 - p2[0] + p4[0];
  902. /* col 3 */
  903. t3[0] = p1ap3_0 - p2[0] - p4[0];
  904. t3[1] = p1ap3_1 - p2[1] - p4[1];
  905. /* col 4 */
  906. t4[0] = p1sp3_0 - p2[1] + p4[1];
  907. t4[1] = p1sp3_1 + p2[0] - p4[0];
  908. /* col 1 - Top */
  909. *p1++ = p1ap3_0 + p2[0] + p4[0];
  910. *p1++ = p1ap3_1 + p2[1] + p4[1];
  911. /* COL 2 */
  912. twR = tw2[0];
  913. twI = tw2[1];
  914. m0 = t2[0] * twR;
  915. m1 = t2[1] * twI;
  916. m2 = t2[1] * twR;
  917. m3 = t2[0] * twI;
  918. *p2++ = m0 + m1;
  919. *p2++ = m2 - m3;
  920. /* COL 3 */
  921. twR = tw3[0];
  922. twI = tw3[1];
  923. m0 = t3[0] * twR;
  924. m1 = t3[1] * twI;
  925. m2 = t3[1] * twR;
  926. m3 = t3[0] * twI;
  927. *p3++ = m0 + m1;
  928. *p3++ = m2 - m3;
  929. /* COL 4 */
  930. twR = tw4[0];
  931. twI = tw4[1];
  932. m0 = t4[0] * twR;
  933. m1 = t4[1] * twI;
  934. m2 = t4[1] * twR;
  935. m3 = t4[0] * twI;
  936. *p4++ = m0 + m1;
  937. *p4++ = m2 - m3;
  938. /* first col */
  939. arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);
  940. /* second col */
  941. arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);
  942. /* third col */
  943. arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);
  944. /* fourth col */
  945. arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
  946. }
  947. /**
  948. @addtogroup ComplexFFT
  949. @{
  950. */
  951. /**
  952. @brief Processing function for the floating-point complex FFT.
  953. @param[in] S points to an instance of the floating-point CFFT structure
  954. @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  955. @param[in] ifftFlag flag that selects transform direction
  956. - value = 0: forward transform
  957. - value = 1: inverse transform
  958. @param[in] bitReverseFlag flag that enables / disables bit reversal of output
  959. - value = 0: disables bit reversal of output
  960. - value = 1: enables bit reversal of output
  961. @return none
  962. */
  963. void arm_cfft_f32(
  964. const arm_cfft_instance_f32 * S,
  965. float32_t * p1,
  966. uint8_t ifftFlag,
  967. uint8_t bitReverseFlag)
  968. {
  969. uint32_t L = S->fftLen, l;
  970. float32_t invL, * pSrc;
  971. if (ifftFlag == 1U)
  972. {
  973. /* Conjugate input data */
  974. pSrc = p1 + 1;
  975. for (l = 0; l < L; l++)
  976. {
  977. *pSrc = -*pSrc;
  978. pSrc += 2;
  979. }
  980. }
  981. switch (L)
  982. {
  983. case 16:
  984. case 128:
  985. case 1024:
  986. arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
  987. break;
  988. case 32:
  989. case 256:
  990. case 2048:
  991. arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
  992. break;
  993. case 64:
  994. case 512:
  995. case 4096:
  996. arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
  997. break;
  998. }
  999. if ( bitReverseFlag )
  1000. arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
  1001. if (ifftFlag == 1U)
  1002. {
  1003. invL = 1.0f / (float32_t)L;
  1004. /* Conjugate and scale output data */
  1005. pSrc = p1;
  1006. for (l= 0; l < L; l++)
  1007. {
  1008. *pSrc++ *= invL ;
  1009. *pSrc = -(*pSrc) * invL;
  1010. pSrc++;
  1011. }
  1012. }
  1013. }
  1014. #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
  1015. /**
  1016. @} end of ComplexFFT group
  1017. */