arm_cfft_f16.c 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902
  1. /* ----------------------------------------------------------------------
  2. * Project: CMSIS DSP Library
  3. * Title: arm_cfft_f16.c
  4. * Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
  5. *
  6. * $Date: 18. March 2019
  7. * $Revision: V1.6.0
  8. *
  9. * Target Processor: Cortex-M cores
  10. * -------------------------------------------------------------------- */
  11. /*
  12. * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
  13. *
  14. * SPDX-License-Identifier: Apache-2.0
  15. *
  16. * Licensed under the Apache License, Version 2.0 (the License); you may
  17. * not use this file except in compliance with the License.
  18. * You may obtain a copy of the License at
  19. *
  20. * www.apache.org/licenses/LICENSE-2.0
  21. *
  22. * Unless required by applicable law or agreed to in writing, software
  23. * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  24. * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  25. * See the License for the specific language governing permissions and
  26. * limitations under the License.
  27. */
  28. #include "dsp/transform_functions_f16.h"
  29. #include "arm_common_tables_f16.h"
  30. #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
  31. #include "arm_helium_utils.h"
  32. #include "arm_vec_fft.h"
  33. #include "arm_mve_tables_f16.h"
  34. static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
  35. {
  36. float16_t retValue=1.0;
  37. switch (fftLen)
  38. {
  39. case 4096U:
  40. retValue = (float16_t)0.000244140625f;
  41. break;
  42. case 2048U:
  43. retValue = (float16_t)0.00048828125f;
  44. break;
  45. case 1024U:
  46. retValue = (float16_t)0.0009765625f;
  47. break;
  48. case 512U:
  49. retValue = (float16_t)0.001953125f;
  50. break;
  51. case 256U:
  52. retValue = (float16_t)0.00390625f;
  53. break;
  54. case 128U:
  55. retValue = (float16_t)0.0078125f;
  56. break;
  57. case 64U:
  58. retValue = (float16_t)0.015625f;
  59. break;
  60. case 32U:
  61. retValue = (float16_t)0.03125f;
  62. break;
  63. case 16U:
  64. retValue = (float16_t)0.0625f;
  65. break;
  66. default:
  67. break;
  68. }
  69. return(retValue);
  70. }
  71. static void arm_bitreversal_f16_inpl_mve(
  72. uint16_t *pSrc,
  73. const uint16_t bitRevLen,
  74. const uint16_t *pBitRevTab)
  75. {
  76. uint32_t *src = (uint32_t *)pSrc;
  77. uint32_t blkCnt; /* loop counters */
  78. uint32x4_t bitRevTabOff;
  79. uint16x8_t one = vdupq_n_u16(1);
  80. blkCnt = (bitRevLen / 2) / 4;
  81. while (blkCnt > 0U) {
  82. bitRevTabOff = vldrhq_u16(pBitRevTab);
  83. pBitRevTab += 8;
  84. uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
  85. uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
  86. bitRevOff1 = bitRevOff1 >> 3;
  87. bitRevOff2 = bitRevOff2 >> 3;
  88. uint32x4_t in1 = vldrwq_gather_shifted_offset_u32(src, bitRevOff1);
  89. uint32x4_t in2 = vldrwq_gather_shifted_offset_u32(src, bitRevOff2);
  90. vstrwq_scatter_shifted_offset_u32(src, bitRevOff1, in2);
  91. vstrwq_scatter_shifted_offset_u32(src, bitRevOff2, in1);
  92. /*
  93. * Decrement the blockSize loop counter
  94. */
  95. blkCnt--;
  96. }
  97. /*
  98. * tail
  99. * (will be merged thru tail predication)
  100. */
  101. blkCnt = bitRevLen & 7;
  102. if (blkCnt > 0U) {
  103. mve_pred16_t p0 = vctp16q(blkCnt);
  104. bitRevTabOff = vldrhq_z_u16(pBitRevTab, p0);
  105. uint32x4_t bitRevOff1 = vmullbq_int_u16(bitRevTabOff, one);
  106. uint32x4_t bitRevOff2 = vmulltq_int_u16(bitRevTabOff, one);
  107. bitRevOff1 = bitRevOff1 >> 3;
  108. bitRevOff2 = bitRevOff2 >> 3;
  109. uint32x4_t in1 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff1, p0);
  110. uint32x4_t in2 = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff2, p0);
  111. vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff1, in2, p0);
  112. vstrwq_scatter_shifted_offset_p_u32(src, bitRevOff2, in1, p0);
  113. }
  114. }
  115. static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen)
  116. {
  117. f16x8_t vecTmp0, vecTmp1;
  118. f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
  119. f16x8_t vecA, vecB, vecC, vecD;
  120. uint32_t blkCnt;
  121. uint32_t n1, n2;
  122. uint32_t stage = 0;
  123. int32_t iter = 1;
  124. static const uint32_t strides[4] =
  125. {(0 - 16) * sizeof(float16_t *)
  126. , (4 - 16) * sizeof(float16_t *)
  127. , (8 - 16) * sizeof(float16_t *)
  128. , (12 - 16) * sizeof(float16_t *)};
  129. n2 = fftLen;
  130. n1 = n2;
  131. n2 >>= 2u;
  132. for (int k = fftLen / 4u; k > 1; k >>= 2)
  133. {
  134. for (int i = 0; i < iter; i++)
  135. {
  136. float16_t const *p_rearranged_twiddle_tab_stride1 =
  137. &S->rearranged_twiddle_stride1[
  138. S->rearranged_twiddle_tab_stride1_arr[stage]];
  139. float16_t const *p_rearranged_twiddle_tab_stride2 =
  140. &S->rearranged_twiddle_stride2[
  141. S->rearranged_twiddle_tab_stride2_arr[stage]];
  142. float16_t const *p_rearranged_twiddle_tab_stride3 =
  143. &S->rearranged_twiddle_stride3[
  144. S->rearranged_twiddle_tab_stride3_arr[stage]];
  145. float16_t const *pW1, *pW2, *pW3;
  146. float16_t *inA = pSrc + CMPLX_DIM * i * n1;
  147. float16_t *inB = inA + n2 * CMPLX_DIM;
  148. float16_t *inC = inB + n2 * CMPLX_DIM;
  149. float16_t *inD = inC + n2 * CMPLX_DIM;
  150. f16x8_t vecW;
  151. pW1 = p_rearranged_twiddle_tab_stride1;
  152. pW2 = p_rearranged_twiddle_tab_stride2;
  153. pW3 = p_rearranged_twiddle_tab_stride3;
  154. blkCnt = n2 / 4;
  155. /*
  156. * load 2 f16 complex pair
  157. */
  158. vecA = vldrhq_f16(inA);
  159. vecC = vldrhq_f16(inC);
  160. while (blkCnt > 0U)
  161. {
  162. vecB = vldrhq_f16(inB);
  163. vecD = vldrhq_f16(inD);
  164. vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
  165. vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
  166. vecSum1 = vecB + vecD;
  167. vecDiff1 = vecB - vecD;
  168. /*
  169. * [ 1 1 1 1 ] * [ A B C D ]' .* 1
  170. */
  171. vecTmp0 = vecSum0 + vecSum1;
  172. vst1q(inA, vecTmp0);
  173. inA += 8;
  174. /*
  175. * [ 1 -1 1 -1 ] * [ A B C D ]'
  176. */
  177. vecTmp0 = vecSum0 - vecSum1;
  178. /*
  179. * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
  180. */
  181. vecW = vld1q(pW2);
  182. pW2 += 8;
  183. vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
  184. vst1q(inB, vecTmp1);
  185. inB += 8;
  186. /*
  187. * [ 1 -i -1 +i ] * [ A B C D ]'
  188. */
  189. vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
  190. /*
  191. * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
  192. */
  193. vecW = vld1q(pW1);
  194. pW1 +=8;
  195. vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
  196. vst1q(inC, vecTmp1);
  197. inC += 8;
  198. /*
  199. * [ 1 +i -1 -i ] * [ A B C D ]'
  200. */
  201. vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
  202. /*
  203. * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
  204. */
  205. vecW = vld1q(pW3);
  206. pW3 += 8;
  207. vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
  208. vst1q(inD, vecTmp1);
  209. inD += 8;
  210. vecA = vldrhq_f16(inA);
  211. vecC = vldrhq_f16(inC);
  212. blkCnt--;
  213. }
  214. }
  215. n1 = n2;
  216. n2 >>= 2u;
  217. iter = iter << 2;
  218. stage++;
  219. }
  220. /*
  221. * start of Last stage process
  222. */
  223. uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
  224. vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
  225. /* load scheduling */
  226. vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
  227. vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
  228. blkCnt = (fftLen >> 4);
  229. while (blkCnt > 0U)
  230. {
  231. vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
  232. vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
  233. vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
  234. vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);
  235. vecSum1 = vecB + vecD;
  236. vecDiff1 = vecB - vecD;
  237. /* pre-load for next iteration */
  238. vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
  239. vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
  240. vecTmp0 = vecSum0 + vecSum1;
  241. vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);
  242. vecTmp0 = vecSum0 - vecSum1;
  243. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);
  244. vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
  245. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);
  246. vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
  247. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);
  248. blkCnt--;
  249. }
  250. /*
  251. * End of last stage process
  252. */
  253. }
  254. static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
  255. {
  256. float16_t const *pCoefVec;
  257. float16_t const *pCoef = S->pTwiddle;
  258. float16_t *pIn0, *pIn1;
  259. uint32_t n2;
  260. uint32_t blkCnt;
  261. f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
  262. f16x8_t vecCmplxTmp, vecTw;
  263. n2 = fftLen >> 1;
  264. pIn0 = pSrc;
  265. pIn1 = pSrc + fftLen;
  266. pCoefVec = pCoef;
  267. blkCnt = n2 / 4;
  268. while (blkCnt > 0U)
  269. {
  270. vecIn0 = *(f16x8_t *) pIn0;
  271. vecIn1 = *(f16x8_t *) pIn1;
  272. vecTw = vld1q(pCoefVec);
  273. pCoefVec += 8;
  274. vecSum = vaddq(vecIn0, vecIn1);
  275. vecDiff = vsubq(vecIn0, vecIn1);
  276. vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
  277. vst1q(pIn0, vecSum);
  278. pIn0 += 8;
  279. vst1q(pIn1, vecCmplxTmp);
  280. pIn1 += 8;
  281. blkCnt--;
  282. }
  283. _arm_radix4_butterfly_f16_mve(S, pSrc, n2);
  284. _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);
  285. pIn0 = pSrc;
  286. }
  287. static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen)
  288. {
  289. f16x8_t vecTmp0, vecTmp1;
  290. f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
  291. f16x8_t vecA, vecB, vecC, vecD;
  292. f16x8_t vecW;
  293. uint32_t blkCnt;
  294. uint32_t n1, n2;
  295. uint32_t stage = 0;
  296. int32_t iter = 1;
  297. static const uint32_t strides[4] = {
  298. (0 - 16) * sizeof(q31_t *),
  299. (4 - 16) * sizeof(q31_t *),
  300. (8 - 16) * sizeof(q31_t *),
  301. (12 - 16) * sizeof(q31_t *)
  302. };
  303. n2 = fftLen;
  304. n1 = n2;
  305. n2 >>= 2u;
  306. for (int k = fftLen / 4; k > 1; k >>= 2)
  307. {
  308. for (int i = 0; i < iter; i++)
  309. {
  310. float16_t const *p_rearranged_twiddle_tab_stride1 =
  311. &S->rearranged_twiddle_stride1[
  312. S->rearranged_twiddle_tab_stride1_arr[stage]];
  313. float16_t const *p_rearranged_twiddle_tab_stride2 =
  314. &S->rearranged_twiddle_stride2[
  315. S->rearranged_twiddle_tab_stride2_arr[stage]];
  316. float16_t const *p_rearranged_twiddle_tab_stride3 =
  317. &S->rearranged_twiddle_stride3[
  318. S->rearranged_twiddle_tab_stride3_arr[stage]];
  319. float16_t const *pW1, *pW2, *pW3;
  320. float16_t *inA = pSrc + CMPLX_DIM * i * n1;
  321. float16_t *inB = inA + n2 * CMPLX_DIM;
  322. float16_t *inC = inB + n2 * CMPLX_DIM;
  323. float16_t *inD = inC + n2 * CMPLX_DIM;
  324. pW1 = p_rearranged_twiddle_tab_stride1;
  325. pW2 = p_rearranged_twiddle_tab_stride2;
  326. pW3 = p_rearranged_twiddle_tab_stride3;
  327. blkCnt = n2 / 4;
  328. /*
  329. * load 2 f32 complex pair
  330. */
  331. vecA = vldrhq_f16(inA);
  332. vecC = vldrhq_f16(inC);
  333. while (blkCnt > 0U)
  334. {
  335. vecB = vldrhq_f16(inB);
  336. vecD = vldrhq_f16(inD);
  337. vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
  338. vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
  339. vecSum1 = vecB + vecD;
  340. vecDiff1 = vecB - vecD;
  341. /*
  342. * [ 1 1 1 1 ] * [ A B C D ]' .* 1
  343. */
  344. vecTmp0 = vecSum0 + vecSum1;
  345. vst1q(inA, vecTmp0);
  346. inA += 8;
  347. /*
  348. * [ 1 -1 1 -1 ] * [ A B C D ]'
  349. */
  350. vecTmp0 = vecSum0 - vecSum1;
  351. /*
  352. * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
  353. */
  354. vecW = vld1q(pW2);
  355. pW2 += 8;
  356. vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
  357. vst1q(inB, vecTmp1);
  358. inB += 8;
  359. /*
  360. * [ 1 -i -1 +i ] * [ A B C D ]'
  361. */
  362. vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
  363. /*
  364. * [ 1 -i -1 +i ] * [ A B C D ]'.* W2
  365. */
  366. vecW = vld1q(pW1);
  367. pW1 += 8;
  368. vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
  369. vst1q(inC, vecTmp1);
  370. inC += 8;
  371. /*
  372. * [ 1 +i -1 -i ] * [ A B C D ]'
  373. */
  374. vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
  375. /*
  376. * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
  377. */
  378. vecW = vld1q(pW3);
  379. pW3 += 8;
  380. vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
  381. vst1q(inD, vecTmp1);
  382. inD += 8;
  383. vecA = vldrhq_f16(inA);
  384. vecC = vldrhq_f16(inC);
  385. blkCnt--;
  386. }
  387. }
  388. n1 = n2;
  389. n2 >>= 2u;
  390. iter = iter << 2;
  391. stage++;
  392. }
  393. /*
  394. * start of Last stage process
  395. */
  396. uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
  397. vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
  398. /*
  399. * load scheduling
  400. */
  401. vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
  402. vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
  403. blkCnt = (fftLen >> 4);
  404. while (blkCnt > 0U)
  405. {
  406. vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
  407. vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
  408. vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
  409. vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);
  410. vecSum1 = vecB + vecD;
  411. vecDiff1 = vecB - vecD;
  412. vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
  413. vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
  414. vecTmp0 = vecSum0 + vecSum1;
  415. vecTmp0 = vecTmp0 * onebyfftLen;
  416. vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);
  417. vecTmp0 = vecSum0 - vecSum1;
  418. vecTmp0 = vecTmp0 * onebyfftLen;
  419. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);
  420. vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
  421. vecTmp0 = vecTmp0 * onebyfftLen;
  422. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);
  423. vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
  424. vecTmp0 = vecTmp0 * onebyfftLen;
  425. vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);
  426. blkCnt--;
  427. }
  428. /*
  429. * End of last stage process
  430. */
  431. }
  432. static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t *pSrc, uint32_t fftLen)
  433. {
  434. float16_t const *pCoefVec;
  435. float16_t const *pCoef = S->pTwiddle;
  436. float16_t *pIn0, *pIn1;
  437. uint32_t n2;
  438. float16_t onebyfftLen = arm_inverse_fft_length_f16(fftLen);
  439. uint32_t blkCnt;
  440. f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
  441. f16x8_t vecCmplxTmp, vecTw;
  442. n2 = fftLen >> 1;
  443. pIn0 = pSrc;
  444. pIn1 = pSrc + fftLen;
  445. pCoefVec = pCoef;
  446. blkCnt = n2 / 4;
  447. while (blkCnt > 0U)
  448. {
  449. vecIn0 = *(f16x8_t *) pIn0;
  450. vecIn1 = *(f16x8_t *) pIn1;
  451. vecTw = vld1q(pCoefVec);
  452. pCoefVec += 8;
  453. vecSum = vaddq(vecIn0, vecIn1);
  454. vecDiff = vsubq(vecIn0, vecIn1);
  455. vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
  456. vst1q(pIn0, vecSum);
  457. pIn0 += 8;
  458. vst1q(pIn1, vecCmplxTmp);
  459. pIn1 += 8;
  460. blkCnt--;
  461. }
  462. _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen);
  463. _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen);
  464. }
  465. /**
  466. @addtogroup ComplexFFT
  467. @{
  468. */
  469. /**
  470. @brief Processing function for the floating-point complex FFT.
  471. @param[in] S points to an instance of the floating-point CFFT structure
  472. @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  473. @param[in] ifftFlag flag that selects transform direction
  474. - value = 0: forward transform
  475. - value = 1: inverse transform
  476. @param[in] bitReverseFlag flag that enables / disables bit reversal of output
  477. - value = 0: disables bit reversal of output
  478. - value = 1: enables bit reversal of output
  479. @return none
  480. */
  481. void arm_cfft_f16(
  482. const arm_cfft_instance_f16 * S,
  483. float16_t * pSrc,
  484. uint8_t ifftFlag,
  485. uint8_t bitReverseFlag)
  486. {
  487. uint32_t fftLen = S->fftLen;
  488. if (ifftFlag == 1U) {
  489. switch (fftLen) {
  490. case 16:
  491. case 64:
  492. case 256:
  493. case 1024:
  494. case 4096:
  495. _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
  496. break;
  497. case 32:
  498. case 128:
  499. case 512:
  500. case 2048:
  501. arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
  502. break;
  503. }
  504. } else {
  505. switch (fftLen) {
  506. case 16:
  507. case 64:
  508. case 256:
  509. case 1024:
  510. case 4096:
  511. _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
  512. break;
  513. case 32:
  514. case 128:
  515. case 512:
  516. case 2048:
  517. arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
  518. break;
  519. }
  520. }
  521. if (bitReverseFlag)
  522. {
  523. arm_bitreversal_f16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
  524. }
  525. }
  526. #else
  527. #if defined(ARM_FLOAT16_SUPPORTED)
  528. extern void arm_bitreversal_16(
  529. uint16_t * pSrc,
  530. const uint16_t bitRevLen,
  531. const uint16_t * pBitRevTable);
  532. extern void arm_cfft_radix4by2_f16(
  533. float16_t * pSrc,
  534. uint32_t fftLen,
  535. const float16_t * pCoef);
  536. extern void arm_radix4_butterfly_f16(
  537. float16_t * pSrc,
  538. uint16_t fftLen,
  539. const float16_t * pCoef,
  540. uint16_t twidCoefModifier);
  541. /**
  542. @ingroup groupTransforms
  543. */
  544. /**
  545. @defgroup ComplexFFT Complex FFT Functions
  546. @par
  547. The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
  548. Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
  549. than the DFT, especially for long lengths.
  550. The algorithms described in this section
  551. operate on complex data. A separate set of functions is devoted to handling
  552. of real sequences.
  553. @par
  554. There are separate algorithms for handling floating-point, Q15, and Q31 data
  555. types. The algorithms available for each data type are described next.
  556. @par
  557. The FFT functions operate in-place. That is, the array holding the input data
  558. will also be used to hold the corresponding result. The input data is complex
  559. and contains <code>2*fftLen</code> interleaved values as shown below.
  560. <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
  561. The FFT result will be contained in the same array and the frequency domain
  562. values will have the same interleaving.
  563. @par Floating-point
  564. The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
  565. stages are performed along with a single radix-2 or radix-4 stage, as needed.
  566. The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
  567. a different twiddle factor table.
  568. @par
  569. The function uses the standard FFT definition and output values may grow by a
  570. factor of <code>fftLen</code> when computing the forward transform. The
  571. inverse transform includes a scale of <code>1/fftLen</code> as part of the
  572. calculation and this matches the textbook definition of the inverse FFT.
  573. @par
  574. For the MVE version, the new arm_cfft_init_f32 initialization function is
  575. <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
  576. needed FFTs.</b> Other FFT versions can continue to be initialized as
  577. explained below.
  578. @par
  579. For non-MVE versions, pre-initialized data structures containing twiddle factors
  580. and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
  581. this header in your function and then pass one of the constant structures as
  582. an argument to arm_cfft_f32. For example:
  583. @par
  584. <code>arm_cfft_f32(&arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
  585. @par
  586. computes a 64-point inverse complex FFT including bit reversal.
  587. The data structures are treated as constant data and not modified during the
  588. calculation. The same data structure can be reused for multiple transforms
  589. including mixing forward and inverse transforms.
  590. @par
  591. Earlier releases of the library provided separate radix-2 and radix-4
  592. algorithms that operated on floating-point data. These functions are still
  593. provided but are deprecated. The older functions are slower and less general
  594. than the new functions.
  595. @par
  596. An example of initialization of the constants for the arm_cfft_f32 function follows:
  597. @code
  598. const static arm_cfft_instance_f32 *S;
  599. ...
  600. switch (length) {
  601. case 16:
  602. S = &arm_cfft_sR_f32_len16;
  603. break;
  604. case 32:
  605. S = &arm_cfft_sR_f32_len32;
  606. break;
  607. case 64:
  608. S = &arm_cfft_sR_f32_len64;
  609. break;
  610. case 128:
  611. S = &arm_cfft_sR_f32_len128;
  612. break;
  613. case 256:
  614. S = &arm_cfft_sR_f32_len256;
  615. break;
  616. case 512:
  617. S = &arm_cfft_sR_f32_len512;
  618. break;
  619. case 1024:
  620. S = &arm_cfft_sR_f32_len1024;
  621. break;
  622. case 2048:
  623. S = &arm_cfft_sR_f32_len2048;
  624. break;
  625. case 4096:
  626. S = &arm_cfft_sR_f32_len4096;
  627. break;
  628. }
  629. @endcode
  630. @par
  631. The new arm_cfft_init_f32 can also be used.
  632. @par Q15 and Q31
  633. The fixed-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
  634. stages are performed along with a single radix-2 stage, as needed.
  635. The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
  636. a different twiddle factor table.
  637. @par
  638. The function uses the standard FFT definition and output values may grow by a
  639. factor of <code>fftLen</code> when computing the forward transform. The
  640. inverse transform includes a scale of <code>1/fftLen</code> as part of the
  641. calculation and this matches the textbook definition of the inverse FFT.
  642. @par
  643. Pre-initialized data structures containing twiddle factors and bit reversal
  644. tables are provided and defined in <code>arm_const_structs.h</code>. Include
  645. this header in your function and then pass one of the constant structures as
  646. an argument to arm_cfft_q31. For example:
  647. @par
  648. <code>arm_cfft_q31(&arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
  649. @par
  650. computes a 64-point inverse complex FFT including bit reversal.
  651. The data structures are treated as constant data and not modified during the
  652. calculation. The same data structure can be reused for multiple transforms
  653. including mixing forward and inverse transforms.
  654. @par
  655. Earlier releases of the library provided separate radix-2 and radix-4
  656. algorithms that operated on fixed-point data. These functions are still
  657. provided but are deprecated. The older functions are slower and less general
  658. than the new functions.
  659. @par
  660. An example of initialization of the constants for the arm_cfft_q31 function follows:
  661. @code
  662. const static arm_cfft_instance_q31 *S;
  663. ...
  664. switch (length) {
  665. case 16:
  666. S = &arm_cfft_sR_q31_len16;
  667. break;
  668. case 32:
  669. S = &arm_cfft_sR_q31_len32;
  670. break;
  671. case 64:
  672. S = &arm_cfft_sR_q31_len64;
  673. break;
  674. case 128:
  675. S = &arm_cfft_sR_q31_len128;
  676. break;
  677. case 256:
  678. S = &arm_cfft_sR_q31_len256;
  679. break;
  680. case 512:
  681. S = &arm_cfft_sR_q31_len512;
  682. break;
  683. case 1024:
  684. S = &arm_cfft_sR_q31_len1024;
  685. break;
  686. case 2048:
  687. S = &arm_cfft_sR_q31_len2048;
  688. break;
  689. case 4096:
  690. S = &arm_cfft_sR_q31_len4096;
  691. break;
  692. }
  693. @endcode
  694. */
  695. /**
  696. @addtogroup ComplexFFT
  697. @{
  698. */
  699. /**
  700. @brief Processing function for the floating-point complex FFT.
  701. @param[in] S points to an instance of the floating-point CFFT structure
  702. @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  703. @param[in] ifftFlag flag that selects transform direction
  704. - value = 0: forward transform
  705. - value = 1: inverse transform
  706. @param[in] bitReverseFlag flag that enables / disables bit reversal of output
  707. - value = 0: disables bit reversal of output
  708. - value = 1: enables bit reversal of output
  709. @return none
  710. */
  711. void arm_cfft_f16(
  712. const arm_cfft_instance_f16 * S,
  713. float16_t * p1,
  714. uint8_t ifftFlag,
  715. uint8_t bitReverseFlag)
  716. {
  717. uint32_t L = S->fftLen, l;
  718. float16_t invL, * pSrc;
  719. if (ifftFlag == 1U)
  720. {
  721. /* Conjugate input data */
  722. pSrc = p1 + 1;
  723. for(l=0; l<L; l++)
  724. {
  725. *pSrc = -*pSrc;
  726. pSrc += 2;
  727. }
  728. }
  729. switch (L)
  730. {
  731. case 16:
  732. case 64:
  733. case 256:
  734. case 1024:
  735. case 4096:
  736. arm_radix4_butterfly_f16 (p1, L, (float16_t*)S->pTwiddle, 1U);
  737. break;
  738. case 32:
  739. case 128:
  740. case 512:
  741. case 2048:
  742. arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle);
  743. break;
  744. }
  745. if ( bitReverseFlag )
  746. arm_bitreversal_16((uint16_t*)p1, S->bitRevLength,(uint16_t*)S->pBitRevTable);
  747. if (ifftFlag == 1U)
  748. {
  749. invL = 1.0f/(float16_t)L;
  750. /* Conjugate and scale output data */
  751. pSrc = p1;
  752. for(l=0; l<L; l++)
  753. {
  754. *pSrc++ *= invL ;
  755. *pSrc = -(*pSrc) * invL;
  756. pSrc++;
  757. }
  758. }
  759. }
  760. #endif /* if defined(ARM_FLOAT16_SUPPORTED) */
  761. #endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
  762. /**
  763. @} end of ComplexFFT group
  764. */