/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cfft_q31.c
 * Description:  Combined Radix Decimation in Frequency CFFT fixed point processing function
 *
 * $Date:        18. March 2019
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "arm_math.h"

#if defined(ARM_MATH_MVEI)

#include "arm_vec_fft.h"
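
/*
 * In-place bit-reversal reordering of 32-bit complex data.
 * Consecutive pairs of entries in pBitRevTab give the positions of two
 * complex (2 x q31, i.e. 64-bit) elements to exchange; the loop performs
 * two such swaps per iteration using 64-bit gather loads / scatter stores.
 */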
static void arm_bitreversal_32_inpl_mve(
    uint32_t *pSrc,
    const uint16_t bitRevLen,
    const uint16_t *pBitRevTab)
{
    uint64_t *src = (uint64_t *) pSrc;
    uint32_t blkCnt;     /* loop counters */
    uint32x4_t bitRevTabOff;
    uint32x4_t one = vdupq_n_u32(1);

    blkCnt = (bitRevLen / 2) / 2;
    while (blkCnt > 0U) {
        bitRevTabOff = vldrhq_u32(pBitRevTab);
        pBitRevTab += 4;

        uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
        uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);

        uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
        uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);

        vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
        vstrdq_scatter_offset_u64(src, bitRevOff2, in1);
        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
    }
}
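
/*
 * In-place radix-4 decimation-in-frequency butterfly kernel (forward).
 * fftLen is expected to be a power of 4 here; twiddles for each stage are
 * read from the rearranged per-stage tables held in the instance structure.
 */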
static void _arm_radix4_butterfly_q31_mve(
    const arm_cfft_instance_q31 * S,
    q31_t *pSrc,
    uint32_t fftLen)
{
    q31x4_t vecTmp0, vecTmp1;
    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q31x4_t vecA, vecB, vecC, vecD;
    q31x4_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *), (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *), (9 - 16) * sizeof(q31_t *)
    };

    /*
     * Process the first and middle stages.
     * Each of these stages scales the data down by a further two bits.
     */
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        for (int i = 0; i < iter; i++)
        {
            q31_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                    S->rearranged_twiddle_tab_stride2_arr[stage]];
            q31_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                    S->rearranged_twiddle_tab_stride3_arr[stage]];
            q31_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                    S->rearranged_twiddle_tab_stride1_arr[stage]];
            q31_t const *pW1, *pW2, *pW3;
            q31_t *inA = pSrc + CMPLX_DIM * i * n1;
            q31_t *inB = inA + n2 * CMPLX_DIM;
            q31_t *inC = inB + n2 * CMPLX_DIM;
            q31_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 x q31 complex pair
             */
            vecA = vldrwq_s32(inA);
            vecC = vldrwq_s32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_s32(inB);
                vecD = vldrwq_s32(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                vecA = vldrwq_s32(inA);
                vecC = vldrwq_s32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }
    /*
     * End of the first and middle stages.
     * Data is in 11.21 (q21) format for the 1024-point FFT, as there are 3 middle stages.
     * Data is in 9.23 (q23) format for the 256-point FFT, as there are 2 middle stages.
     * Data is in 7.25 (q25) format for the 64-point FFT, as there is 1 middle stage.
     * Data is in 5.27 (q27) format for the 16-point FFT, as there are no middle stages.
     */
    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }
    /*
     * Output is in 11.21 (q21) format for the 1024-point FFT.
     * Output is in 9.23 (q23) format for the 256-point FFT.
     * Output is in 7.25 (q25) format for the 64-point FFT.
     * Output is in 5.27 (q27) format for the 16-point FFT.
     */
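    /*
     * (The per-transform formats above follow from the butterflies: each
     * radix-4 stage uses two cascaded halving add/subtract levels, i.e. a
     * 2-bit downscale per stage, and a 1024-point transform has 5 stages:
     * 1.31 -> 11.21.)
     */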
}
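
/*
 * Forward CFFT for lengths of the form 2 * 4^k: a first radix-2 stage
 * (inputs pre-scaled by 1 bit and combined with halving adds, the lower
 * half multiplied by the conjugated twiddles) splits the data into two
 * half-length sequences, each processed by the radix-4 kernel above;
 * the result is finally rescaled up by 1 bit.
 */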
static void arm_cfft_radix4by2_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q31_t *pIn0;
    q31_t *pIn1;
    const q31_t *pCoef = S->pTwiddle;
    uint32_t blkCnt;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;

    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;

    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;

        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        vecTw = vld1q_s32(pCoef);
        pCoef += 4;

        vecDiff = vhsubq(vecIn0, vecIn1);
        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    _arm_radix4_butterfly_q31_mve(S, pSrc, n2);

    _arm_radix4_butterfly_q31_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged thru tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }
}
static void _arm_radix4_butterfly_inverse_q31_mve(
    const arm_cfft_instance_q31 *S,
    q31_t *pSrc,
    uint32_t fftLen)
{
    q31x4_t vecTmp0, vecTmp1;
    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q31x4_t vecA, vecB, vecC, vecD;
    q31x4_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *), (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *), (9 - 16) * sizeof(q31_t *)
    };

    /*
     * Process the first and middle stages.
     * Each of these stages scales the data down by a further two bits.
     */
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        for (int i = 0; i < iter; i++)
        {
            q31_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                    S->rearranged_twiddle_tab_stride2_arr[stage]];
            q31_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                    S->rearranged_twiddle_tab_stride3_arr[stage]];
            q31_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                    S->rearranged_twiddle_tab_stride1_arr[stage]];
            q31_t const *pW1, *pW2, *pW3;
            q31_t *inA = pSrc + CMPLX_DIM * i * n1;
            q31_t *inB = inA + n2 * CMPLX_DIM;
            q31_t *inC = inB + n2 * CMPLX_DIM;
            q31_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 x q31 complex pair
             */
            vecA = vldrwq_s32(inA);
            vecC = vldrwq_s32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_s32(inB);
                vecD = vldrwq_s32(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inB, vecTmp1);
                inB += 4;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inC, vecTmp1);
                inC += 4;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
                vst1q(inD, vecTmp1);
                inD += 4;

                vecA = vldrwq_s32(inA);
                vecC = vldrwq_s32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }
    /*
     * End of the first and middle stages.
     * Data is in 11.21 (q21) format for the 1024-point FFT, as there are 3 middle stages.
     * Data is in 9.23 (q23) format for the 256-point FFT, as there are 2 middle stages.
     * Data is in 7.25 (q25) format for the 64-point FFT, as there is 1 middle stage.
     * Data is in 5.27 (q27) format for the 16-point FFT, as there are no middle stages.
     */
    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }
    /*
     * Output is in 11.21 (q21) format for the 1024-point FFT.
     * Output is in 9.23 (q23) format for the 256-point FFT.
     * Output is in 7.25 (q25) format for the 64-point FFT.
     * Output is in 5.27 (q27) format for the 16-point FFT.
     */
}
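
/*
 * Inverse counterpart of arm_cfft_radix4by2_q31_mve: the radix-2 split is
 * identical, but the twiddle is applied with a plain (non-conjugated)
 * complex multiply and the inverse radix-4 kernel handles the two
 * half-length transforms.
 */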
static void arm_cfft_radix4by2_inverse_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q31_t *pIn0;
    q31_t *pIn1;
    const q31_t *pCoef = S->pTwiddle;
    //uint16_t twidCoefModifier = arm_cfft_radix2_twiddle_factor(S->fftLen);
    //q31_t twidIncr = (2 * twidCoefModifier * sizeof(q31_t));
    uint32_t blkCnt;
    //uint64x2_t vecOffs;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;

    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;

    //vecOffs[0] = 0;
    //vecOffs[1] = (uint64_t) twidIncr;

    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;

        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        //vecTw = (q31x4_t) vldrdq_gather_offset_s64(pCoef, vecOffs);
        vecTw = vld1q_s32(pCoef);
        pCoef += 4;

        vecDiff = vhsubq(vecIn0, vecIn1);
        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxB(vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        //vecOffs = vaddq((q31x4_t) vecOffs, 2 * twidIncr);

        blkCnt--;
    }

    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, n2);

    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged thru tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }
}

/**
  @ingroup groupTransforms
 */

/**
  @addtogroup ComplexFFT
  @{
 */

/**
  @brief         Processing function for the Q31 complex FFT.
  @param[in]     S               points to an instance of the fixed-point CFFT structure
  @param[in,out] pSrc            points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag        flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag  flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
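
  @par           Example
                   Illustrative sketch of a typical call sequence; it assumes the
                   pre-initialized 256-point instance <code>arm_cfft_sR_q31_len256</code>
                   declared in arm_const_structs.h and an interleaved
                   real/imaginary Q31 buffer.
  @code
      #include "arm_math.h"
      #include "arm_const_structs.h"

      q31_t buffer[2 * 256];        // 256 complex samples, {re, im} interleaved, Q31

      // ... fill buffer ...

      // forward transform, computed in-place; bitReverseFlag = 1 returns the
      // output in natural (not bit-reversed) order
      arm_cfft_q31(&arm_cfft_sR_q31_len256, buffer, 0, 1);

      // inverse transform of the same buffer
      arm_cfft_q31(&arm_cfft_sR_q31_len256, buffer, 1, 1);
  @endcode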
 */

void arm_cfft_q31(
    const arm_cfft_instance_q31 * S,
    q31_t * pSrc,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    if (ifftFlag == 1U) {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
            break;
        }
    } else {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
            break;
        }
    }

    if (bitReverseFlag)
    {
        arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
    }
}

#else

extern void arm_radix4_butterfly_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef,
    uint32_t twidCoefModifier);

extern void arm_radix4_butterfly_inverse_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef,
    uint32_t twidCoefModifier);

extern void arm_bitreversal_32(
    uint32_t * pSrc,
    const uint16_t bitRevLen,
    const uint16_t * pBitRevTable);

void arm_cfft_radix4by2_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef);

void arm_cfft_radix4by2_inverse_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef);

/**
  @ingroup groupTransforms
 */

/**
  @addtogroup ComplexFFT
  @{
 */

/**
  @brief         Processing function for the Q31 complex FFT.
  @param[in]     S               points to an instance of the fixed-point CFFT structure
  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag        flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag  flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
 */
void arm_cfft_q31(
    const arm_cfft_instance_q31 * S,
    q31_t * p1,
    uint8_t ifftFlag,
    uint8_t bitReverseFlag)
{
    uint32_t L = S->fftLen;

    if (ifftFlag == 1U)
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
            break;
        }
    }
    else
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
            break;
        }
    }

    if ( bitReverseFlag )
        arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
}

/**
  @} end of ComplexFFT group
 */
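
/*
 * Scalar radix-4-by-2 decomposition for the forward CFFT (lengths 2 * 4^k):
 * a first radix-2 stage with the inputs scaled down by 2 bits and the second
 * half of the buffer multiplied by the twiddle factors, followed by two
 * half-length radix-4 transforms (twiddle stride 2) and a final 1-bit
 * upscale of the result.
 */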
void arm_cfft_radix4by2_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];
        l = i + n2;

        xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multAcc_32x32_keep32_R(p0, yt, sinVal);
        multSub_32x32_keep32_R(p1, xt, sinVal);

        pSrc[2 * l] = p0 << 1;
        pSrc[2 * l + 1] = p1 << 1;
    }

    /* first col */
    arm_radix4_butterfly_q31 (pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col */
    arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }
}

void arm_cfft_radix4by2_inverse_q31(
    q31_t * pSrc,
    uint32_t fftLen,
    const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];
        l = i + n2;

        xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multSub_32x32_keep32_R(p0, yt, sinVal);
        multAcc_32x32_keep32_R(p1, xt, sinVal);

        pSrc[2 * l] = p0 << 1U;
        pSrc[2 * l + 1] = p1 << 1U;
    }

    /* first col */
    arm_radix4_butterfly_inverse_q31( pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col */
    arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }
}

#endif /* defined(ARM_MATH_MVEI) */