| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521 |
- /* ----------------------------------------------------------------------
- * Project: CMSIS DSP Library
- * Title: arm_rfft_fast_f16.c
- * Description: RFFT & RIFFT Floating point process function
- *
- * $Date: 23 April 2021
- * $Revision: V1.9.0
- *
- * Target Processor: Cortex-M and Cortex-A cores
- * -------------------------------------------------------------------- */
- /*
- * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #include "dsp/transform_functions_f16.h"
- #include "arm_common_tables_f16.h"
- #if defined(ARM_FLOAT16_SUPPORTED)
- #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
- void stage_rfft_f16(
- const arm_rfft_fast_instance_f16 * S,
- float16_t * p,
- float16_t * pOut)
- {
- int32_t k; /* Loop Counter */
- float16_t twR, twI; /* RFFT Twiddle coefficients */
- const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
- float16_t *pA = p; /* increasing pointer */
- float16_t *pB = p; /* decreasing pointer */
- float16_t xAR, xAI, xBR, xBI; /* temporary variables */
- float16_t t1a, t1b; /* temporary variables */
- float16_t p0, p1, p2, p3; /* temporary variables */
- float16x8x2_t tw,xA,xB;
- float16x8x2_t tmp1, tmp2, res;
- uint16x8_t vecStridesBkwd;
- vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
- int blockCnt;
- k = (S->Sint).fftLen - 1;
- /* Pack first and last sample of the frequency domain together */
- xBR = pB[0];
- xBI = pB[1];
- xAR = pA[0];
- xAI = pA[1];
- twR = *pCoeff++ ;
- twI = *pCoeff++ ;
- // U1 = XA(1) + XB(1); % It is real
- t1a = (_Float16)xBR + (_Float16)xAR ;
- // U2 = XB(1) - XA(1); % It is imaginary
- t1b = (_Float16)xBI + (_Float16)xAI ;
- // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
- // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
- *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b );
- *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b );
- // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
- pB = p + 2*k - 14;
- pA += 2;
- blockCnt = k >> 3;
- while (blockCnt > 0)
- {
- /*
- function X = my_split_rfft(X, ifftFlag)
- % X is a series of real numbers
- L = length(X);
- XC = X(1:2:end) +i*X(2:2:end);
- XA = fft(XC);
- XB = conj(XA([1 end:-1:2]));
- TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
- for l = 2:L/2
- XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
- end
- XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
- X = XA;
- */
- xA = vld2q_f16(pA);
- pA += 16;
- xB = vld2q_f16(pB);
- xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
- xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
- xB.val[1] = vnegq_f16(xB.val[1]);
- pB -= 16;
- tw = vld2q_f16(pCoeff);
- pCoeff += 16;
- tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
- tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
- tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
- tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
- res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
- res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
- res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
- res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
- res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
- res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
- res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
- res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
- vst2q_f16(pOut, res);
- pOut += 16;
-
- blockCnt--;
- }
- pB += 14;
- blockCnt = k & 7;
- while (blockCnt > 0)
- {
- /*
- function X = my_split_rfft(X, ifftFlag)
- % X is a series of real numbers
- L = length(X);
- XC = X(1:2:end) +i*X(2:2:end);
- XA = fft(XC);
- XB = conj(XA([1 end:-1:2]));
- TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
- for l = 2:L/2
- XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
- end
- XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
- X = XA;
- */
- xBI = pB[1];
- xBR = pB[0];
- xAR = pA[0];
- xAI = pA[1];
- twR = *pCoeff++;
- twI = *pCoeff++;
- t1a = (_Float16)xBR - (_Float16)xAR ;
- t1b = (_Float16)xBI + (_Float16)xAI ;
- // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
- // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
- p0 = (_Float16)twR * (_Float16)t1a;
- p1 = (_Float16)twI * (_Float16)t1a;
- p2 = (_Float16)twR * (_Float16)t1b;
- p3 = (_Float16)twI * (_Float16)t1b;
- *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR
- *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI
- pA += 2;
- pB -= 2;
- blockCnt--;
- }
- }
- /* Prepares data for inverse cfft */
- void merge_rfft_f16(
- const arm_rfft_fast_instance_f16 * S,
- float16_t * p,
- float16_t * pOut)
- {
- int32_t k; /* Loop Counter */
- float16_t twR, twI; /* RFFT Twiddle coefficients */
- const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
- float16_t *pA = p; /* increasing pointer */
- float16_t *pB = p; /* decreasing pointer */
- float16_t xAR, xAI, xBR, xBI; /* temporary variables */
- float16_t t1a, t1b, r, s, t, u; /* temporary variables */
- float16x8x2_t tw,xA,xB;
- float16x8x2_t tmp1, tmp2, res;
- uint16x8_t vecStridesBkwd;
- vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
- int blockCnt;
-
- k = (S->Sint).fftLen - 1;
- xAR = pA[0];
- xAI = pA[1];
- pCoeff += 2 ;
- *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI );
- *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI );
- pB = p + 2*k - 14;
- pA += 2 ;
- blockCnt = k >> 3;
- while (blockCnt > 0)
- {
- /* G is half of the frequency complex spectrum */
- //for k = 2:N
- // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
- xA = vld2q_f16(pA);
- pA += 16;
- xB = vld2q_f16(pB);
- xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
- xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
- xB.val[1] = vnegq_f16(xB.val[1]);
- pB -= 16;
- tw = vld2q_f16(pCoeff);
- tw.val[1] = vnegq_f16(tw.val[1]);
- pCoeff += 16;
- tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
- tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
- tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
- tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
- res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
- res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
- res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
- res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
- res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
- res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
- res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
- res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
- vst2q_f16(pOut, res);
- pOut += 16;
-
- blockCnt--;
- }
- pB += 14;
- blockCnt = k & 7;
- while (blockCnt > 0)
- {
- /* G is half of the frequency complex spectrum */
- //for k = 2:N
- // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
- xBI = pB[1] ;
- xBR = pB[0] ;
- xAR = pA[0];
- xAI = pA[1];
- twR = *pCoeff++;
- twI = *pCoeff++;
- t1a = (_Float16)xAR - (_Float16)xBR ;
- t1b = (_Float16)xAI + (_Float16)xBI ;
- r = (_Float16)twR * (_Float16)t1a;
- s = (_Float16)twI * (_Float16)t1b;
- t = (_Float16)twI * (_Float16)t1a;
- u = (_Float16)twR * (_Float16)t1b;
- // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
- // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
- *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR
- *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI
- pA += 2;
- pB -= 2;
- blockCnt--;
- }
- }
- #else
- void stage_rfft_f16(
- const arm_rfft_fast_instance_f16 * S,
- float16_t * p,
- float16_t * pOut)
- {
- int32_t k; /* Loop Counter */
- float16_t twR, twI; /* RFFT Twiddle coefficients */
- const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
- float16_t *pA = p; /* increasing pointer */
- float16_t *pB = p; /* decreasing pointer */
- float16_t xAR, xAI, xBR, xBI; /* temporary variables */
- float16_t t1a, t1b; /* temporary variables */
- float16_t p0, p1, p2, p3; /* temporary variables */
- k = (S->Sint).fftLen - 1;
- /* Pack first and last sample of the frequency domain together */
- xBR = pB[0];
- xBI = pB[1];
- xAR = pA[0];
- xAI = pA[1];
- twR = *pCoeff++ ;
- twI = *pCoeff++ ;
- // U1 = XA(1) + XB(1); % It is real
- t1a = (_Float16)xBR + (_Float16)xAR ;
- // U2 = XB(1) - XA(1); % It is imaginary
- t1b = (_Float16)xBI + (_Float16)xAI ;
- // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
- // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
- *pOut++ = 0.5f16 * ( (_Float16)t1a + (_Float16)t1b );
- *pOut++ = 0.5f16 * ( (_Float16)t1a - (_Float16)t1b );
- // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
- pB = p + 2*k;
- pA += 2;
- do
- {
- /*
- function X = my_split_rfft(X, ifftFlag)
- % X is a series of real numbers
- L = length(X);
- XC = X(1:2:end) +i*X(2:2:end);
- XA = fft(XC);
- XB = conj(XA([1 end:-1:2]));
- TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
- for l = 2:L/2
- XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
- end
- XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
- X = XA;
- */
- xBI = pB[1];
- xBR = pB[0];
- xAR = pA[0];
- xAI = pA[1];
- twR = *pCoeff++;
- twI = *pCoeff++;
- t1a = (_Float16)xBR - (_Float16)xAR ;
- t1b = (_Float16)xBI + (_Float16)xAI ;
- // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
- // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
- p0 = (_Float16)twR * (_Float16)t1a;
- p1 = (_Float16)twI * (_Float16)t1a;
- p2 = (_Float16)twR * (_Float16)t1b;
- p3 = (_Float16)twI * (_Float16)t1b;
- *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR + (_Float16)p0 + (_Float16)p3 ); //xAR
- *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)p1 - (_Float16)p2 ); //xAI
- pA += 2;
- pB -= 2;
- k--;
- } while (k > 0);
- }
- /* Prepares data for inverse cfft */
- void merge_rfft_f16(
- const arm_rfft_fast_instance_f16 * S,
- float16_t * p,
- float16_t * pOut)
- {
- int32_t k; /* Loop Counter */
- float16_t twR, twI; /* RFFT Twiddle coefficients */
- const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
- float16_t *pA = p; /* increasing pointer */
- float16_t *pB = p; /* decreasing pointer */
- float16_t xAR, xAI, xBR, xBI; /* temporary variables */
- float16_t t1a, t1b, r, s, t, u; /* temporary variables */
- k = (S->Sint).fftLen - 1;
- xAR = pA[0];
- xAI = pA[1];
- pCoeff += 2 ;
- *pOut++ = 0.5f16 * ( (_Float16)xAR + (_Float16)xAI );
- *pOut++ = 0.5f16 * ( (_Float16)xAR - (_Float16)xAI );
- pB = p + 2*k ;
- pA += 2 ;
- while (k > 0)
- {
- /* G is half of the frequency complex spectrum */
- //for k = 2:N
- // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
- xBI = pB[1] ;
- xBR = pB[0] ;
- xAR = pA[0];
- xAI = pA[1];
- twR = *pCoeff++;
- twI = *pCoeff++;
- t1a = (_Float16)xAR - (_Float16)xBR ;
- t1b = (_Float16)xAI + (_Float16)xBI ;
- r = (_Float16)twR * (_Float16)t1a;
- s = (_Float16)twI * (_Float16)t1b;
- t = (_Float16)twI * (_Float16)t1a;
- u = (_Float16)twR * (_Float16)t1b;
- // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
- // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
- *pOut++ = 0.5f16 * ((_Float16)xAR + (_Float16)xBR - (_Float16)r - (_Float16)s ); //xAR
- *pOut++ = 0.5f16 * ((_Float16)xAI - (_Float16)xBI + (_Float16)t - (_Float16)u ); //xAI
- pA += 2;
- pB -= 2;
- k--;
- }
- }
- #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
- /**
- @ingroup RealFFT
- */
- /**
- @defgroup RealFFTF16 Real FFT F16 Functions
- */
- /**
- @addtogroup RealFFTF16
- @{
- */
- /**
- @brief Processing function for the floating-point real FFT.
- @param[in] S points to an arm_rfft_fast_instance_f16 structure
- @param[in] p points to input buffer (Source buffer is modified by this function.)
- @param[in] pOut points to output buffer
- @param[in] ifftFlag
- - value = 0: RFFT
- - value = 1: RIFFT
- */
- void arm_rfft_fast_f16(
- const arm_rfft_fast_instance_f16 * S,
- float16_t * p,
- float16_t * pOut,
- uint8_t ifftFlag)
- {
- const arm_cfft_instance_f16 * Sint = &(S->Sint);
- /* Calculation of Real FFT */
- if (ifftFlag)
- {
- /* Real FFT compression */
- merge_rfft_f16(S, p, pOut);
- /* Complex radix-4 IFFT process */
- arm_cfft_f16( Sint, pOut, ifftFlag, 1);
- }
- else
- {
- /* Calculation of RFFT of input */
- arm_cfft_f16( Sint, p, ifftFlag, 1);
- /* Real FFT extraction */
- stage_rfft_f16(S, p, pOut);
- }
- }
- /**
- * @} end of RealFFTF16 group
- */
- #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
|