filter_test.cpp 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657
  1. extern "C" {
  2. extern void filter_test();
  3. }
  4. #include "allocator.h"
  5. #include <dsppp/arch.hpp>
  6. #include <dsppp/fixed_point.hpp>
  7. #include <dsppp/matrix.hpp>
  8. #include <dsppp/unroll.hpp>
  9. #include <iostream>
  10. #include <cmsis_tests.h>
  11. #if defined(ARM_MATH_MVEI) || defined(ARM_MATH_MVEF)
  12. #define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
  13. #define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs) \
  14. for (int j = 0; j < nbAcc; j++) { \
  15. const q15_t *pSmp = &pSample[j]; \
  16. q63_t acc[4]; \
  17. \
  18. acc[j] = 0; \
  19. for (int i = 0; i < nbVecTaps; i++) { \
  20. vecIn0 = vld1q(pSmp + 8 * i); \
  21. acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]); \
  22. } \
  23. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15); \
  24. }
  25. #define FIR_Q15_MAIN_CORE() \
  26. { \
  27. q15_t *pState = S->pState; /* State pointer */ \
  28. const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \
  29. q15_t *pStateCur; /* Points to the current sample of the state */ \
  30. const q15_t *pSamples; /* Temporary pointer to the sample buffer */ \
  31. q15_t *pOutput; /* Temporary pointer to the output buffer */ \
  32. const q15_t *pTempSrc; /* Temporary pointer to the source data */ \
  33. q15_t *pTempDest; /* Temporary pointer to the destination buffer */\
  34. uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\
  35. int32_t blkCnt; \
  36. q15x8_t vecIn0; \
  37. \
  38. /* \
  39. * load coefs \
  40. */ \
  41. q15x8_t vecCoeffs[NBVECTAPS]; \
  42. \
  43. for (int i = 0; i < NBVECTAPS; i++) \
  44. vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i); \
  45. \
  46. /* \
  47. * pState points to state array which contains previous frame (numTaps - 1) samples \
  48. * pStateCur points to the location where the new input data should be written \
  49. */ \
  50. pStateCur = &(pState[(numTaps - 1u)]); \
  51. pTempSrc = pSrc; \
  52. pSamples = pState; \
  53. pOutput = pDst; \
  54. \
  55. blkCnt = blockSize >> 2; \
  56. while (blkCnt > 0) { \
  57. /* \
  58. * Save 4 input samples in the history buffer \
  59. */ \
  60. vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc)); \
  61. pStateCur += 4; \
  62. pTempSrc += 4; \
  63. \
  64. FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs); \
  65. pSamples += 4; \
  66. \
  67. blkCnt--; \
  68. } \
  69. \
  70. /* tail */ \
  71. int32_t residual = blockSize & 3; \
  72. \
  73. for (int i = 0; i < residual; i++) \
  74. *pStateCur++ = *pTempSrc++; \
  75. \
  76. FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs); \
  77. \
  78. /* \
  79. * Copy the samples back into the history buffer start \
  80. */ \
  81. pTempSrc = &pState[blockSize]; \
  82. pTempDest = pState; \
  83. \
  84. /* current compiler limitation */ \
  85. blkCnt = (numTaps - 1) >> 3; \
  86. while (blkCnt > 0) \
  87. { \
  88. vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc)); \
  89. pTempSrc += 8; \
  90. pTempDest += 8; \
  91. blkCnt--; \
  92. } \
  93. blkCnt = (numTaps - 1) & 7; \
  94. if (blkCnt > 0) \
  95. { \
  96. mve_pred16_t p = vctp16q(blkCnt); \
  97. vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p); \
  98. } \
  99. }
  100. static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S,
  101. const q15_t * __restrict pSrc,
  102. q15_t * __restrict pDst, uint32_t blockSize)
  103. {
  104. #define NBTAPS 32
  105. #define NBVECTAPS (NBTAPS / 8)
  106. FIR_Q15_MAIN_CORE();
  107. #undef NBVECTAPS
  108. #undef NBTAPS
  109. }
  110. static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S,
  111. const q15_t * __restrict pSrc,
  112. q15_t * __restrict pDst, uint32_t blockSize)
  113. {
  114. #define NBTAPS 24
  115. #define NBVECTAPS (NBTAPS / 8)
  116. FIR_Q15_MAIN_CORE();
  117. #undef NBVECTAPS
  118. #undef NBTAPS
  119. }
  120. static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S,
  121. const q15_t * __restrict pSrc,
  122. q15_t * __restrict pDst, uint32_t blockSize)
  123. {
  124. #define NBTAPS 16
  125. #define NBVECTAPS (NBTAPS / 8)
  126. FIR_Q15_MAIN_CORE();
  127. #undef NBVECTAPS
  128. #undef NBTAPS
  129. }
  130. static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S,
  131. const q15_t * __restrict pSrc,
  132. q15_t * __restrict pDst, uint32_t blockSize)
  133. {
  134. #define NBTAPS 8
  135. #define NBVECTAPS (NBTAPS / 8)
  136. FIR_Q15_MAIN_CORE();
  137. #undef NBVECTAPS
  138. #undef NBTAPS
  139. }
  140. void debug_arm_fir_q15(
  141. const arm_fir_instance_q15 * S,
  142. const q15_t * pSrc,
  143. q15_t * pDst,
  144. uint32_t blockSize)
  145. {
  146. q15_t *pState = S->pState; /* State pointer */
  147. const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
  148. q15_t *pStateCur; /* Points to the current sample of the state */
  149. const q15_t *pSamples; /* Temporary pointer to the sample buffer */
  150. q15_t *pOutput; /* Temporary pointer to the output buffer */
  151. const q15_t *pTempSrc; /* Temporary pointer to the source data */
  152. q15_t *pTempDest; /* Temporary pointer to the destination buffer */
  153. uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
  154. uint32_t blkCnt;
  155. q15x8_t vecIn0;
  156. uint32_t tapsBlkCnt = (numTaps + 7) / 8;
  157. q63_t acc0, acc1, acc2, acc3;
  158. int32_t nbTaps = (numTaps + 7) >> 3;
  159. switch(nbTaps) {
  160. case 1:
  161. arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
  162. return;
  163. case 2:
  164. arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
  165. return;
  166. case 3:
  167. arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
  168. return;
  169. case 4:
  170. arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
  171. return;
  172. }
  173. /*
  174. * pState points to state array which contains previous frame (numTaps - 1) samples
  175. * pStateCur points to the location where the new input data should be written
  176. */
  177. pStateCur = &(pState[(numTaps - 1u)]);
  178. pTempSrc = pSrc;
  179. pSamples = pState;
  180. pOutput = pDst;
  181. blkCnt = blockSize >> 2;
  182. while (blkCnt > 0U)
  183. {
  184. const q15_t *pCoeffsTmp = pCoeffs;
  185. const q15_t *pSamplesTmp = pSamples;
  186. acc0 = 0LL;
  187. acc1 = 0LL;
  188. acc2 = 0LL;
  189. acc3 = 0LL;
  190. /*
  191. * Save 8 input samples in the history buffer
  192. */
  193. vst1q(pStateCur, vld1q(pTempSrc));
  194. pStateCur += 8;
  195. pTempSrc += 8;
  196. //INIT_SYSTICK;
  197. //START_CYCLE_MEASUREMENT;
  198. int i = tapsBlkCnt;
  199. //startSectionNB(3);
  200. while (i > 0)
  201. {
  202. /*
  203. * load 8 coefs
  204. */
  205. q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
  206. vecIn0 = vld1q(pSamplesTmp);
  207. acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
  208. vecIn0 = vld1q(&pSamplesTmp[1]);
  209. acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
  210. vecIn0 = vld1q(&pSamplesTmp[2]);
  211. acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
  212. vecIn0 = vld1q(&pSamplesTmp[3]);
  213. acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
  214. pSamplesTmp += 8;
  215. pCoeffsTmp += 8;
  216. /*
  217. * Decrement the taps block loop counter
  218. */
  219. i--;
  220. }
  221. //stopSectionNB(3);
  222. //STOP_CYCLE_MEASUREMENT;
  223. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
  224. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
  225. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
  226. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
  227. pSamples += 4;
  228. /*
  229. * Decrement the sample block loop counter
  230. */
  231. blkCnt--;
  232. }
  233. uint32_t residual = blockSize & 3;
  234. switch (residual)
  235. {
  236. case 3:
  237. {
  238. const q15_t *pCoeffsTmp = pCoeffs;
  239. const q15_t *pSamplesTmp = pSamples;
  240. acc0 = 0LL;
  241. acc1 = 0LL;
  242. acc2 = 0LL;
  243. /*
  244. * Save 8 input samples in the history buffer
  245. */
  246. *(q15x8_t *) pStateCur = *(q15x8_t *) pTempSrc;
  247. pStateCur += 8;
  248. pTempSrc += 8;
  249. int i = tapsBlkCnt;
  250. while (i > 0)
  251. {
  252. /*
  253. * load 8 coefs
  254. */
  255. q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
  256. vecIn0 = vld1q(pSamplesTmp);
  257. acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
  258. vecIn0 = vld1q(&pSamplesTmp[2]);
  259. acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
  260. vecIn0 = vld1q(&pSamplesTmp[4]);
  261. acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
  262. pSamplesTmp += 8;
  263. pCoeffsTmp += 8;
  264. /*
  265. * Decrement the taps block loop counter
  266. */
  267. i--;
  268. }
  269. acc0 = asrl(acc0, 15);
  270. acc1 = asrl(acc1, 15);
  271. acc2 = asrl(acc2, 15);
  272. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
  273. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
  274. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
  275. }
  276. break;
  277. case 2:
  278. {
  279. const q15_t *pCoeffsTmp = pCoeffs;
  280. const q15_t *pSamplesTmp = pSamples;
  281. acc0 = 0LL;
  282. acc1 = 0LL;
  283. /*
  284. * Save 8 input samples in the history buffer
  285. */
  286. vst1q(pStateCur, vld1q(pTempSrc));
  287. pStateCur += 8;
  288. pTempSrc += 8;
  289. int i = tapsBlkCnt;
  290. while (i > 0)
  291. {
  292. /*
  293. * load 8 coefs
  294. */
  295. q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
  296. vecIn0 = vld1q(pSamplesTmp);
  297. acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
  298. vecIn0 = vld1q(&pSamplesTmp[2]);
  299. acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
  300. pSamplesTmp += 8;
  301. pCoeffsTmp += 8;
  302. /*
  303. * Decrement the taps block loop counter
  304. */
  305. i--;
  306. }
  307. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
  308. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
  309. }
  310. break;
  311. case 1:
  312. {
  313. const q15_t *pCoeffsTmp = pCoeffs;
  314. const q15_t *pSamplesTmp = pSamples;
  315. acc0 = 0LL;
  316. /*
  317. * Save 8 input samples in the history buffer
  318. */
  319. vst1q(pStateCur, vld1q(pTempSrc));
  320. pStateCur += 8;
  321. pTempSrc += 8;
  322. int i = tapsBlkCnt;
  323. while (i > 0)
  324. {
  325. /*
  326. * load 8 coefs
  327. */
  328. q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
  329. vecIn0 = vld1q(pSamplesTmp);
  330. acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
  331. pSamplesTmp += 8;
  332. pCoeffsTmp += 8;
  333. /*
  334. * Decrement the taps block loop counter
  335. */
  336. i--;
  337. }
  338. *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
  339. }
  340. break;
  341. }
  342. /*
  343. * Copy the samples back into the history buffer start
  344. */
  345. pTempSrc = &pState[blockSize];
  346. pTempDest = pState;
  347. blkCnt = numTaps >> 3;
  348. while (blkCnt > 0U)
  349. {
  350. vst1q(pTempDest, vld1q(pTempSrc));
  351. pTempSrc += 8;
  352. pTempDest += 8;
  353. blkCnt--;
  354. }
  355. blkCnt = numTaps & 7;
  356. if (blkCnt > 0U)
  357. {
  358. mve_pred16_t p0 = vctp16q(blkCnt);
  359. vstrhq_p_s16(pTempDest, vld1q(pTempSrc), p0);
  360. }
  361. }
  362. #endif
  363. template<typename T>
  364. struct FirType;
  365. template<>
  366. struct FirType<float32_t>
  367. {
  368. typedef arm_fir_instance_f32 type;
  369. static void init_state(type * S,
  370. uint16_t numTaps,
  371. const float32_t * pCoeffs,
  372. float32_t * pState,
  373. uint32_t blockSize)
  374. {
  375. arm_fir_init_f32(S,numTaps,pCoeffs,pState,blockSize);
  376. };
  377. static void init_coef(float32_t *coefs,uint16_t numTaps)
  378. {
  379. for(int i=0;i<numTaps;i++)
  380. {
  381. coefs[i] = 1.0f / numTaps;
  382. }
  383. };
  384. };
  385. template<>
  386. struct FirType<Q15>
  387. {
  388. typedef arm_fir_instance_q15 type;
  389. static void init_state(type * S,
  390. uint16_t numTaps,
  391. const Q15 * pCoeffs,
  392. Q15 * pState,
  393. uint32_t blockSize)
  394. {
  395. arm_fir_init_q15(S,numTaps,
  396. reinterpret_cast<const q15_t*>(pCoeffs),
  397. reinterpret_cast<q15_t*>(pState),blockSize);
  398. };
  399. static void init_coef(Q15 *coefs,uint16_t numTaps)
  400. {
  401. for(int i=0;i<numTaps;i++)
  402. {
  403. coefs[i] = Q15::f(1.0f / numTaps);
  404. }
  405. };
  406. };
  407. template<typename T,int BLOCK,int TAPS>
  408. struct FIR {
  409. FIR(const PVector<T,TAPS> &coefs):coef_(coefs),state_(T{})
  410. {};
  411. PVector<T,BLOCK> filter(const PVector<T,BLOCK> &signal)
  412. {
  413. constexpr int UNROLL_FACTOR = 4;
  414. PVector<T,BLOCK> res(T{});
  415. using acc_type = typename number_traits<T>::accumulator;
  416. std::array<acc_type,UNROLL_FACTOR> accu;
  417. index_t i=0;
  418. #if defined(ARM_COMPUTE_DISABLE_UNROLL)
  419. #pragma clang loop unroll(disable)
  420. #endif
  421. for(;i<=BLOCK-UNROLL_FACTOR;i+=UNROLL_FACTOR)
  422. {
  423. state_.sub(TAPS-1+i,TAPS-1+i+UNROLL_FACTOR) = copy(signal.sub(i,i+UNROLL_FACTOR));
  424. //INIT_SYSTICK;
  425. //START_CYCLE_MEASUREMENT;
  426. //startSectionNB(2);
  427. results(accu) =
  428. dot(unroll<UNROLL_FACTOR>(
  429. [i,this](index_t k){return state_.sub(i+k,i+k+TAPS);}),
  430. replicate<UNROLL_FACTOR>(coef_)
  431. );
  432. //stopSectionNB(2);
  433. //STOP_CYCLE_MEASUREMENT;
  434. for(index_t k=0;k<UNROLL_FACTOR;k++)
  435. {
  436. res[i+k] = inner::from_accumulator(accu[k]);
  437. }
  438. }
  439. #if defined(ARM_COMPUTE_DISABLE_UNROLL)
  440. #pragma clang loop unroll(disable)
  441. #endif
  442. for(;i<BLOCK;i++)
  443. {
  444. state_[TAPS-1+i] = signal[i];
  445. acc_type acc = dot(state_.sub(i,i+TAPS),coef_);
  446. res[i] = inner::from_accumulator(acc);
  447. }
  448. state_.sub(0,TAPS) = copy(state_.sub(BLOCK,TAPS+BLOCK));
  449. return(res);
  450. }
  451. void purec(const T *signal, T *dst)
  452. {
  453. const T* coef=coef_.const_ptr();
  454. T *state = state_.ptr();
  455. #if defined(ARM_COMPUTE_DISABLE_UNROLL)
  456. #pragma clang loop unroll(disable)
  457. #endif
  458. for(index_t i=0;i<BLOCK;i++)
  459. {
  460. T acc=T{};
  461. state[TAPS-1+i] = signal[i];
  462. for(index_t k=0;k<TAPS;k++)
  463. {
  464. acc += state[i+k]*coef[k];
  465. }
  466. dst[i] = acc;
  467. }
  468. for(index_t i=0;i<TAPS-1;i++)
  469. {
  470. state[i] = state[BLOCK+i];
  471. }
  472. }
  473. const PVector<T,TAPS> coef_;
  474. PVector<T,TAPS+BLOCK-1> state_;
  475. };
  476. template<typename T,int BLOCK,int TAPS>
  477. static void test()
  478. {
  479. constexpr int NB = BLOCK;
  480. std::cout << "----\r\n(" << BLOCK << "," << TAPS << ")\r\n";
  481. typename FirType<T>::type S;
  482. PVector<T,NB> signal;
  483. PVector<T,TAPS> coefs;
  484. FirType<T>::init_coef(coefs.ptr(),TAPS);
  485. init_array(signal,NB);
  486. FIR<T,BLOCK,TAPS> fir(coefs);
  487. INIT_SYSTICK;
  488. START_CYCLE_MEASUREMENT;
  489. startSectionNB(1);
  490. PVector<T,BLOCK> res = fir.filter(signal);
  491. //PVector<T,BLOCK> res;
  492. //fir.purec(signal.const_ptr(),res.ptr());
  493. stopSectionNB(1);
  494. STOP_CYCLE_MEASUREMENT;
  495. T* state;
  496. T* coefsb;
  497. state=(T*)malloc(sizeof(T)*(TAPS+BLOCK+BLOCK));
  498. coefsb=(T*)malloc(sizeof(T)*(TAPS+32));
  499. memset(coefsb,0,sizeof(T)*(TAPS+32));
  500. for(int i =0;i<TAPS;i++)
  501. {
  502. coefsb[i] = coefs[i];
  503. }
  504. FirType<T>::init_state(&S,TAPS,coefsb,state,BLOCK);
  505. PVector<T,NB> ref;
  506. //std::cout << "---\r\n";
  507. INIT_SYSTICK;
  508. START_CYCLE_MEASUREMENT;
  509. arm_fir_q15(&S,
  510. reinterpret_cast<const q15_t*>(signal.const_ptr()),
  511. reinterpret_cast<q15_t*>(ref.ptr()),BLOCK);
  512. STOP_CYCLE_MEASUREMENT;
  513. if (!validate(res.const_ptr(),ref.const_ptr(),BLOCK))
  514. {
  515. printf("fir failed \r\n");
  516. }
  517. free(state);
  518. free(coefsb);
  519. }
  520. template<typename T>
  521. void all_filter_test()
  522. {
  523. title<T>("FIR test");
  524. test<T,NBVEC_4,8>();
  525. test<T,NBVEC_4,16>();
  526. test<T,NBVEC_4,24>();
  527. test<T,NBVEC_4,32>();
  528. test<T,NBVEC_4,64>();
  529. test<T,NBVEC_32,8>();
  530. test<T,NBVEC_32,16>();
  531. test<T,NBVEC_32,24>();
  532. test<T,NBVEC_32,32>();
  533. test<T,NBVEC_32,64>();
  534. test<T,NBVEC_256,8>();
  535. test<T,NBVEC_256,16>();
  536. test<T,NBVEC_256,24>();
  537. test<T,NBVEC_256,32>();
  538. test<T,NBVEC_256,64>();
  539. test<T,NBVEC_512,8>();
  540. test<T,NBVEC_512,16>();
  541. test<T,NBVEC_512,24>();
  542. test<T,NBVEC_512,32>();
  543. test<T,NBVEC_512,64>();
  544. }
  545. void filter_test()
  546. {
  547. //all_filter_test<Q15>();
  548. }