/******************************************************************************
 * @file     arm_vec_filtering.h
 * @brief    Private header file for CMSIS DSP Library
 * @version  V1.7.0
 * @date     30. October 2019
 ******************************************************************************/
/*
 * Copyright (c) 2010-2019 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef _ARM_VEC_FILTERING_H_
#define _ARM_VEC_FILTERING_H_

#include "arm_math.h"
#include "arm_helium_utils.h"

#ifdef __cplusplus
extern "C"
{
#endif
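
/*
 * Vectorized correlation (CORR) and convolution (CONV) helper macros for
 * Arm Helium (MVE). Each macro accumulates one, two, or four MAC sums over
 * `count` samples and relies on tail predication (vctp32q/vctp16q/vctp8q)
 * to mask the unused lanes of the final partial vector. They are private
 * helpers, intended for the library's arm_conv_* / arm_correlate_* kernels.
 */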
#if (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE)

#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F32(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    float32_t const *pSrcX, *pSrcY; \
    f32x4_t acc0Vec, acc1Vec, acc2Vec, acc3Vec, xVec, yVec; \
    uint32_t k; \
    \
    acc0Vec = vdupq_n_f32(0.0f); \
    acc1Vec = vdupq_n_f32(0.0f); \
    acc2Vec = vdupq_n_f32(0.0f); \
    acc3Vec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    pSrcY = (float32_t const *) pY; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_f32(&pSrcX[1]); \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
        xVec = vldrwq_f32(&pSrcX[2]); \
        acc2Vec = vfmaq_f32(acc2Vec, xVec, yVec); \
        xVec = vldrwq_f32(&pSrcX[3]); \
        acc3Vec = vfmaq_f32(acc3Vec, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_f32(&pSrcX[1]); \
        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
        xVec = vldrwq_f32(&pSrcX[2]); \
        acc2Vec = vfmaq_m_f32(acc2Vec, xVec, yVec, p0); \
        xVec = vldrwq_f32(&pSrcX[3]); \
        acc3Vec = vfmaq_m_f32(acc3Vec, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
    } \
    \
    acc0 = vecAddAcrossF32Mve(acc0Vec); \
    acc1 = vecAddAcrossF32Mve(acc1Vec); \
    acc2 = vecAddAcrossF32Mve(acc2Vec); \
    acc3 = vecAddAcrossF32Mve(acc3Vec); \
}
#define MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count) \
{ \
    float32_t const *pSrcX, *pSrcY; \
    f32x4_t accVec, xVec, yVec; \
    uint32_t k; \
    \
    accVec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    pSrcY = (float32_t const *) pY; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        accVec = vfmaq_f32(accVec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0); \
    } \
    acc = vecAddAcrossF32Mve(accVec); \
}
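
/*
 * Illustrative sketch (not part of the library API): how a caller might
 * drive MVE_INTR_CORR_SINGLE_F32 to compute one correlation lag. The
 * function and pointer names below are hypothetical.
 *
 *   static float32_t corr_lag_f32(const float32_t *pA, const float32_t *pB,
 *                                 uint32_t n)
 *   {
 *       float32_t acc;
 *       MVE_INTR_CORR_SINGLE_F32(acc, pA, pB, n);
 *       return acc;   // acc = sum of pA[i] * pB[i], i = 0 .. n-1
 *   }
 *
 * Note that the tail uses full-width (unpredicated) loads, so both buffers
 * are read up to the next multiple of four floats even though the extra
 * lanes are masked out of the MAC.
 */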
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count) \
{ \
    float32_t const *pSrcX, *pSrcY; \
    f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
    uint32_t k; \
    \
    acc0Vec = vdupq_n_f32(0.0f); \
    acc1Vec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    pSrcY = (float32_t const *) pY; \
    k = (count - 1) >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_f32(&pSrcX[1]); \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires the exact number of samples (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    k = (count - 1) % 0x4U; \
    mve_pred16_t p0 = vctp32q(k); \
    yVec = vld1q(pSrcY); \
    pSrcY += 4; \
    xVec = vldrwq_f32(&pSrcX[1]); \
    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 4; \
    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
    \
    acc0 = vecAddAcrossF32Mve(acc0Vec); \
    acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count) \
{ \
    float32_t const *pSrcX, *pSrcY; \
    f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
    uint32_t k; \
    \
    acc0Vec = vdupq_n_f32(0.0f); \
    acc1Vec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    pSrcY = (float32_t const *) pY; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_f32(&pSrcX[1]); \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_f32(&pSrcX[1]); \
        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
    } \
    \
    acc0 = vecAddAcrossF32Mve(acc0Vec); \
    acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count) \
{ \
    float32_t const *pSrcX, *pSrcY; \
    f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
    uint32_t k; \
    \
    acc0Vec = vdupq_n_f32(0.0f); \
    acc1Vec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    pSrcY = (float32_t const *) pY; \
    k = count >> 2; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        yVec = vldrwq_f32(&pSrcY[-1]); \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x4U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    mve_pred16_t p0 = vctp32q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 4; \
    yVec = vldrwq_f32(&pSrcY[-1]); \
    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
    /* acc0 requires the exact number of samples */ \
    /* disable extra lanes in final MAC computation */ \
    p0 = vctp32q(k); \
    yVec = vld1q(pSrcY); \
    pSrcY += 4; \
    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
    \
    acc0 = vecAddAcrossF32Mve(acc0Vec); \
    acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count) \
{ \
    float32_t const *pSrcX; \
    f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
    uint32_t k; \
    \
    acc0Vec = vdupq_n_f32(0.0f); \
    acc1Vec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    k = (count - 1) >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
        pY -= 4; \
        xVec = vldrwq_f32(&pSrcX[1]); \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = (count - 1) % 0x4U; \
    mve_pred16_t p0 = vctp32q(k); \
    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
    xVec = vldrwq_f32(&pSrcX[1]); \
    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
    xVec = vld1q(pSrcX); \
    pSrcX += 4; \
    p0 = vctp32q(k + 1); \
    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
    \
    acc0 = vecAddAcrossF32Mve(acc0Vec); \
    acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count) \
{ \
    float32_t const *pSrcX; \
    f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
    uint32_t k; \
    \
    acc0Vec = vdupq_n_f32(0.0f); \
    acc1Vec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
        pY -= 4; \
        xVec = vldrwq_f32(&pSrcX[1]); \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
        xVec = vldrwq_f32(&pSrcX[1]); \
        acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
    } \
    acc0 = vecAddAcrossF32Mve(acc0Vec); \
    acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count) \
{ \
    float32_t const *pSrcX; \
    const float32_t *pY1 = pY + 1; \
    f32x4_t acc0Vec, acc1Vec, xVec, yVec; \
    uint32_t k; \
    \
    acc0Vec = vdupq_n_f32(0.0f); \
    acc1Vec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
        pY -= 4; \
        acc0Vec = vfmaq_f32(acc0Vec, xVec, yVec); \
        yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \
        pY1 -= 4; \
        acc1Vec = vfmaq_f32(acc1Vec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x4U; \
    /* use predication to finalize MAC sum */ \
    /* acc0 requires the exact number of samples */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp32q(k); \
    xVec = vld1q(pSrcX); \
    pSrcX += 4; \
    yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
    acc0Vec = vfmaq_m_f32(acc0Vec, xVec, yVec, p0); \
    yVec = vldrwq_gather_shifted_offset_f32(pY1, decrIdxVec); \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k + 1); \
    acc1Vec = vfmaq_m_f32(acc1Vec, xVec, yVec, p0); \
    \
    acc0 = vecAddAcrossF32Mve(acc0Vec); \
    acc1 = vecAddAcrossF32Mve(acc1Vec); \
}
#define MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count) \
{ \
    float32_t const *pSrcX; \
    f32x4_t accVec, xVec, yVec; \
    uint32_t k; \
    \
    accVec = vdupq_n_f32(0.0f); \
    pSrcX = (float32_t const *) pX; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
        pY -= 4; \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        accVec = vfmaq_f32(accVec, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        yVec = vldrwq_gather_shifted_offset_f32(pY, decrIdxVec); \
        accVec = vfmaq_m_f32(accVec, xVec, yVec, p0); \
    } \
    acc = vecAddAcrossF32Mve(accVec); \
}
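
/*
 * Note on `decrIdxVec`: the CONV_* macros above read it as a free variable,
 * so the enclosing function must declare it before invoking them. Judging by
 * the gather loads followed by `pY -= 4`, it holds descending lane indices
 * so that y is traversed backwards (time-reversed, as convolution requires).
 * A plausible declaration for the f32/q31 variants (an assumption, sketched
 * here rather than taken from this file):
 *
 *   uint32x4_t decrIdxVec = vld1q_u32((const uint32_t[4]){ 3, 2, 1, 0 });
 *
 * The q15/q7 variants need 8 resp. 16 descending indices of matching width.
 */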
#endif /* (defined(ARM_MATH_MVEF) || defined(ARM_MATH_HELIUM)) && !defined(ARM_MATH_AUTOVECTORIZE) */
#if (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM))

#define MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count) \
{ \
    q31_t const *pSrcX; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        pY -= 4; \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc = vmlaldavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = asrl(acc, 31); \
}
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count) \
{ \
    q31_t const *pSrcX; \
    const q31_t *pY1 = pY + 1; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        pY -= 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \
        pY1 -= 4; \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x4U; \
    /* use predication to finalize MAC sum */ \
    /* acc0 requires the exact number of samples */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp32q(k); \
    xVec = vld1q(pSrcX); \
    pSrcX += 4; \
    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    yVec = vldrwq_gather_shifted_offset_s32(pY1, decrIdxVec); \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k + 1); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \
{ \
    q31_t const *pSrcX; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    k = (count - 1) >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        pY -= 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = (count - 1) % 0x4U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires the exact number of samples (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp32q(k); \
    yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
    xVec = vldrwq_s32(&pSrcX[1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 4; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \
{ \
    q31_t const *pSrcX; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        pY -= 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q31_t const *pSrcX; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        pY -= 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrwq_s32(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrwq_s32(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vldrwq_gather_shifted_offset_s32(pY, decrIdxVec); \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrwq_s32(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrwq_s32(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
    acc2 = asrl(acc2, 31); \
    acc3 = asrl(acc3, 31); \
}
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count) \
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        yVec = vldrwq_s32(&pSrcY[-1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x4U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    mve_pred16_t p0 = vctp32q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 4; \
    yVec = vldrwq_s32(&pSrcY[-1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires the exact number of samples */ \
    /* disable extra lanes in final MAC computation */ \
    p0 = vctp32q(k); \
    yVec = vld1q(pSrcY); \
    pSrcY += 4; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
#define MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count) \
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        acc = vmlaldavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = asrl(acc, 31); \
}
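
/*
 * The q31 macros accumulate with vmlaldavaq, a widening multiply-accumulate,
 * so their `acc` arguments must be 64-bit and must be initialized by the
 * caller (the macros accumulate into the value passed in). The final
 * `asrl(acc, 31)` is a 64-bit arithmetic shift that rescales the 2.62
 * product sum back to 1.31. Illustrative sketch with hypothetical names:
 *
 *   q63_t acc = 0;
 *   MVE_INTR_CORR_SINGLE_Q31(acc, pA, pB, n);
 *   q31_t result = (q31_t) acc;
 */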
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrwq_s32(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrwq_s32(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrwq_s32(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrwq_s32(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
    acc2 = asrl(acc2, 31); \
    acc3 = asrl(acc3, 31); \
}
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count) \
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = count >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x4U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp32q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count) \
{ \
    q31_t const *pSrcX, *pSrcY; \
    q31x4_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q31_t const *) pX; \
    pSrcY = (q31_t const *) pY; \
    k = (count - 1) >> 2; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 4; \
        xVec = vldrwq_s32(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 4; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires the exact number of samples (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    k = (count - 1) % 0x4U; \
    mve_pred16_t p0 = vctp32q(k); \
    yVec = vld1q(pSrcY); \
    pSrcY += 4; \
    xVec = vldrwq_s32(&pSrcX[1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp32q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 4; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 31); \
    acc1 = asrl(acc1, 31); \
}
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = count >> 3; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        yVec = vldrhq_s16(&pSrcY[-1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        yVec = vld1q(pSrcY); \
        pSrcY += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x8U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 8; \
    yVec = vldrhq_s16(&pSrcY[-1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires the exact number of samples */ \
    /* disable extra lanes in final MAC computation */ \
    p0 = vctp16q(k); \
    yVec = vld1q(pSrcY); \
    pSrcY += 8; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
#define MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count) \
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = count >> 3; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        yVec = vld1q(pSrcY); \
        pSrcY += 8; \
        acc = vmlaldavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        yVec = vld1q(pSrcY); \
        pSrcY += 8; \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = asrl(acc, 15); \
    acc = __SSAT(acc, 16); \
}
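
/*
 * The q15 variants process 8 samples per vector (count >> 3, vctp16q) and,
 * like the q31 ones, accumulate into a caller-initialized 64-bit acc; after
 * the >> 15 rescale the result is saturated to 16 bits with __SSAT.
 * Sketch with hypothetical names:
 *
 *   q63_t acc = 0;
 *   MVE_INTR_CORR_SINGLE_Q15(acc, pA, pB, n);
 *   q15_t result = (q15_t) acc;
 */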
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = count >> 3; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc2 = asrl(acc2, 15); \
    acc3 = asrl(acc3, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
    acc2 = __SSAT(acc2, 16); \
    acc3 = __SSAT(acc3, 16); \
}
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = count >> 3; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX, *pSrcY; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    pSrcY = (q15_t const *) pY; \
    k = (count - 1) >> 3; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires the exact number of samples (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    k = (count - 1) % 0x8U; \
    mve_pred16_t p0 = vctp16q(k); \
    yVec = vld1q(pSrcY); \
    pSrcY += 8; \
    xVec = vldrhq_s16(&pSrcX[1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp16q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 8; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    const q15_t *pY1 = pY + 1; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
    \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY -= 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \
        pY1 -= 8; \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x8U; \
    /* use predication to finalize MAC sum */ \
    /* acc0 requires the exact number of samples */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k); \
    xVec = vld1q(pSrcX); \
    pSrcX += 8; \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    yVec = vldrhq_gather_shifted_offset_s16(pY1, decrIdxVec); \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp16q(k + 1); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
#define MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
    \
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY -= 8; \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc = vmlaldavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        acc = vmlaldavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = asrl(acc, 15); \
    acc = __SSAT(acc, 16); \
}
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
    \
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY -= 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq(acc2, xVec, yVec); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[2]); \
        acc2 = vmlaldavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrhq_s16(&pSrcX[3]); \
        acc3 = vmlaldavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc2 = asrl(acc2, 15); \
    acc3 = asrl(acc3, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
    acc2 = __SSAT(acc2, 16); \
    acc3 = __SSAT(acc3, 16); \
}
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    k = count >> 3; \
    \
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY -= 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x8U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp16q(k); \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count) \
{ \
    q15_t const *pSrcX; \
    q15x8_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q15_t const *) pX; \
    k = (count - 1) >> 3; \
    \
    while (k > 0U) \
    { \
        yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
        pY -= 8; \
        xVec = vldrhq_s16(&pSrcX[1]); \
        acc1 = vmlaldavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 8; \
        acc0 = vmlaldavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = (count - 1) % 0x8U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires the exact number of samples (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp16q(k); \
    yVec = vldrhq_gather_shifted_offset_s16(pY, decrIdxVec); \
    xVec = vldrhq_s16(&pSrcX[1]); \
    acc1 = vmlaldavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp16q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 8; \
    acc0 = vmlaldavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = asrl(acc0, 15); \
    acc1 = asrl(acc1, 15); \
    acc0 = __SSAT(acc0, 16); \
    acc1 = __SSAT(acc1, 16); \
}
#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count) \
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = count >> 4; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        yVec = vldrbq_s8(&pSrcY[-1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        yVec = vld1q(pSrcY); \
        pSrcY += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x10U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    mve_pred16_t p0 = vctp8q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 16; \
    yVec = vldrbq_s8(&pSrcY[-1]); \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires the exact number of samples */ \
    /* disable extra lanes in final MAC computation */ \
    p0 = vctp8q(k); \
    yVec = vld1q(pSrcY); \
    pSrcY += 16; \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}
#define MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count) \
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = count >> 4; \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        yVec = vld1q(pSrcY); \
        pSrcY += 16; \
        acc = vmladavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        yVec = vld1q(pSrcY); \
        pSrcY += 16; \
        acc = vmladavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = (acc >> 7); \
    acc = __SSAT(acc, 8); \
}
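
/*
 * The q7 variants use vmladavaq (no widening 'l'), which accumulates into
 * 32 bits, so a caller-initialized q31_t accumulator suffices; 16 samples
 * are handled per vector (count >> 4, vctp8q), and results are rescaled by
 * >> 7 then saturated to 8 bits. Sketch with hypothetical names:
 *
 *   q31_t acc = 0;
 *   MVE_INTR_CORR_SINGLE_Q7(acc, pA, pB, n);
 *   q7_t result = (q7_t) acc;
 */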
#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = count >> 4; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vldrbq_s8(&pSrcX[2]); \
        acc2 = vmladavaq(acc2, xVec, yVec); \
        xVec = vldrbq_s8(&pSrcX[3]); \
        acc3 = vmladavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrbq_s8(&pSrcX[2]); \
        acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrbq_s8(&pSrcX[3]); \
        acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    } \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc2 = (acc2 >> 7); \
    acc3 = (acc3 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
    acc2 = __SSAT(acc2, 8); \
    acc3 = __SSAT(acc3, 8); \
}
#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = count >> 4; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* loop + tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        yVec = vld1q(pSrcY); \
        pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    } \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}
#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \
{ \
    q7_t const *pSrcX, *pSrcY; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    pSrcY = (q7_t const *) pY; \
    k = (count - 1) >> 4; \
    \
    while (k > 0U) \
    { \
        yVec = vld1q(pSrcY); \
        pSrcY += 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires the exact number of samples (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    k = (count - 1) % 0x10U; \
    mve_pred16_t p0 = vctp8q(k); \
    yVec = vld1q(pSrcY); \
    pSrcY += 16; \
    xVec = vldrbq_s8(&pSrcX[1]); \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp8q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 16; \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}
#define MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    const q7_t *pY1 = pY + 1; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    k = count >> 4; \
    \
    while (k > 0U) \
    { \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY -= 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \
        pY1 -= 16; \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = count % 0x10U; \
    /* use predication to finalize MAC sum */ \
    /* acc0 requires the exact number of samples */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp8q(k); \
    xVec = vld1q(pSrcX); \
    pSrcX += 16; \
    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    yVec = vldrbq_gather_offset_s8(pY1, decrIdxVec); \
    /* acc1 requires 1 additional sample */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp8q(k + 1); \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}
#define MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    k = count >> 4; \
    \
    while (k > 0U) \
    { \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY -= 16; \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc = vmladavaq(acc, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        acc = vmladavaq_p(acc, xVec, yVec, p0); \
    } \
    acc = __SSAT(acc >> 7, 8); \
}
#define MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    k = count >> 4; \
    \
    while (k > 0U) \
    { \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY -= 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vldrbq_s8(&pSrcX[2]); \
        acc2 = vmladavaq(acc2, xVec, yVec); \
        xVec = vldrbq_s8(&pSrcX[3]); \
        acc3 = vmladavaq(acc3, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
        xVec = vldrbq_s8(&pSrcX[2]); \
        acc2 = vmladavaq_p(acc2, xVec, yVec, p0); \
        xVec = vldrbq_s8(&pSrcX[3]); \
        acc3 = vmladavaq_p(acc3, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = __SSAT(acc0 >> 7, 8); \
    acc1 = __SSAT(acc1 >> 7, 8); \
    acc2 = __SSAT(acc2 >> 7, 8); \
    acc3 = __SSAT(acc3 >> 7, 8); \
}
#define MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    k = count >> 4; \
    \
    while (k > 0U) \
    { \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY -= 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    /* Loop with tail predication expected here */ \
    k = count % 0x10U; \
    if (k > 0U) \
    { \
        mve_pred16_t p0 = vctp8q(k); \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    } \
    acc0 = __SSAT(acc0 >> 7, 8); \
    acc1 = __SSAT(acc1 >> 7, 8); \
}
#define MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count) \
{ \
    q7_t const *pSrcX; \
    q7x16_t xVec, yVec; \
    uint32_t k; \
    \
    pSrcX = (q7_t const *) pX; \
    k = (count - 1) >> 4; \
    \
    while (k > 0U) \
    { \
        yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
        pY -= 16; \
        xVec = vldrbq_s8(&pSrcX[1]); \
        acc1 = vmladavaq(acc1, xVec, yVec); \
        xVec = vld1q(pSrcX); \
        pSrcX += 16; \
        acc0 = vmladavaq(acc0, xVec, yVec); \
        /* Decrement the loop counter */ \
        k--; \
    } \
    k = (count - 1) % 0x10U; \
    /* use predication to finalize MAC sum */ \
    /* acc1 requires the exact number of samples (count-1) */ \
    /* disable extra lanes in final MAC computation */ \
    mve_pred16_t p0 = vctp8q(k); \
    yVec = vldrbq_gather_offset_s8(pY, decrIdxVec); \
    xVec = vldrbq_s8(&pSrcX[1]); \
    acc1 = vmladavaq_p(acc1, xVec, yVec, p0); \
    /* acc0 requires 1 additional sample (count) */ \
    /* so add 1 to unmask an extra lane in the final MAC computation */ \
    p0 = vctp8q(k + 1); \
    xVec = vld1q(pSrcX); \
    pSrcX += 16; \
    acc0 = vmladavaq_p(acc0, xVec, yVec, p0); \
    \
    acc0 = (acc0 >> 7); \
    acc1 = (acc1 >> 7); \
    acc0 = __SSAT(acc0, 8); \
    acc1 = __SSAT(acc1, 8); \
}
#endif /* (defined(ARM_MATH_MVEI) || defined(ARM_MATH_HELIUM)) */

#ifdef __cplusplus
}
#endif

#endif /* _ARM_VEC_FILTERING_H_ */