/* debug_mat.h */
  1. void pmat(float32_t *p,int nbrows,int nbcols)
  2. {
  3. for(int r=0;r<nbrows;r++)
  4. {
  5. for(int c=0;c<nbcols;c++)
  6. {
  7. printf("%f ",(double)p[c+r*nbcols]);
  8. }
  9. printf("\r\n");
  10. }
  11. printf("\r\n");
  12. }
  13. void pvec(float32_t *p,int nb)
  14. {
  15. for(int c=0;c<nb;c++)
  16. {
  17. printf("%f ",(double)p[c]);
  18. }
  19. printf("\r\n");
  20. }
  21. void pvec(Q7 *p,int nb)
  22. {
  23. for(int c=0;c<nb;c++)
  24. {
  25. printf("%f ",(double)(1.0f*p[c].v/128.0f));
  26. }
  27. printf("\r\n");
  28. }
  #if !defined(ARM_MATH_AUTOVECTORIZE)
  #if defined(ARM_MATH_MVEF)
  /**
   * QR decomposition of a float32 matrix using Householder reflections
   * (Helium / MVE build).
   *
   * The source matrix is copied into pOutR and then processed column by
   * column. For each column a Householder vector is obtained from
   * arm_householder_f32(), its scaling factor is appended to pOutTau, the
   * trailing sub-matrix is updated in place, and the vector components below
   * the diagonal are written back into pOutR under the diagonal. When pOutQ
   * is not NULL, Q is initialised to identity and the stored reflectors are
   * applied to it, starting from position numRows - nb and moving backwards.
   *
   * All tail ("& 3") paths use predicated loads so that no lane beyond the
   * remaining element count is read from the matrix or scratch buffers.
   *
   * @param pSrc       input matrix; numRows must be >= numCols
   * @param threshold  forwarded unchanged to arm_householder_f32()
   * @param pOutR      output matrix; receives a copy of pSrc (dimensions are
   *                   overwritten with those of pSrc) and is updated in place
   * @param pOutQ      output Q matrix, or NULL to skip Q generation.
   *                   NOTE(review): the code mixes numRows and numCols as the
   *                   row stride of Q, so Q is presumably square -- confirm.
   * @param pOutTau    receives one scaling factor per column of pSrc
   * @param pTmpA      scratch vector holding the current Householder vector
   * @param pTmpB      scratch vector holding v^T * A for the current step
   *                   NOTE(review): required scratch sizes are not visible
   *                   here -- presumably at least numRows each; confirm.
   * @return ARM_MATH_SIZE_MISMATCH when pSrc has fewer rows than columns,
   *         ARM_MATH_SUCCESS otherwise.
   */
  arm_status _arm_mat_qr_f32(
      const arm_matrix_instance_f32 * pSrc,
      const float32_t threshold,
      arm_matrix_instance_f32 * pOutR,
      arm_matrix_instance_f32 * pOutQ,
      float32_t * pOutTau,
      float32_t *pTmpA,
      float32_t *pTmpB
      )
  {
    int32_t col=0;
    int32_t nb,pos;
    float32_t *pa,*pc;
    float32_t beta;
    float32_t *pv;
    float32_t *pdst;
    float32_t *p;

    if (pSrc->numRows < pSrc->numCols)
    {
      return(ARM_MATH_SIZE_MISMATCH);
    }

    memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
    pOutR->numCols = pSrc->numCols;
    pOutR->numRows = pSrc->numRows;

    /* p tracks the top-left element of the trailing sub-matrix A(col:,col:) */
    p = pOutR->pData;
    pc = pOutTau;
    for(col=0 ; col < pSrc->numCols; col++)
    {
      int32_t j,k,blkCnt,blkCnt2;
      float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
      float32_t temp;
      float32x4_t v1,v2,vtemp;

      COPY_COL_F32(pOutR,col,col,pTmpA);
      beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
      *pc++ = beta;

      //pvec(pTmpA,pSrc->numRows-col);
      //pmat(p,pSrc->numRows-col,pSrc->numCols-col);

      pdst = pTmpB;

      /* v.T A(col:,col:) -> tmpb */
      pv = pTmpA;
      pa = p;

      /* First row initialises tmpb (multiply only, no accumulation). */
      temp = *pv;
      blkCnt = (pSrc->numCols-col) >> 2;
      while (blkCnt > 0)
      {
        v1 = vld1q_f32(pa);
        v2 = vmulq_n_f32(v1,temp);
        vst1q_f32(pdst,v2);
        pa += 4;
        pdst += 4;
        blkCnt--;
      }
      blkCnt = (pSrc->numCols-col) & 3;
      if (blkCnt > 0)
      {
        mve_pred16_t p0 = vctp32q(blkCnt);
        /* Predicated load: do not read past the end of the row. */
        v1 = vldrwq_z_f32(pa,p0);
        v2 = vmulq_n_f32(v1,temp);
        vst1q_p_f32(pdst,v2,p0);
        pa += blkCnt;
      }

      /* Skip the first `col` elements of the next row. */
      pa += col;
      pv++;
      pdst = pTmpB;

      pa0 = pa;
      pa1 = pa0 + pSrc->numCols;
      pa2 = pa1 + pSrc->numCols;
      pa3 = pa2 + pSrc->numCols;

      /* Unrolled loop: accumulate four rows per iteration */
      blkCnt = (pSrc->numRows-col - 1) >> 2;
      k=1;
      while(blkCnt > 0)
      {
        vtemp=vld1q_f32(pv);

        blkCnt2 = (pSrc->numCols-col) >> 2;
        while (blkCnt2 > 0)
        {
          v1 = vld1q_f32(pdst);

          v2 = vld1q_f32(pa0);
          v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));

          v2 = vld1q_f32(pa1);
          v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));

          v2 = vld1q_f32(pa2);
          v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));

          v2 = vld1q_f32(pa3);
          v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));

          vst1q_f32(pdst,v1);

          pdst += 4;
          pa0 += 4;
          pa1 += 4;
          pa2 += 4;
          pa3 += 4;
          blkCnt2--;
        }
        blkCnt2 = (pSrc->numCols-col) & 3;
        if (blkCnt2 > 0)
        {
          mve_pred16_t p0 = vctp32q(blkCnt2);

          v1 = vldrwq_z_f32(pdst,p0);

          v2 = vldrwq_z_f32(pa0,p0);
          v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,0));

          v2 = vldrwq_z_f32(pa1,p0);
          v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,1));

          v2 = vldrwq_z_f32(pa2,p0);
          v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,2));

          v2 = vldrwq_z_f32(pa3,p0);
          v1 = vfmaq_n_f32(v1,v2,vgetq_lane(vtemp,3));

          vst1q_p_f32(pdst,v1,p0);

          pa0 += blkCnt2;
          pa1 += blkCnt2;
          pa2 += blkCnt2;
          pa3 += blkCnt2;
        }

        /* Advance each row pointer by four rows (one was consumed above). */
        pa0 += col + 3*pSrc->numCols;
        pa1 += col + 3*pSrc->numCols;
        pa2 += col + 3*pSrc->numCols;
        pa3 += col + 3*pSrc->numCols;
        pv += 4;
        pdst = pTmpB;
        k += 4;
        blkCnt--;
      }

      /* Remaining rows, one at a time */
      pa = pa0;
      for(;k<pSrc->numRows-col; k++)
      {
        temp = *pv;
        blkCnt2 = (pSrc->numCols-col) >> 2;
        while (blkCnt2 > 0)
        {
          v1 = vld1q_f32(pa);
          v2 = vld1q_f32(pdst);
          v2 = vfmaq_n_f32(v2,v1,temp);
          vst1q_f32(pdst,v2);
          pa += 4;
          pdst += 4;
          blkCnt2--;
        }
        blkCnt2 = (pSrc->numCols-col) & 3;
        if (blkCnt2 > 0)
        {
          mve_pred16_t p0 = vctp32q(blkCnt2);
          v1 = vldrwq_z_f32(pa,p0);
          v2 = vldrwq_z_f32(pdst,p0);
          v2 = vfmaq_n_f32(v2,v1,temp);
          vst1q_p_f32(pdst,v2,p0);
          pa += blkCnt2;
        }

        pa += col;
        pv++;
        pdst = pTmpB;
      }

      //pvec(pTmpB,pSrc->numCols-col);
      //printf("--\r\n");

      /* A(col:,col:) - beta v tmpb */
      pa = p;
      for(j=0;j<pSrc->numRows-col; j++)
      {
        /* Negated so the update can be expressed as a fused multiply-add. */
        float32_t f = -beta * pTmpA[j];
        ptemp = pTmpB;

        blkCnt2 = (pSrc->numCols-col) >> 2;
        while (blkCnt2 > 0)
        {
          v1 = vld1q_f32(pa);
          v2 = vld1q_f32(ptemp);
          v1 = vfmaq_n_f32(v1,v2,f);
          vst1q_f32(pa,v1);
          pa += 4;
          ptemp += 4;
          blkCnt2--;
        }
        blkCnt2 = (pSrc->numCols-col) & 3;
        if (blkCnt2 > 0)
        {
          mve_pred16_t p0 = vctp32q(blkCnt2);
          v1 = vldrwq_z_f32(pa,p0);
          v2 = vldrwq_z_f32(ptemp,p0);
          v1 = vfmaq_n_f32(v1,v2,f);
          vst1q_p_f32(pa,v1,p0);
          pa += blkCnt2;
        }

        pa += col;
      }

      /* Copy Householder reflectors into R matrix */
      pa = p + pOutR->numCols;
      for(k=0;k<pSrc->numRows-col-1; k++)
      {
        *pa = pTmpA[k+1];
        pa += pOutR->numCols;
      }

      /* Move to the next diagonal element. */
      p += 1 + pOutR->numCols;
    }

    /* Generate Q if requested by user matrix */
    if (pOutQ != NULL)
    {
      /* Initialize Q matrix to identity */
      memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);

      pa = pOutQ->pData;
      for(col=0 ; col < pOutQ->numCols; col++)
      {
        *pa = 1.0f;
        pa += pOutQ->numCols+1;
      }

      nb = pOutQ->numRows - pOutQ->numCols + 1;

      /* Scaling factors are consumed from last to first. */
      pc = pOutTau + pOutQ->numCols - 1;
      for(col=0 ; col < pOutQ->numCols; col++)
      {
        int32_t j,k, blkCnt, blkCnt2;
        float32_t *pa0,*pa1,*pa2,*pa3,*ptemp;
        float32_t temp;
        float32x4_t v1,v2,vtemp;

        pos = pSrc->numRows - nb;
        p = pOutQ->pData + pos + pOutQ->numCols*pos ;

        COPY_COL_F32(pOutR,pos,pos,pTmpA);
        /* The first component of the vector is forced to 1 (the slot in R
           holds the diagonal of R, not the vector component). */
        pTmpA[0] = 1.0f;
        pdst = pTmpB;

        /* v.T A(col:,col:) -> tmpb */
        pv = pTmpA;
        pa = p;

        temp = *pv;
        blkCnt2 = (pOutQ->numRows-pos) >> 2;
        while (blkCnt2 > 0)
        {
          v1 = vld1q_f32(pa);
          v1 = vmulq_n_f32(v1, temp);
          vst1q_f32(pdst,v1);
          pa += 4;
          pdst += 4;
          blkCnt2--;
        }
        blkCnt2 = (pOutQ->numRows-pos) & 3;
        if (blkCnt2 > 0)
        {
          mve_pred16_t p0 = vctp32q(blkCnt2);
          /* Predicated load: do not read past the end of the row. */
          v1 = vldrwq_z_f32(pa,p0);
          v1 = vmulq_n_f32(v1, temp);
          vst1q_p_f32(pdst,v1,p0);
          pa += blkCnt2;
        }

        pa += pos;
        pv++;
        pdst = pTmpB;

        pa0 = pa;
        pa1 = pa0 + pOutQ->numRows;
        pa2 = pa1 + pOutQ->numRows;
        pa3 = pa2 + pOutQ->numRows;

        /* Unrolled loop: accumulate four rows per iteration */
        blkCnt = (pOutQ->numRows-pos - 1) >> 2;
        k=1;
        while(blkCnt > 0)
        {
          vtemp = vld1q_f32(pv);

          blkCnt2 = (pOutQ->numRows-pos) >> 2;
          while (blkCnt2 > 0)
          {
            v1 = vld1q_f32(pdst);

            v2 = vld1q_f32(pa0);
            v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));

            v2 = vld1q_f32(pa1);
            v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));

            v2 = vld1q_f32(pa2);
            v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));

            v2 = vld1q_f32(pa3);
            v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));

            vst1q_f32(pdst,v1);

            pa0 += 4;
            pa1 += 4;
            pa2 += 4;
            pa3 += 4;
            pdst += 4;
            blkCnt2--;
          }
          blkCnt2 = (pOutQ->numRows-pos) & 3;
          if (blkCnt2 > 0)
          {
            mve_pred16_t p0 = vctp32q(blkCnt2);

            v1 = vldrwq_z_f32(pdst,p0);

            v2 = vldrwq_z_f32(pa0,p0);
            v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,0));

            v2 = vldrwq_z_f32(pa1,p0);
            v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,1));

            v2 = vldrwq_z_f32(pa2,p0);
            v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,2));

            v2 = vldrwq_z_f32(pa3,p0);
            v1 = vfmaq_n_f32(v1, v2, vgetq_lane(vtemp,3));

            vst1q_p_f32(pdst,v1,p0);

            pa0 += blkCnt2;
            pa1 += blkCnt2;
            pa2 += blkCnt2;
            pa3 += blkCnt2;
          }

          pa0 += pos + 3*pOutQ->numRows;
          pa1 += pos + 3*pOutQ->numRows;
          pa2 += pos + 3*pOutQ->numRows;
          pa3 += pos + 3*pOutQ->numRows;
          pv += 4;
          pdst = pTmpB;
          k += 4;
          blkCnt--;
        }

        /* Remaining rows, one at a time */
        pa = pa0;
        for(;k<pOutQ->numRows-pos; k++)
        {
          temp = *pv;
          blkCnt2 = (pOutQ->numRows-pos) >> 2;
          while (blkCnt2 > 0)
          {
            v1 = vld1q_f32(pdst);
            v2 = vld1q_f32(pa);
            v1 = vfmaq_n_f32(v1, v2, temp);
            vst1q_f32(pdst,v1);
            pdst += 4;
            pa += 4;
            blkCnt2--;
          }
          blkCnt2 = (pOutQ->numRows-pos) & 3;
          if (blkCnt2 > 0)
          {
            mve_pred16_t p0 = vctp32q(blkCnt2);
            v1 = vldrwq_z_f32(pdst,p0);
            v2 = vldrwq_z_f32(pa,p0);
            v1 = vfmaq_n_f32(v1, v2, temp);
            vst1q_p_f32(pdst,v1,p0);
            pa += blkCnt2;
          }

          pa += pos;
          pv++;
          pdst = pTmpB;
        }

        /* Q(pos:,pos:) - beta v tmpb */
        pa = p;
        beta = *pc--;
        for(j=0;j<pOutQ->numRows-pos; j++)
        {
          float32_t f = -beta * pTmpA[j];
          ptemp = pTmpB;

          blkCnt2 = (pOutQ->numCols-pos) >> 2;
          while (blkCnt2 > 0)
          {
            v1 = vld1q_f32(pa);
            v2 = vld1q_f32(ptemp);
            v1 = vfmaq_n_f32(v1,v2,f);
            vst1q_f32(pa,v1);
            pa += 4;
            ptemp += 4;
            blkCnt2--;
          }
          blkCnt2 = (pOutQ->numCols-pos) & 3;
          if (blkCnt2 > 0)
          {
            mve_pred16_t p0 = vctp32q(blkCnt2);
            v1 = vldrwq_z_f32(pa,p0);
            v2 = vldrwq_z_f32(ptemp,p0);
            v1 = vfmaq_n_f32(v1,v2,f);
            vst1q_p_f32(pa,v1,p0);
            pa += blkCnt2;
          }

          pa += pos;
        }

        nb++;
      }
    }

    arm_status status = ARM_MATH_SUCCESS;
    /* Return to application */
    return (status);
  }
  #endif /* defined(ARM_MATH_MVEF) */
  #endif /* !defined(ARM_MATH_AUTOVECTORIZE) */
  397. #if (!defined(ARM_MATH_MVEF)) || defined(ARM_MATH_AUTOVECTORIZE)
  398. arm_status _arm_mat_qr_f32(
  399. const arm_matrix_instance_f32 * pSrc,
  400. const float32_t threshold,
  401. arm_matrix_instance_f32 * pOutR,
  402. arm_matrix_instance_f32 * pOutQ,
  403. float32_t * pOutTau,
  404. float32_t *pTmpA,
  405. float32_t *pTmpB
  406. )
  407. {
  408. int32_t col=0;
  409. int32_t nb,pos;
  410. float32_t *pa,*pc;
  411. float32_t beta;
  412. float32_t *pv;
  413. float32_t *pdst;
  414. float32_t *p;
  415. if (pSrc->numRows < pSrc->numCols)
  416. {
  417. return(ARM_MATH_SIZE_MISMATCH);
  418. }
  419. memcpy(pOutR->pData,pSrc->pData,pSrc->numCols * pSrc->numRows*sizeof(float32_t));
  420. pOutR->numCols = pSrc->numCols;
  421. pOutR->numRows = pSrc->numRows;
  422. p = pOutR->pData;
  423. pc = pOutTau;
  424. for(col=0 ; col < pSrc->numCols; col++)
  425. {
  426. int32_t i,j,k,blkCnt;
  427. float32_t *pa0,*pa1,*pa2,*pa3;
  428. COPY_COL_F32(pOutR,col,col,pTmpA);
  429. beta = arm_householder_f32(pTmpA,threshold,pSrc->numRows - col,pTmpA);
  430. *pc++ = beta;
  431. pdst = pTmpB;
  432. /* v.T A(col:,col:) -> tmpb */
  433. pv = pTmpA;
  434. pa = p;
  435. for(j=0;j<pSrc->numCols-col; j++)
  436. {
  437. *pdst++ = *pv * *pa++;
  438. }
  439. pa += col;
  440. pv++;
  441. pdst = pTmpB;
  442. pa0 = pa;
  443. pa1 = pa0 + pSrc->numCols;
  444. pa2 = pa1 + pSrc->numCols;
  445. pa3 = pa2 + pSrc->numCols;
  446. /* Unrolled loop */
  447. blkCnt = (pSrc->numRows-col - 1) >> 2;
  448. k=1;
  449. while(blkCnt > 0)
  450. {
  451. float32_t sum;
  452. for(j=0;j<pSrc->numCols-col; j++)
  453. {
  454. sum = *pdst;
  455. sum += pv[0] * *pa0++;
  456. sum += pv[1] * *pa1++;
  457. sum += pv[2] * *pa2++;
  458. sum += pv[3] * *pa3++;
  459. *pdst++ = sum;
  460. }
  461. pa0 += col + 3*pSrc->numCols;
  462. pa1 += col + 3*pSrc->numCols;
  463. pa2 += col + 3*pSrc->numCols;
  464. pa3 += col + 3*pSrc->numCols;
  465. pv += 4;
  466. pdst = pTmpB;
  467. k += 4;
  468. blkCnt--;
  469. }
  470. pa = pa0;
  471. for(;k<pSrc->numRows-col; k++)
  472. {
  473. for(j=0;j<pSrc->numCols-col; j++)
  474. {
  475. *pdst++ += *pv * *pa++;
  476. }
  477. pa += col;
  478. pv++;
  479. pdst = pTmpB;
  480. }
  481. /* A(col:,col:) - beta v tmpb */
  482. pa = p;
  483. for(j=0;j<pSrc->numRows-col; j++)
  484. {
  485. float32_t f = beta * pTmpA[j];
  486. for(i=0;i<pSrc->numCols-col; i++)
  487. {
  488. *pa = *pa - f * pTmpB[i] ;
  489. pa++;
  490. }
  491. pa += col;
  492. }
  493. /* Copy Householder reflectors into R matrix */
  494. pa = p + pOutR->numCols;
  495. for(k=0;k<pSrc->numRows-col-1; k++)
  496. {
  497. *pa = pTmpA[k+1];
  498. pa += pOutR->numCols;
  499. }
  500. p += 1 + pOutR->numCols;
  501. }
  502. /* Generate Q if requested by user matrix */
  503. if (pOutQ != NULL)
  504. {
  505. /* Initialize Q matrix to identity */
  506. memset(pOutQ->pData,0,sizeof(float32_t)*pOutQ->numRows*pOutQ->numRows);
  507. pa = pOutQ->pData;
  508. for(col=0 ; col < pOutQ->numCols; col++)
  509. {
  510. *pa = 1.0f;
  511. pa += pOutQ->numCols+1;
  512. }
  513. nb = pOutQ->numRows - pOutQ->numCols + 1;
  514. pc = pOutTau + pOutQ->numCols - 1;
  515. for(col=0 ; col < pOutQ->numCols; col++)
  516. {
  517. int32_t i,j,k, blkCnt;
  518. float32_t *pa0,*pa1,*pa2,*pa3;
  519. pos = pSrc->numRows - nb;
  520. p = pOutQ->pData + pos + pOutQ->numCols*pos ;
  521. COPY_COL_F32(pOutR,pos,pos,pTmpA);
  522. pTmpA[0] = 1.0f;
  523. pdst = pTmpB;
  524. /* v.T A(col:,col:) -> tmpb */
  525. pv = pTmpA;
  526. pa = p;
  527. for(j=0;j<pOutQ->numRows-pos; j++)
  528. {
  529. *pdst++ = *pv * *pa++;
  530. }
  531. pa += pos;
  532. pv++;
  533. pdst = pTmpB;
  534. pa0 = pa;
  535. pa1 = pa0 + pOutQ->numRows;
  536. pa2 = pa1 + pOutQ->numRows;
  537. pa3 = pa2 + pOutQ->numRows;
  538. /* Unrolled loop */
  539. blkCnt = (pOutQ->numRows-pos - 1) >> 2;
  540. k=1;
  541. while(blkCnt > 0)
  542. {
  543. float32_t sum;
  544. for(j=0;j<pOutQ->numRows-pos; j++)
  545. {
  546. sum = *pdst;
  547. sum += pv[0] * *pa0++;
  548. sum += pv[1] * *pa1++;
  549. sum += pv[2] * *pa2++;
  550. sum += pv[3] * *pa3++;
  551. *pdst++ = sum;
  552. }
  553. pa0 += pos + 3*pOutQ->numRows;
  554. pa1 += pos + 3*pOutQ->numRows;
  555. pa2 += pos + 3*pOutQ->numRows;
  556. pa3 += pos + 3*pOutQ->numRows;
  557. pv += 4;
  558. pdst = pTmpB;
  559. k += 4;
  560. blkCnt--;
  561. }
  562. pa = pa0;
  563. for(;k<pOutQ->numRows-pos; k++)
  564. {
  565. for(j=0;j<pOutQ->numRows-pos; j++)
  566. {
  567. *pdst++ += *pv * *pa++;
  568. }
  569. pa += pos;
  570. pv++;
  571. pdst = pTmpB;
  572. }
  573. pa = p;
  574. beta = *pc--;
  575. for(j=0;j<pOutQ->numRows-pos; j++)
  576. {
  577. float32_t f = beta * pTmpA[j];
  578. for(i=0;i<pOutQ->numCols-pos; i++)
  579. {
  580. *pa = *pa - f * pTmpB[i] ;
  581. pa++;
  582. }
  583. pa += pos;
  584. }
  585. nb++;
  586. }
  587. }
  588. arm_status status = ARM_MATH_SUCCESS;
  589. /* Return to application */
  590. return (status);
  591. }
  592. #endif /* end of test for Helium or Neon availability */