|
|
@@ -146,10 +146,12 @@ void arm_svm_linear_predict_f16(
|
|
|
/*
|
|
|
* Sum the partial parts
|
|
|
*/
|
|
|
- sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
|
|
|
- sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc1);
|
|
|
- sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc2);
|
|
|
- sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc3);
|
|
|
+ acc0 = vmulq_n_f16(acc0,*pDualCoef++);
|
|
|
+ acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
|
|
|
+ acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
|
|
|
+ acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
|
|
|
+
|
|
|
+ sum += vecAddAcrossF16Mve(acc0);
|
|
|
|
|
|
pSrcA += numCols * 4;
|
|
|
/*
|
|
|
@@ -216,8 +218,10 @@ void arm_svm_linear_predict_f16(
|
|
|
/*
|
|
|
* Sum the partial parts
|
|
|
*/
|
|
|
- sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
|
|
|
- sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc1);
|
|
|
+ acc0 = vmulq_n_f16(acc0,*pDualCoef++);
|
|
|
+ acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
|
|
|
+
|
|
|
+ sum += vecAddAcrossF16Mve(acc0);
|
|
|
|
|
|
pSrcA += numCols * 2;
|
|
|
row -= 2;
|