Kaynağa Gözat

CMSIS-DSP: Improved f16 scalar code.

Christophe Favergeon 5 yıl önce
ebeveyn
işleme
078c0b282d
38 değiştirilmiş dosya ile 369 ekleme ve 367 silme
  1. 1 1
      Include/arm_helium_utils.h
  2. 13 13
      Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c
  3. 1 1
      Source/ComplexMathFunctions/arm_cmplx_mag_f16.c
  4. 1 1
      Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c
  5. 2 2
      Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c
  6. 4 4
      Source/DistanceFunctions/arm_braycurtis_distance_f16.c
  7. 3 3
      Source/DistanceFunctions/arm_canberra_distance_f16.c
  8. 2 2
      Source/DistanceFunctions/arm_chebyshev_distance_f16.c
  9. 3 3
      Source/DistanceFunctions/arm_cityblock_distance_f16.c
  10. 5 4
      Source/DistanceFunctions/arm_euclidean_distance_f16.c
  11. 7 6
      Source/DistanceFunctions/arm_jensenshannon_distance_f16.c
  12. 2 2
      Source/DistanceFunctions/arm_minkowski_distance_f16.c
  13. 5 5
      Source/FilteringFunctions/arm_biquad_cascade_df1_f16.c
  14. 4 4
      Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c
  15. 4 4
      Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c
  16. 26 26
      Source/FilteringFunctions/arm_correlate_f16.c
  17. 5 5
      Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
  18. 19 19
      Source/MatrixFunctions/arm_mat_inverse_f16.c
  19. 2 2
      Source/MatrixFunctions/arm_mat_mult_f16.c
  20. 3 3
      Source/SVMFunctions/arm_svm_linear_predict_f16.c
  21. 3 3
      Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
  22. 5 5
      Source/SVMFunctions/arm_svm_rbf_predict_f16.c
  23. 5 5
      Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
  24. 2 2
      Source/StatisticsFunctions/arm_entropy_f16.c
  25. 3 3
      Source/StatisticsFunctions/arm_kullback_leibler_f16.c
  26. 5 5
      Source/StatisticsFunctions/arm_logsumexp_f16.c
  27. 2 2
      Source/StatisticsFunctions/arm_power_f16.c
  28. 2 2
      Source/StatisticsFunctions/arm_rms_f16.c
  29. 3 3
      Testing/FrameworkSource/Error.cpp
  30. 6 6
      Testing/PatternGeneration/Support.py
  31. 100 100
      Testing/Patterns/DSP/Support/SupportF16/Inputs6_f16.txt
  32. 6 6
      Testing/Patterns/DSP/Support/SupportF16/Ref6_f16.txt
  33. 100 100
      Testing/Patterns/DSP/Support/SupportF16/Weights6_f16.txt
  34. 2 2
      Testing/Source/Tests/BIQUADF16.cpp
  35. 1 1
      Testing/Source/Tests/DistanceTestsF16.cpp
  36. 3 3
      Testing/Source/Tests/SupportTestsF16.cpp
  37. 7 7
      Testing/bench.txt
  38. 2 2
      Testing/bench_f16.txt

+ 1 - 1
Include/arm_helium_utils.h

@@ -72,7 +72,7 @@ __STATIC_FORCEINLINE float16_t vecAddAcrossF16Mve(float16x8_t in)
     in = vaddq_f16(tmpVec, in);
     tmpVec = (float16x8_t) vrev64q_s32((int32x4_t) in);
     in = vaddq_f16(tmpVec, in);
-    acc = vgetq_lane_f16(in, 0) + vgetq_lane_f16(in, 4);
+    acc = (_Float16)vgetq_lane_f16(in, 0) + (_Float16)vgetq_lane_f16(in, 4);
 
     return acc;
 }

+ 13 - 13
Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f16.c

@@ -67,7 +67,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
     const float16_t *pIn = in;
     float16_t       result;
     f16x8_t         vsigma;
-    float16_t       tmp;
+    _Float16       tmp;
     f16x8_t         vacc1, vacc2;
     uint32_t        index;
     float16_t       logclassPriors[S->numberOfClasses];
@@ -81,15 +81,15 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
     for (nbClass = 0; nbClass < S->numberOfClasses; nbClass++) {
         pIn = in;
 
-        vacc1 = vdupq_n_f16(0);
-        vacc2 = vdupq_n_f16(0);
+        vacc1 = vdupq_n_f16(0.0f16);
+        vacc2 = vdupq_n_f16(0.0f16);
 
         uint32_t         blkCnt =S->vectorDimension >> 3;
         while (blkCnt > 0U) {
             f16x8_t         vinvSigma, vtmp;
 
             vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
-            vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f * PI)));
+            vacc1 = vaddq(vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)));
 
             vinvSigma = vrecip_medprec_f16(vsigma);
 
@@ -112,7 +112,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
 
             vsigma = vaddq_n_f16(vld1q(pSigma), S->epsilon);
             vacc1 =
-                vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f * PI)), p0);
+                vaddq_m_f16(vacc1, vacc1, vlogq_f16(vmulq_n_f16(vsigma, 2.0f16 * (_Float16)PI)), p0);
 
             vinvSigma = vrecip_medprec_f16(vsigma);
 
@@ -126,8 +126,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
             pSigma += blkCnt;
         }
 
-        tmp = -0.5f * vecAddAcrossF16Mve(vacc1);
-        tmp -= 0.5f * vecAddAcrossF16Mve(vacc2);
+        tmp = -0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc1);
+        tmp -= 0.5f16 * (_Float16)vecAddAcrossF16Mve(vacc2);
 
         *buffer = tmp + *pLogPrior++;
         buffer++;
@@ -175,13 +175,13 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
         
         pIn = in;
 
-        tmp = 0.0;
-        acc1 = 0.0f;
-        acc2 = 0.0f;
+        tmp = 0.0f16;
+        acc1 = 0.0f16;
+        acc2 = 0.0f16;
         for(nbDim = 0; nbDim < S->vectorDimension; nbDim++)
         {
            sigma = *pSigma + S->epsilon;
-           acc1 += logf(2.0f * PI_F * sigma);
+           acc1 += logf(2.0f16 * (_Float16)PI_F * sigma);
            acc2 += (*pIn - *pTheta) * (*pIn - *pTheta) / sigma;
 
            pIn++;
@@ -189,8 +189,8 @@ uint32_t arm_gaussian_naive_bayes_predict_f16(const arm_gaussian_naive_bayes_ins
            pSigma++;
         }
 
-        tmp = -0.5f * acc1;
-        tmp -= 0.5f * acc2;
+        tmp = -0.5f16 * acc1;
+        tmp -= 0.5f16 * acc2;
 
 
         *buffer = tmp + logf(*pPrior++);

+ 1 - 1
Source/ComplexMathFunctions/arm_cmplx_mag_f16.c

@@ -172,7 +172,7 @@ void arm_cmplx_mag_f16(
         uint32_t numSamples)
 {
   uint32_t blkCnt;                               /* loop counter */
-  float16_t real, imag;                      /* Temporary variables to hold input values */
+  _Float16 real, imag;                      /* Temporary variables to hold input values */
 
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 

+ 1 - 1
Source/ComplexMathFunctions/arm_cmplx_mag_squared_f16.c

@@ -107,7 +107,7 @@ void arm_cmplx_mag_squared_f16(
         uint32_t numSamples)
 {
         uint32_t blkCnt;                               /* Loop counter */
-        float16_t real, imag;                          /* Temporary input variables */
+        _Float16 real, imag;                          /* Temporary input variables */
 
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 

+ 2 - 2
Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f16.c

@@ -103,7 +103,7 @@ void arm_cmplx_mult_cmplx_f16(
         pDst += 8;
     }
 
-    float16_t a, b, c, d;  /* Temporary variables to store real and imaginary values */
+    _Float16 a, b, c, d;  /* Temporary variables to store real and imaginary values */
         /* Tail */
     blkCnt = (blockSize & 7) >> 1;
     while (blkCnt > 0)
@@ -134,7 +134,7 @@ void arm_cmplx_mult_cmplx_f16(
         uint32_t numSamples)
 {
     uint32_t blkCnt;                               /* Loop counter */
-    float16_t a, b, c, d;  /* Temporary variables to store real and imaginary values */
+    _Float16 a, b, c, d;  /* Temporary variables to store real and imaginary values */
 
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 

+ 4 - 4
Source/DistanceFunctions/arm_braycurtis_distance_f16.c

@@ -72,7 +72,7 @@
 
 float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-    float16_t       accumDiff = 0.0f, accumSum = 0.0f;
+    _Float16        accumDiff = 0.0f, accumSum = 0.0f;
     uint32_t        blkCnt;
     f16x8_t         a, b, c, accumDiffV, accumSumV;
 
@@ -125,14 +125,14 @@ float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, u
 
 float16_t arm_braycurtis_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t accumDiff=0.0f, accumSum=0.0f, tmpA, tmpB;
+   _Float16 accumDiff=0.0f16, accumSum=0.0f16, tmpA, tmpB;
 
    while(blockSize > 0)
    {
       tmpA = *pA++;
       tmpB = *pB++;
-      accumDiff += fabsf(tmpA - tmpB);
-      accumSum += fabsf(tmpA + tmpB);
+      accumDiff += (_Float16)fabsf(tmpA - tmpB);
+      accumSum += (_Float16)fabsf(tmpA + tmpB);
       blockSize --;
    }
    /*

+ 3 - 3
Source/DistanceFunctions/arm_canberra_distance_f16.c

@@ -70,7 +70,7 @@
 
 float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-    float16_t       accum = 0.0f;
+    _Float16       accum = 0.0f16;
     uint32_t         blkCnt;
     f16x8_t         a, b, c, accumV;
 
@@ -141,7 +141,7 @@ float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uin
 #else
 float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t accum=0.0f, tmpA, tmpB,diff,sum;
+   _Float16 accum=0.0f, tmpA, tmpB,diff,sum;
 
    while(blockSize > 0)
    {
@@ -150,7 +150,7 @@ float16_t arm_canberra_distance_f16(const float16_t *pA,const float16_t *pB, uin
 
       diff = fabsf(tmpA - tmpB);
       sum = fabsf(tmpA) + fabsf(tmpB);
-      if ((tmpA != 0.0f) || (tmpB != 0.0f))
+      if ((tmpA != 0.0f16) || (tmpB != 0.0f16))
       {
          accum += (diff / sum);
       }

+ 2 - 2
Source/DistanceFunctions/arm_chebyshev_distance_f16.c

@@ -67,7 +67,7 @@ float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, ui
     uint32_t        blkCnt;     /* loop counters */
     f16x8_t         vecA, vecB;
     f16x8_t         vecDiff = vdupq_n_f16(0.0);
-    float16_t       maxValue = 0.0;
+    float16_t       maxValue = 0.0f16;
 
 
     blkCnt = blockSize >> 3;
@@ -111,7 +111,7 @@ float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, ui
 #else
 float16_t arm_chebyshev_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t diff=0.0f,  maxVal,tmpA, tmpB;
+   _Float16 diff=0.0f,  maxVal,tmpA, tmpB;
 
    tmpA = *pA++;
    tmpB = *pB++;

+ 3 - 3
Source/DistanceFunctions/arm_cityblock_distance_f16.c

@@ -102,14 +102,14 @@ float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, ui
 #else
 float16_t arm_cityblock_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t accum,tmpA, tmpB;
+   _Float16 accum,tmpA, tmpB;
 
-   accum = 0.0f;
+   accum = 0.0f16;
    while(blockSize > 0)
    {
       tmpA = *pA++;
       tmpB = *pB++;
-      accum  += fabsf(tmpA - tmpB);
+      accum  += (_Float16)fabsf(tmpA - tmpB);
       
       blockSize --;
    }

+ 5 - 4
Source/DistanceFunctions/arm_euclidean_distance_f16.c

@@ -105,16 +105,17 @@ float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, ui
 #else
 float16_t arm_euclidean_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-   float16_t accum=0.0f,tmp;
+   _Float16 accum=0.0f,tmp;
+   float16_t result;
 
    while(blockSize > 0)
    {
-      tmp = *pA++ - *pB++;
+      tmp = (_Float16)*pA++ - (_Float16)*pB++;
       accum += SQ(tmp);
       blockSize --;
    }
-   arm_sqrt_f16(accum,&tmp);
-   return(tmp);
+   arm_sqrt_f16(accum,&result);
+   return(result);
 }
 
 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

+ 7 - 6
Source/DistanceFunctions/arm_jensenshannon_distance_f16.c

@@ -145,16 +145,17 @@ float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB
 
 float16_t arm_jensenshannon_distance_f16(const float16_t *pA,const float16_t *pB, uint32_t blockSize)
 {
-    float16_t left, right,sum, result, tmp;
+    _Float16 left, right,sum, tmp;
+    float16_t result;
     uint32_t i;
 
-    left = 0.0f; 
-    right = 0.0f;
+    left = 0.0f16; 
+    right = 0.0f16;
     for(i=0; i < blockSize; i++)
     {
-      tmp = (pA[i] + pB[i]) / 2.0f;
-      left  += rel_entr(pA[i], tmp);
-      right += rel_entr(pB[i], tmp);
+      tmp = ((_Float16)pA[i] + (_Float16)pB[i]) / 2.0f16;
+      left  += (_Float16)rel_entr(pA[i], tmp);
+      right += (_Float16)rel_entr(pB[i], tmp);
     }
 
 

+ 2 - 2
Source/DistanceFunctions/arm_minkowski_distance_f16.c

@@ -111,13 +111,13 @@ float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, in
 
 float16_t arm_minkowski_distance_f16(const float16_t *pA,const float16_t *pB, int32_t order, uint32_t blockSize)
 {
-    float16_t sum;
+    _Float16 sum;
     uint32_t i;
 
     sum = 0.0f; 
     for(i=0; i < blockSize; i++)
     {
-       sum += powf(fabsf(pA[i] - pB[i]),order);
+       sum += (_Float16)powf(fabsf(pA[i] - pB[i]),order);
     }
 
 

+ 5 - 5
Source/FilteringFunctions/arm_biquad_cascade_df1_f16.c

@@ -64,7 +64,7 @@ void arm_biquad_cascade_df1_f16(
     float16_t Xn1, Xn2, Yn1, Yn2;   /*  Filter pState variables   */
     float16_t X0, X1, X2, X3;   /*  temporary input           */
     float16_t X4, X5, X6, X7;   /*  temporary input           */
-    float16_t lastX, lastY;             /*  X,Y history for tail handling */
+    _Float16 lastX, lastY;             /*  X,Y history for tail handling */
     f16x8_t coeffs;
     f16x8_t accVec;           /* accumultor vector */
     uint32_t  sample, stage = S->numStages; /*  loop counters             */
@@ -320,10 +320,10 @@ void arm_biquad_cascade_df1_f16(
         float16_t *pOut = pDst;                        /* Destination pointer */
         float16_t *pState = S->pState;                 /* pState pointer */
   const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
-        float16_t acc;                                 /* Accumulator */
-        float16_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
-        float16_t Xn1, Xn2, Yn1, Yn2;                  /* Filter pState variables */
-        float16_t Xn;                                  /* Temporary input */
+        _Float16 acc;                                 /* Accumulator */
+        _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        _Float16 Xn1, Xn2, Yn1, Yn2;                  /* Filter pState variables */
+        _Float16 Xn;                                  /* Temporary input */
         uint32_t sample, stage = S->numStages;         /* Loop counters */
 
   do

+ 4 - 4
Source/FilteringFunctions/arm_biquad_cascade_df2T_f16.c

@@ -197,10 +197,10 @@ void arm_biquad_cascade_df2T_f16(
         float16_t *pOut = pDst;                        /* Destination pointer */
         float16_t *pState = S->pState;                 /* State pointer */
   const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
-        float16_t acc1;                                /* Accumulator */
-        float16_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
-        float16_t Xn1;                                 /* Temporary input */
-        float16_t d1, d2;                              /* State variables */
+        _Float16 acc1;                                /* Accumulator */
+        _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        _Float16 Xn1;                                 /* Temporary input */
+        _Float16 d1, d2;                              /* State variables */
         uint32_t sample, stage = S->numStages;         /* Loop counters */
 
   do

+ 4 - 4
Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f16.c

@@ -198,10 +198,10 @@ void arm_biquad_cascade_stereo_df2T_f16(
         float16_t *pOut = pDst;                        /* Destination pointer */
         float16_t *pState = S->pState;                 /* State pointer */
   const float16_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
-        float16_t acc1a, acc1b;                        /* Accumulator */
-        float16_t b0, b1, b2, a1, a2;                  /* Filter coefficients */
-        float16_t Xn1a, Xn1b;                          /* Temporary input */
-        float16_t d1a, d2a, d1b, d2b;                  /* State variables */
+        _Float16 acc1a, acc1b;                        /* Accumulator */
+        _Float16 b0, b1, b2, a1, a2;                  /* Filter coefficients */
+        _Float16 Xn1a, Xn1b;                          /* Temporary input */
+        _Float16 d1a, d2a, d1b, d2b;                  /* State variables */
         uint32_t sample, stage = S->numStages;         /* Loop counters */
 
     do

+ 26 - 26
Source/FilteringFunctions/arm_correlate_f16.c

@@ -370,8 +370,8 @@ void arm_correlate_f16(
     for (i = 0U; i <= block1 - 2; i += 2)
     {
         uint32_t  count = i + 1;
-        float16_t acc0;
-        float16_t acc1;
+        _Float16 acc0;
+        _Float16 acc1;
         /*
          * compute 2 accumulators per loop
          * size is incrementing for second accumulator
@@ -390,7 +390,7 @@ void arm_correlate_f16(
     for (; i < block1; i++)
     {
         uint32_t  count = i + 1;
-        float16_t acc;
+        _Float16 acc;
 
         pX = pA;
         pY = pB;
@@ -403,10 +403,10 @@ void arm_correlate_f16(
 
     for (i = 0U; i <= block2 - 4; i += 4)
     {
-        float16_t acc0;
-        float16_t acc1;
-        float16_t acc2;
-        float16_t acc3;
+        _Float16 acc0;
+        _Float16 acc1;
+        _Float16 acc2;
+        _Float16 acc3;
 
         pX = pA;
         pY = pB;
@@ -430,8 +430,8 @@ void arm_correlate_f16(
 
     for (; i <= block2 - 2; i += 2)
     {
-        float16_t acc0;
-        float16_t acc1;
+        _Float16 acc0;
+        _Float16 acc1;
 
         pX = pA;
         pY = pB;
@@ -451,7 +451,7 @@ void arm_correlate_f16(
 
     if (block2 & 1)
     {
-        float16_t acc;
+        _Float16 acc;
 
         pX = pA;
         pY = pB;
@@ -466,8 +466,8 @@ void arm_correlate_f16(
     {
 
         uint32_t  count = (i + 1);
-        float16_t acc0;
-        float16_t acc1;
+        _Float16 acc0;
+        _Float16 acc1;
 
         pX = pA;
         pY = pB;
@@ -488,7 +488,7 @@ void arm_correlate_f16(
     for (; i >= 0; i--)
     {
         uint32_t  count = (i + 1);
-        float16_t acc;
+        _Float16 acc;
 
         pX = pA;
         pY = pB;
@@ -517,15 +517,15 @@ void arm_correlate_f16(
   const float16_t *px;                                 /* Intermediate inputA pointer */
   const float16_t *py;                                 /* Intermediate inputB pointer */
   const float16_t *pSrc1;
-        float16_t sum;
+        _Float16 sum;
         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
         uint32_t j, k, count, blkCnt;                  /* Loop counters */
         uint32_t outBlockSize;                         /* Loop counter */
         int32_t inc = 1;                               /* Destination address modifier */
 
 #if defined (ARM_MATH_LOOPUNROLL) 
-    float16_t acc0, acc1, acc2, acc3,c0;                    /* Accumulators */
-    float16_t x0, x1, x2, x3;                        /* temporary variables for holding input and coefficient values */
+    _Float16 acc0, acc1, acc2, acc3,c0;                    /* Accumulators */
+    _Float16 x0, x1, x2, x3;                        /* temporary variables for holding input and coefficient values */
 #endif
 
   /* The algorithm implementation is based on the lengths of the inputs. */
@@ -625,7 +625,7 @@ void arm_correlate_f16(
   while (blockSize1 > 0U)
   {
     /* Accumulator is made zero for every iteration */
-    sum = 0.0f;
+    sum = 0.0f16;
 
 #if defined (ARM_MATH_LOOPUNROLL) 
 
@@ -725,10 +725,10 @@ void arm_correlate_f16(
     while (blkCnt > 0U)
     {
       /* Set all accumulators to zero */
-      acc0 = 0.0f;
-      acc1 = 0.0f;
-      acc2 = 0.0f;
-      acc3 = 0.0f;
+      acc0 = 0.0f16;
+      acc1 = 0.0f16;
+      acc2 = 0.0f16;
+      acc3 = 0.0f16;
 
 
       /* read x[0], x[1], x[2] samples */
@@ -873,7 +873,7 @@ void arm_correlate_f16(
     while (blkCnt > 0U)
     {
       /* Accumulator is made zero for every iteration */
-      sum = 0.0f;
+      sum = 0.0f16;
 
 #if defined (ARM_MATH_LOOPUNROLL) 
 
@@ -939,7 +939,7 @@ void arm_correlate_f16(
     while (blkCnt > 0U)
     {
       /* Accumulator is made zero for every iteration */
-      sum = 0.0f;
+      sum = 0.0f16;
 
       /* Loop over srcBLen */
       k = srcBLen;
@@ -1000,7 +1000,7 @@ void arm_correlate_f16(
   while (blockSize3 > 0U)
   {
     /* Accumulator is made zero for every iteration */
-    sum = 0.0f;
+    sum = 0.0f16;
 
 #if defined (ARM_MATH_LOOPUNROLL) 
 
@@ -1069,7 +1069,7 @@ void arm_correlate_f16(
 
   const float16_t *pIn1 = pSrcA;                       /* inputA pointer */
   const float16_t *pIn2 = pSrcB + (srcBLen - 1U);      /* inputB pointer */
-        float16_t sum;                                 /* Accumulator */
+        _Float16 sum;                                 /* Accumulator */
         uint32_t i = 0U, j;                            /* Loop counters */
         uint32_t inv = 0U;                             /* Reverse order flag */
         uint32_t tot = 0U;                             /* Length */
@@ -1127,7 +1127,7 @@ void arm_correlate_f16(
   for (i = 0U; i <= tot; i++)
   {
     /* Initialize sum with zero to carry out MAC operations */
-    sum = 0.0f;
+    sum = 0.0f16;
 
     /* Loop to perform MAC operations according to convolution equation */
     for (j = 0U; j <= i; j++)

+ 5 - 5
Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c

@@ -711,13 +711,13 @@ arm_status arm_mat_cmplx_mult_f16(
   uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
   uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
   uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
-  float16_t sumReal, sumImag;                    /* Accumulator */
-  float16_t a1, b1, c1, d1;
+  _Float16 sumReal, sumImag;                    /* Accumulator */
+  _Float16 a1, b1, c1, d1;
   uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
   arm_status status;                             /* status of matrix multiplication */
 
 #if defined (ARM_MATH_LOOPUNROLL)
-  float16_t a0, b0, c0, d0;
+  _Float16 a0, b0, c0, d0;
 #endif
 
 #ifdef ARM_MATH_MATRIX_CHECK
@@ -755,8 +755,8 @@ arm_status arm_mat_cmplx_mult_f16(
       do
       {
         /* Set the variable sum, that acts as accumulator, to zero */
-        sumReal = 0.0f;
-        sumImag = 0.0f;
+        sumReal = 0.0f16;
+        sumImag = 0.0f16;
 
         /* Initiate pointer pIn1 to point to starting address of column being processed */
         pIn1 = pInA;

+ 19 - 19
Source/MatrixFunctions/arm_mat_inverse_f16.c

@@ -66,7 +66,7 @@ arm_status arm_mat_inverse_f16(
     uint32_t  numCols = pSrc->numCols;  /* Number of Cols in the matrix  */
     float16_t *pTmpA, *pTmpB;
 
-    float16_t in = 0.0f;        /* Temporary input values  */
+    _Float16 in = 0.0f16;        /* Temporary input values  */
     uint32_t  i, rowCnt, flag = 0U, j, loopCnt, k, l;   /* loop counters */
     arm_status status;          /* status of matrix inverse */
     uint32_t  blkCnt;
@@ -137,20 +137,20 @@ arm_status arm_mat_inverse_f16(
             j = numRows - rowCnt;
             while (j > 0U)
             {
-                *pOutT1++ = 0.0f;
+                *pOutT1++ = 0.0f16;
                 j--;
             }
             /*
              * Writing all ones in the diagonal of the destination matrix
              */
-            *pOutT1++ = 1.0f;
+            *pOutT1++ = 1.0f16;
             /*
              * Writing all zeroes in upper triangle of the destination matrix
              */
             j = rowCnt - 1U;
             while (j > 0U)
             {
-                *pOutT1++ = 0.0f;
+                *pOutT1++ = 0.0f16;
                 j--;
             }
             /*
@@ -199,7 +199,7 @@ arm_status arm_mat_inverse_f16(
             /*
              * Check if the pivot element is zero
              */
-            if (*pInT1 == 0.0f)
+            if (*pInT1 == 0.0f16)
             {
                 /*
                  * Loop over the number rows present below
@@ -215,7 +215,7 @@ arm_status arm_mat_inverse_f16(
                      * Check if there is a non zero pivot element to
                      * * replace in the rows below
                      */
-                    if (*pInT2 != 0.0f)
+                    if (*pInT2 != 0.0f16)
                     {
                         f16x8_t vecA, vecB;
                         /*
@@ -310,7 +310,7 @@ arm_status arm_mat_inverse_f16(
             /*
              * Update the status if the matrix is singular
              */
-            if ((flag != 1U) && (in == 0.0f))
+            if ((flag != 1U) && (in == 0.0f16))
             {
                 return ARM_MATH_SINGULAR;
             }
@@ -334,7 +334,7 @@ arm_status arm_mat_inverse_f16(
 
             pTmpA = pInT1;
 
-            f16x8_t invIn = vdupq_n_f16(1.0f / in);
+            f16x8_t invIn = vdupq_n_f16(1.0f16 / in);
 
             blkCnt = (numCols - l) >> 3;
             f16x8_t vecA;
@@ -537,12 +537,12 @@ arm_status arm_mat_inverse_f16(
          */
         status = ARM_MATH_SUCCESS;
 
-        if ((flag != 1U) && (in == 0.0f))
+        if ((flag != 1U) && (in == 0.0f16))
         {
             pIn = pSrc->pData;
             for (i = 0; i < numRows * numCols; i++)
             {
-                if (pIn[i] != 0.0f)
+                if (pIn[i] != 0.0f16)
                     break;
             }
 
@@ -568,7 +568,7 @@ arm_status arm_mat_inverse_f16(
   uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix  */
   uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix  */
 
-  float16_t Xchg, in = 0.0f, in1;                /* Temporary input values  */
+  _Float16 Xchg, in = 0.0f16, in1;                /* Temporary input values  */
   uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l;      /* loop counters */
   arm_status status;                             /* status of matrix inverse */
 
@@ -636,18 +636,18 @@ arm_status arm_mat_inverse_f16(
       j = numRows - rowCnt;
       while (j > 0U)
       {
-        *pOutT1++ = 0.0f;
+        *pOutT1++ = 0.0f16;
         j--;
       }
 
       /* Writing all ones in the diagonal of the destination matrix */
-      *pOutT1++ = 1.0f;
+      *pOutT1++ = 1.0f16;
 
       /* Writing all zeroes in upper triangle of the destination matrix */
       j = rowCnt - 1U;
       while (j > 0U)
       {
-        *pOutT1++ = 0.0f;
+        *pOutT1++ = 0.0f16;
         j--;
       }
 
@@ -685,7 +685,7 @@ arm_status arm_mat_inverse_f16(
       k = 1U;
 
       /* Check if the pivot element is zero */
-      if (*pInT1 == 0.0f)
+      if (*pInT1 == 0.0f16)
       {
         /* Loop over the number rows present below */
 
@@ -697,7 +697,7 @@ arm_status arm_mat_inverse_f16(
 
           /* Check if there is a non zero pivot element to
            * replace in the rows below */
-          if (*pInT2 != 0.0f)
+          if (*pInT2 != 0.0f16)
           {
             /* Loop over number of columns
              * to the right of the pilot element */
@@ -743,7 +743,7 @@ arm_status arm_mat_inverse_f16(
       }
 
       /* Update the status if the matrix is singular */
-      if ((flag != 1U) && (in == 0.0f))
+      if ((flag != 1U) && (in == 0.0f16))
       {
         return ARM_MATH_SINGULAR;
       }
@@ -877,12 +877,12 @@ arm_status arm_mat_inverse_f16(
     /* Set status as ARM_MATH_SUCCESS */
     status = ARM_MATH_SUCCESS;
 
-    if ((flag != 1U) && (in == 0.0f))
+    if ((flag != 1U) && (in == 0.0f16))
     {
       pIn = pSrc->pData;
       for (i = 0; i < numRows * numCols; i++)
       {
-        if (pIn[i] != 0.0f)
+        if (pIn[i] != 0.0f16)
             break;
       }
 

+ 2 - 2
Source/MatrixFunctions/arm_mat_mult_f16.c

@@ -632,7 +632,7 @@ arm_status arm_mat_mult_f16(
   float16_t *pInB = pSrcB->pData;                /* Input data matrix pointer B */
   float16_t *pOut = pDst->pData;                 /* Output data matrix pointer */
   float16_t *px;                                 /* Temporary output data matrix pointer */
-  float16_t sum;                                 /* Accumulator */
+  _Float16 sum;                                 /* Accumulator */
   uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
   uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
   uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
@@ -671,7 +671,7 @@ arm_status arm_mat_mult_f16(
       do
       {
         /* Set the variable sum, that acts as accumulator, to zero */
-        sum = 0.0f;
+        sum = 0.0f16;
 
         /* Initialize pointer pIn1 to point to starting address of column being processed */
         pIn1 = pInA;

+ 3 - 3
Source/SVMFunctions/arm_svm_linear_predict_f16.c

@@ -151,7 +151,7 @@ void arm_svm_linear_predict_f16(
         acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
         acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
 
-        sum += vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)vecAddAcrossF16Mve(acc0);
 
         pSrcA += numCols * 4;
         /*
@@ -221,7 +221,7 @@ void arm_svm_linear_predict_f16(
         acc0 = vmulq_n_f16(acc0,*pDualCoef++);
         acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
 
-        sum += vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)vecAddAcrossF16Mve(acc0);
 
         pSrcA += numCols * 2;
         row -= 2;
@@ -274,7 +274,7 @@ void arm_svm_linear_predict_f16(
         /*
          * Sum the partial parts
          */
-        sum += (_Float16)*pDualCoef++ * vecAddAcrossF16Mve(acc0);
+        sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
 
     }
 

+ 3 - 3
Source/SVMFunctions/arm_svm_polynomial_predict_f16.c

@@ -294,7 +294,7 @@ void arm_svm_polynomial_predict_f16(
                              (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree), 
                              vctp16q(1));
     }
-    sum += vecAddAcrossF16Mve(vSum);
+    sum += (_Float16)vecAddAcrossF16Mve(vSum);
 
     
     *pResult = S->classes[STEP(sum)];
@@ -316,9 +316,9 @@ void arm_svm_polynomial_predict_f16(
         dot=0;
         for(j=0; j < S->vectorDimension; j++)
         {
-            dot = dot + in[j]* *pSupport++;
+            dot = dot + (_Float16)in[j]* (_Float16)*pSupport++;
         }
-        sum += S->dualCoefficients[i] * arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
+        sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
     }
 
     *pResult=S->classes[STEP(sum)];

+ 5 - 5
Source/SVMFunctions/arm_svm_rbf_predict_f16.c

@@ -67,7 +67,7 @@ void arm_svm_rbf_predict_f16(
     uint32_t         row;
     uint32_t         blkCnt;     /* loop counters */
     const float16_t *pDualCoef = S->dualCoefficients;
-    float16_t       sum = S->intercept;
+    _Float16       sum = S->intercept;
     f16x8_t         vSum = vdupq_n_f16(0);
 
     row = numRows;
@@ -323,19 +323,19 @@ void arm_svm_rbf_predict_f16(
     int32_t * pResult)
 {
     _Float16 sum=S->intercept;
-    _Float16 dot=0;
+    _Float16 dot=00.f16;
     uint32_t i,j;
     const float16_t *pSupport = S->supportVectors;
 
     for(i=0; i < S->nbOfSupportVectors; i++)
     {
-        dot=0;
+        dot=0.0f16;
         for(j=0; j < S->vectorDimension; j++)
         {
-            dot = dot + SQ(in[j] - *pSupport);
+            dot = dot + SQ((_Float16)in[j] - (_Float16) *pSupport);
             pSupport++;
         }
-        sum += S->dualCoefficients[i] * expf(-S->gamma * dot);
+        sum += (_Float16)S->dualCoefficients[i] * (_Float16)expf(-(_Float16)S->gamma * dot);
     }
     *pResult=S->classes[STEP(sum)];
 }

+ 5 - 5
Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c

@@ -67,7 +67,7 @@ void arm_svm_sigmoid_predict_f16(
     uint32_t         row;
     uint32_t         blkCnt;     /* loop counters */
     const float16_t *pDualCoef = S->dualCoefficients;
-    float16_t       sum = S->intercept;
+    _Float16       sum = S->intercept;
     f16x8_t         vSum = vdupq_n_f16(0.0f);
 
     row = numRows;
@@ -305,18 +305,18 @@ void arm_svm_sigmoid_predict_f16(
     int32_t * pResult)
 {
     _Float16 sum=S->intercept;
-    _Float16 dot=0;
+    _Float16 dot=0.0f16;
     uint32_t i,j;
     const float16_t *pSupport = S->supportVectors;
 
     for(i=0; i < S->nbOfSupportVectors; i++)
     {
-        dot=0;
+        dot=0.0f16;
         for(j=0; j < S->vectorDimension; j++)
         {
-            dot = dot + in[j]* *pSupport++;
+            dot = dot + (_Float16)in[j] * (_Float16)*pSupport++;
         }
-        sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
+        sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((_Float16)S->gamma * dot + (_Float16)S->coef0);
     }
     *pResult=S->classes[STEP(sum)];
 }

+ 2 - 2
Source/StatisticsFunctions/arm_entropy_f16.c

@@ -65,7 +65,7 @@
 float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
 {
     uint32_t        blkCnt;
-    float16_t       accum=0.0f,p;
+    _Float16       accum=0.0f16,p;
 
 
     blkCnt = blockSize;
@@ -110,7 +110,7 @@ float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
 {
     const float16_t *pIn;
     uint32_t blkCnt;
-    float16_t accum, p;
+    _Float16 accum, p;
  
     pIn = pSrcA;
     blkCnt = blockSize;

+ 3 - 3
Source/StatisticsFunctions/arm_kullback_leibler_f16.c

@@ -72,12 +72,12 @@
 float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize)
 {
     uint32_t blkCnt;
-    float16_t accum, pA,pB;
+    _Float16 accum, pA,pB;
  
     
     blkCnt = blockSize;
 
-    accum = 0.0f;
+    accum = 0.0f16;
 
     f16x8_t         vSum = vdupq_n_f16(0.0f);
     blkCnt = blockSize >> 3;
@@ -120,7 +120,7 @@ float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSr
 {
     const float16_t *pInA, *pInB;
     uint32_t blkCnt;
-    float16_t accum, pA,pB;
+    _Float16 accum, pA,pB;
  
     pInA = pSrcA;
     pInB = pSrcB;

+ 5 - 5
Source/StatisticsFunctions/arm_logsumexp_f16.c

@@ -72,8 +72,8 @@ float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
     float16_t       maxVal;
     const float16_t *pIn;
     int32_t         blkCnt;
-    float16_t       accum=0.0f16;
-    float16_t       tmp;
+    _Float16       accum=0.0f16;
+    _Float16       tmp;
 
 
     arm_max_no_idx_f16((float16_t *) in, blockSize, &maxVal);
@@ -122,11 +122,11 @@ float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
 #else
 float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
 {
-    float16_t maxVal;
-    float16_t tmp;
+    _Float16 maxVal;
+    _Float16 tmp;
     const float16_t *pIn;
     uint32_t blkCnt;
-    float16_t accum;
+    _Float16 accum;
  
     pIn = in;
     blkCnt = blockSize;

+ 2 - 2
Source/StatisticsFunctions/arm_power_f16.c

@@ -88,8 +88,8 @@ void arm_power_f16(
         float16_t * pResult)
 {
         uint32_t blkCnt;                               /* Loop counter */
-        float16_t sum = 0.0f;                          /* Temporary result storage */
-        float16_t in;                                  /* Temporary variable to store input value */
+        _Float16 sum = 0.0f16;                          /* Temporary result storage */
+        _Float16 in;                                  /* Temporary variable to store input value */
 
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 

+ 2 - 2
Source/StatisticsFunctions/arm_rms_f16.c

@@ -83,8 +83,8 @@ void arm_rms_f16(
         float16_t * pResult)
 {
         uint32_t blkCnt;                               /* Loop counter */
-        float16_t sum = 0.0f;                          /* Temporary result storage */
-        float16_t in;                                  /* Temporary variable to store input value */
+        _Float16 sum = 0.0f16;                          /* Temporary result storage */
+        _Float16 in;                                  /* Temporary variable to store input value */
 
 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
 

+ 3 - 3
Testing/FrameworkSource/Error.cpp

@@ -342,7 +342,7 @@ void assert_close_error(unsigned long nb,float64_t &ref, float64_t &val, double
     if (abs(val - ref) > (absthreshold + relthreshold * abs(ref)))
     {
         char details[200];
-        sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
+        sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
         throw (Error(CLOSE_ERROR,nb,details));
     }
 };
@@ -385,7 +385,7 @@ void assert_close_error(unsigned long nb,float32_t &ref, float32_t &val, double
     if (abs(val - ref) > (absthreshold + relthreshold * abs(ref)))
     {
         char details[200];
-        sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
+        sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
         throw (Error(CLOSE_ERROR,nb,details));
     }
 };
@@ -429,7 +429,7 @@ void assert_close_error(unsigned long nb,float16_t &ref, float16_t &val, double
     if (abs((float)val - (float)ref) > (absthreshold + relthreshold * abs((float)ref)))
     {
         char details[200];
-        sprintf(details,"close %g : abs=%g, rel=%g",abs(val - ref) , absthreshold,relthreshold);
+        sprintf(details,"close error %g > %g: (val = %g, ref = %g)",abs(val - ref) , absthreshold + relthreshold * abs(ref),val,ref);
         throw (Error(CLOSE_ERROR,nb,details));
     }
 };

+ 6 - 6
Testing/PatternGeneration/Support.py

@@ -11,7 +11,7 @@ NBVECTORS = [4,10,16]
 
 VECDIM = [12,14,20]
 
-def genWsum(config,nb):
+def genWsum(config,f,nb):
     DIM=50
     inputs=[] 
     weights=[]
@@ -23,15 +23,15 @@ def genWsum(config,nb):
     inputs += list(va)
     weights += list(vb)
 
-    nbiters = Tools.loopnb(0,Tools.TAILONLY)
+    nbiters = Tools.loopnb(f,Tools.TAILONLY)
     e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) 
     output.append(e)
 
-    nbiters = Tools.loopnb(0,Tools.BODYONLY)
+    nbiters = Tools.loopnb(f,Tools.BODYONLY)
     e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) 
     output.append(e)
 
-    nbiters = Tools.loopnb(0,Tools.BODYANDTAIL)
+    nbiters = Tools.loopnb(f,Tools.BODYANDTAIL)
     e = np.sum(va[0:nbiters].T * vb[0:nbiters]) / np.sum(vb[0:nbiters]) 
     output.append(e)
 
@@ -92,7 +92,7 @@ def writeTestsF32(config):
 
 
     # This is for benchmarking the weighted sum and we use only one test pattern
-    genWsum(config,6)
+    genWsum(config,Tools.F32,6)
     
 
 def writeTestsF16(config):
@@ -105,7 +105,7 @@ def writeTestsF16(config):
     config.writeInput(11,va,"Samples")
 
     # This is for benchmarking the weighted sum and we use only one test pattern
-    genWsum(config,6)
+    genWsum(config,Tools.F16,6)
 
 def writeTestsQ31(config):
     NBSAMPLES=256

+ 100 - 100
Testing/Patterns/DSP/Support/SupportF16/Inputs6_f16.txt

@@ -1,102 +1,102 @@
 H
 50
-// 0.549118
-0x3865
-// 0.782574
-0x3a43
-// 0.537389
-0x384d
-// 0.576597
-0x389d
-// 0.849496
-0x3acc
-// 0.764850
-0x3a1e
-// 0.539151
-0x3850
-// 0.459163
-0x3759
-// 0.352476
-0x35a4
-// 0.333343
-0x3555
-// 0.362662
-0x35cd
-// 0.343928
-0x3581
-// 0.046084
-0x29e6
-// 0.583088
-0x38aa
-// 0.464789
-0x3770
-// 0.659567
-0x3947
-// 0.177360
-0x31ad
-// 0.465121
-0x3771
-// 0.849010
-0x3acb
-// 0.748383
-0x39fd
-// 0.493254
-0x37e4
-// 0.023579
-0x2609
-// 0.354457
-0x35ac
-// 0.880690
-0x3b0c
-// 0.029813
-0x27a2
-// 0.338038
-0x3569
-// 0.724386
-0x39cc
-// 0.341454
-0x3577
-// 0.676073
-0x3969
-// 0.041176
-0x2945
-// 0.214019
-0x32d9
-// 0.322539
-0x3529
-// 0.499026
-0x37fc
-// 0.616038
-0x38ee
-// 0.688882
-0x3983
-// 0.066439
-0x2c41
-// 0.094981
-0x2e14
-// 0.873092
-0x3afc
-// 0.642707
-0x3924
-// 0.537527
-0x384d
-// 0.370953
-0x35ef
-// 0.450042
-0x3733
-// 0.434679
-0x36f4
-// 0.676789
-0x396a
-// 0.303649
-0x34dc
-// 0.076930
-0x2cec
-// 0.871305
-0x3af8
-// 0.855053
-0x3ad7
-// 0.498935
-0x37fc
-// 0.854980
-0x3ad7
+// 0.145259
+0x30a6
+// 0.431775
+0x36e9
+// 0.925328
+0x3b67
+// 0.170239
+0x3173
+// 0.385773
+0x362c
+// 0.998601
+0x3bfd
+// 0.108755
+0x2ef6
+// 0.479921
+0x37ae
+// 0.592423
+0x38bd
+// 0.431234
+0x36e6
+// 0.413843
+0x369f
+// 0.959014
+0x3bac
+// 0.665528
+0x3953
+// 0.952834
+0x3b9f
+// 0.562821
+0x3881
+// 0.609152
+0x38e0
+// 0.890135
+0x3b1f
+// 0.002624
+0x1960
+// 0.666853
+0x3956
+// 0.813988
+0x3a83
+// 0.170387
+0x3174
+// 0.264551
+0x343c
+// 0.699111
+0x3998
+// 0.738154
+0x39e8
+// 0.475053
+0x379a
+// 0.439282
+0x3707
+// 0.699736
+0x3999
+// 0.488956
+0x37d3
+// 0.115565
+0x2f65
+// 0.115452
+0x2f64
+// 0.586420
+0x38b1
+// 0.327648
+0x353e
+// 0.442207
+0x3713
+// 0.752186
+0x3a04
+// 0.476363
+0x379f
+// 0.491094
+0x37dc
+// 0.552734
+0x386c
+// 0.674894
+0x3966
+// 0.705545
+0x39a5
+// 0.966659
+0x3bbc
+// 0.528146
+0x383a
+// 0.633660
+0x3912
+// 0.049129
+0x2a4a
+// 0.458540
+0x3756
+// 0.331162
+0x354c
+// 0.399161
+0x3663
+// 0.015650
+0x2402
+// 0.271388
+0x3458
+// 0.912530
+0x3b4d
+// 0.655578
+0x393f

+ 6 - 6
Testing/Patterns/DSP/Support/SupportF16/Ref6_f16.txt

@@ -1,8 +1,8 @@
 H
 3
-// 0.544937
-0x385c
-// 0.581009
-0x38a6
-// 0.504352
-0x3809
+// 0.445338
+0x3720
+// 0.569713
+0x388f
+// 0.564358
+0x3884

+ 100 - 100
Testing/Patterns/DSP/Support/SupportF16/Weights6_f16.txt

@@ -1,102 +1,102 @@
 H
 50
-// 0.873606
-0x3afd
-// 0.005892
-0x1e09
-// 0.669526
-0x395b
-// 0.198146
-0x3257
-// 0.032904
-0x2836
-// 0.829321
-0x3aa2
-// 0.044536
-0x29b3
-// 0.842708
-0x3abe
-// 0.981319
-0x3bda
-// 0.274000
-0x3462
-// 0.509194
-0x3813
-// 0.994965
-0x3bf6
-// 0.572766
-0x3895
-// 0.557596
-0x3876
-// 0.306704
-0x34e8
-// 0.983822
-0x3bdf
-// 0.145555
-0x30a8
-// 0.539088
-0x3850
-// 0.527526
-0x3838
-// 0.501409
-0x3803
-// 0.249480
-0x33fc
-// 0.273503
-0x3460
-// 0.567660
-0x388b
-// 0.741561
-0x39ef
-// 0.458026
-0x3754
-// 0.239162
-0x33a7
-// 0.731739
-0x39db
-// 0.662458
-0x394d
-// 0.024779
-0x2658
-// 0.086811
-0x2d8e
-// 0.660991
-0x394a
-// 0.834424
-0x3aad
-// 0.163672
-0x313d
-// 0.120432
-0x2fb5
-// 0.593488
-0x38bf
-// 0.973784
-0x3bca
-// 0.167473
-0x315c
-// 0.858171
-0x3ade
-// 0.986637
-0x3be5
-// 0.223556
-0x3327
-// 0.382377
-0x361e
-// 0.757667
-0x3a10
-// 0.032219
-0x2820
-// 0.574024
-0x3898
-// 0.125286
-0x3002
-// 0.946997
-0x3b93
-// 0.942443
-0x3b8a
-// 0.152563
-0x30e2
-// 0.240567
-0x33b3
-// 0.160261
-0x3121
+// 0.305271
+0x34e2
+// 0.032698
+0x282f
+// 0.266409
+0x3443
+// 0.617540
+0x38f1
+// 0.152922
+0x30e5
+// 0.774995
+0x3a33
+// 0.848550
+0x3aca
+// 0.523429
+0x3830
+// 0.116035
+0x2f6d
+// 0.649223
+0x3932
+// 0.246758
+0x33e5
+// 0.826661
+0x3a9d
+// 0.967839
+0x3bbe
+// 0.283239
+0x3488
+// 0.847203
+0x3ac7
+// 0.720735
+0x39c4
+// 0.477014
+0x37a2
+// 0.633504
+0x3911
+// 0.596646
+0x38c6
+// 0.832505
+0x3aa9
+// 0.135769
+0x3058
+// 0.607454
+0x38dc
+// 0.914942
+0x3b52
+// 0.474766
+0x3799
+// 0.781271
+0x3a40
+// 0.084347
+0x2d66
+// 0.038805
+0x28f8
+// 0.055045
+0x2b0c
+// 0.334330
+0x3559
+// 0.331283
+0x354d
+// 0.595124
+0x38c3
+// 0.443636
+0x3719
+// 0.612722
+0x38e7
+// 0.152437
+0x30e1
+// 0.311869
+0x34fd
+// 0.177173
+0x31ab
+// 0.540216
+0x3852
+// 0.941623
+0x3b88
+// 0.666747
+0x3955
+// 0.002608
+0x1958
+// 0.092911
+0x2df2
+// 0.129786
+0x3027
+// 0.710137
+0x39ae
+// 0.488728
+0x37d2
+// 0.478723
+0x37a9
+// 0.627602
+0x3905
+// 0.001795
+0x175b
+// 0.299980
+0x34cd
+// 0.636209
+0x3917
+// 0.102662
+0x2e92

+ 2 - 2
Testing/Source/Tests/BIQUADF16.cpp

@@ -10,8 +10,8 @@ Reference patterns are generated with
 a double precision computation.
 
 */
-#define REL_ERROR (3.0e-3)
-#define ABS_ERROR (3.5e-2)
+#define REL_ERROR (5.0e-2)
+#define ABS_ERROR (1.0e-1)
 
     void BIQUADF16::test_biquad_cascade_df1_ref()
     {

+ 1 - 1
Testing/Source/Tests/DistanceTestsF16.cpp

@@ -3,7 +3,7 @@
 #include "Error.h"
 #include "Test.h"
 
-#define REL_ERROR (2e-3)
+#define REL_ERROR (5e-3)
 
 #define REL_JS_ERROR (3e-2)
 

+ 3 - 3
Testing/Source/Tests/SupportTestsF16.cpp

@@ -7,8 +7,8 @@
 #define SNR_THRESHOLD 120
 #define REL_ERROR (1.0e-5)
 
-#define ABS_WEIGHTEDSUM_ERROR (5.0e-2)
-#define REL_WEIGHTEDSUM_ERROR (5.0e-2)
+#define ABS_WEIGHTEDSUM_ERROR (1.0e-1)
+#define REL_WEIGHTEDSUM_ERROR (5.0e-3)
 
 #define ABS_ERROR_F32 (1.0e-3)
 #define REL_ERROR_F32 (1.0e-3)
@@ -28,7 +28,7 @@ void SupportTestsF16::test_weighted_sum_f16()
  
  
  *outp=arm_weighted_sum_f16(inp, coefsp,this->nbSamples);
- 
+
  ASSERT_CLOSE_ERROR(*outp,refp[this->offset],ABS_WEIGHTEDSUM_ERROR,REL_WEIGHTEDSUM_ERROR);
  ASSERT_EMPTY_TAIL(output);
 

+ 7 - 7
Testing/bench.txt

@@ -754,7 +754,7 @@ group Root {
                 }
 
                 Functions {
-                   Cascaded BiQuad Filter DF2T_f64:test_biquad_cascade_df2T_f64
+                   Cascaded BiQuad Filter DF2T:test_biquad_cascade_df2T_f64
                 } -> PARAM1_ID
               }
            }
@@ -1570,8 +1570,8 @@ group Root {
                    Complex FFT:test_cfft_f32 -> CFFT_PARAM_ID
                    Real FFT:test_rfft_f32 -> RFFT_PARAM_ID
                    DCT4:test_dct4_f32 -> DCT_PARAM_ID
-                   Radix 4 Complex FFT:test_cfft_radix4_f32 -> CFFT4_PARAM_ID
-                   Radix 2 Complex FFT:test_cfft_radix2_f32 -> CFFT_PARAM_ID
+                   Scalar Radix 4 Complex FFT:test_cfft_radix4_f32 -> CFFT4_PARAM_ID
+                   Scalar Radix 2 Complex FFT:test_cfft_radix2_f32 -> CFFT_PARAM_ID
                 }
            }
 
@@ -1620,8 +1620,8 @@ group Root {
                    Complex FFT:test_cfft_q31 -> CFFT_PARAM_ID
                    Real FFT:test_rfft_q31 -> RFFT_PARAM_ID
                    DCT4:test_dct4_q31 -> DCT_PARAM_ID
-                   Radix 4 Complex FFT:test_cfft_radix4_q31 -> CFFT4_PARAM_ID
-                   Radix 2 Complex FFT:test_cfft_radix2_q31 -> CFFT_PARAM_ID
+                   Scalar Radix 4 Complex FFT:test_cfft_radix4_q31 -> CFFT4_PARAM_ID
+                   Scalar Radix 2 Complex FFT:test_cfft_radix2_q31 -> CFFT_PARAM_ID
                 }
            }
 
@@ -1670,8 +1670,8 @@ group Root {
                    Complex FFT:test_cfft_q15 -> CFFT_PARAM_ID
                    Real FFT:test_rfft_q15 -> RFFT_PARAM_ID
                    DCT4:test_dct4_q15 -> DCT_PARAM_ID
-                   Radix 4 Complex FFT:test_cfft_radix4_q15 -> CFFT4_PARAM_ID
-                   Radix 2 Complex FFT:test_cfft_radix2_q15 -> CFFT_PARAM_ID
+                   Scalar Radix 4 Complex FFT:test_cfft_radix4_q15 -> CFFT4_PARAM_ID
+                   Scalar Radix 2 Complex FFT:test_cfft_radix2_q15 -> CFFT_PARAM_ID
                 }
            }
         }

+ 2 - 2
Testing/bench_f16.txt

@@ -584,8 +584,8 @@ group Root {
                 Functions {
                    Complex FFT:test_cfft_f16 -> CFFT_PARAM_ID
                    Real FFT:test_rfft_f16 -> RFFT_PARAM_ID
-                   Radix 4 Complex FFT:test_cfft_radix4_f16 -> CFFT4_PARAM_ID
-                   Radix 2 Complex FFT:test_cfft_radix2_f16 -> CFFT_PARAM_ID
+                   Scalar Radix 4 Complex FFT:test_cfft_radix4_f16 -> CFFT4_PARAM_ID
+                   Scalar Radix 2 Complex FFT:test_cfft_radix2_f16 -> CFFT_PARAM_ID
                 }
            }