Просмотр исходного кода

CMSIS-DSP: Improvements to MVE code for min/max.

Christophe Favergeon 4 лет назад
Родитель
Сommit
5f38ab24d5

+ 9 - 0
CMSIS/DSP/Platforms/FVP/ARMv81MML/Startup/AC6/startup_ARMv81MML.c

@@ -28,6 +28,11 @@
   #error device not specified!
 #endif
 
+#define SERIAL_BASE_ADDRESS (0xA8000000ul)
+
+#define SERIAL_DATA  *((volatile unsigned *) SERIAL_BASE_ADDRESS)
+
+
 /*----------------------------------------------------------------------------
   Exception / Interrupt Handler Function Prototype
  *----------------------------------------------------------------------------*/
@@ -138,6 +143,8 @@ void Reset_Handler(void)
  *----------------------------------------------------------------------------*/
 void HardFault_Handler(void)
 {
+  SERIAL_DATA = 'H';
+  SERIAL_DATA = '\n';
   while(1);
 }
 
@@ -146,5 +153,7 @@ void HardFault_Handler(void)
  *----------------------------------------------------------------------------*/
 void Default_Handler(void)
 {
+  SERIAL_DATA = 'D';
+  SERIAL_DATA = '\n';
   while(1);
 }

+ 34 - 63
CMSIS/DSP/Source/StatisticsFunctions/arm_max_q15.c

@@ -55,78 +55,49 @@ void arm_max_q15(
         q15_t * pResult,
         uint32_t * pIndex)
 {
-    uint32_t blkCnt;           /* loop counters */
-    q15x8_t vecSrc;
-    q15x8_t curExtremValVec = vdupq_n_s16(Q15_MIN);
-    q15_t maxValue = Q15_MIN, temp;
-    uint32_t  idx = blockSize;
-    uint16x8_t indexVec;
-    uint16x8_t curExtremIdxVec;
-    mve_pred16_t p0;
-
-
-    indexVec = vidupq_u16((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u16(0);
-
-    blkCnt = blockSize >> 3;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrhq_s16(pSrc);  
-        pSrc += 8;
+    int32_t         blkCnt;     /* loop counters */
+    q15x8_t         extremValVec = vdupq_n_s16(Q15_MIN);
+    q15_t           maxValue = Q15_MIN;
+    uint16x8_t      indexVec;
+    uint16x8_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint16_t        extremIdxArr[8];
+
+    indexVec = vidupq_u16(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp16q(blkCnt);
+        q15x8_t         extremIdxVal = vld1q_z(pSrc, p);
         /*
          * Get current max per lane and current index per lane
          * when a max is selected
          */
-        p0 = vcmpgeq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  8;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-   
-    /*
-     * Get max value across the vector
-     */
-    maxValue = vmaxvq(maxValue, curExtremValVec);
-    /*
-     * set index for lower values to max possible index
-     */
-    p0 = vcmpgeq(curExtremValVec, maxValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
-    /*
-     * Get min index which is thus for a max value
-     */
-    idx = vminvq(idx, indexVec);
-
-    /* Tail */
-    blkCnt = blockSize & 0x7;
-    while (blkCnt > 0U)
-    {
-      /* Initialize temp to the next consecutive values one by one */
-      temp = *pSrc++;
-  
-      /* compare for the maximum value */
-      if (maxValue < temp)
-      {
-        /* Update the maximum value and it's index */
-        maxValue = temp;
-        idx = blockSize - blkCnt;
-      }
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p(extremIdxArr, indexVec, p0);
+
+        indexVec += 8;
+        pSrc += 8;
+        blkCnt -= 8;
     }
+    while (blkCnt > 0);
+
 
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* set index for lower values to max possible index   */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
     *pResult = maxValue;
 }
+
 #else
 void arm_max_q15(
   const q15_t * pSrc,

+ 38 - 70
CMSIS/DSP/Source/StatisticsFunctions/arm_max_q31.c

@@ -50,86 +50,54 @@
 #include "arm_helium_utils.h"
 
 void arm_max_q31(
-  const q31_t * pSrc,
-        uint32_t blockSize,
-        q31_t * pResult,
-        uint32_t * pIndex)
+    const q31_t * pSrc,
+    uint32_t blockSize,
+    q31_t * pResult,
+    uint32_t * pIndex)
 {
-    uint32_t  blkCnt;           /* loop counters */
-    q31x4_t vecSrc;
-    q31x4_t curExtremValVec = vdupq_n_s32( Q31_MIN);
-    q31_t maxValue = Q31_MIN;
-    q31_t temp;
-    uint32_t  idx = blockSize;
-    uint32x4_t indexVec;
-    uint32x4_t curExtremIdxVec;
-    mve_pred16_t p0;
-
-
-    indexVec = vidupq_u32((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u32(0);
-
-    /* Compute 4 outputs at a time */
-    blkCnt = blockSize >> 2U;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrwq_s32(pSrc);  
-        pSrc += 4;
+    int32_t         blkCnt;     /* loop counters */
+    q31x4_t         extremValVec = vdupq_n_s32(Q31_MIN);
+    q31_t           maxValue = Q31_MIN;
+    uint32x4_t      indexVec;
+    uint32x4_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint32_t        extremIdxArr[4];
+
+    indexVec = vidupq_u32(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);
+        q31x4_t         extremIdxVal = vld1q_z(pSrc, p);
         /*
          * Get current max per lane and current index per lane
          * when a max is selected
          */
-        p0 = vcmpgeq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  4;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-   
-    /*
-     * Get max value across the vector
-     */
-    maxValue = vmaxvq(maxValue, curExtremValVec);
-    /*
-     * set index for lower values to max possible index
-     */
-    p0 = vcmpgeq(curExtremValVec, maxValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
-    /*
-     * Get min index which is thus for a max value
-     */
-    idx = vminvq(idx, indexVec);
-
-    /* Tail */
-    blkCnt = blockSize & 0x3;
-
-    while (blkCnt > 0U)
-    {
-       /* Initialize maxVal to the next consecutive values one by one */
-       temp = *pSrc++;
-   
-       /* compare for the maximum value */
-       if (maxValue < temp)
-       {
-         /* Update the maximum value and it's index */
-         maxValue = temp;
-         idx = blockSize - blkCnt;
-       }
-
-       /* Decrement loop counter */
-       blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p(extremIdxArr, indexVec, p0);
+
+        indexVec += 4;
+        pSrc += 4;
+        blkCnt -= 4;
     }
+    while (blkCnt > 0);
 
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* set index for lower values to max possible index   */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
     *pResult = maxValue;
 }
+
 #else
 void arm_max_q31(
   const q31_t * pSrc,

+ 42 - 74
CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c

@@ -51,83 +51,50 @@
 
 static void arm_small_blk_max_q7(
     const q7_t * pSrc,
-    uint8_t blockSize,
+    uint16_t blockSize,
     q7_t * pResult,
     uint32_t * pIndex)
 {
-    uint32_t        blkCnt;           /* loop counters */
-    q7x16_t         vecSrc;
-    q7x16_t         curExtremValVec = vdupq_n_s8( Q7_MIN);
-    q7_t            maxValue = Q7_MIN, temp;
-    uint32_t        idx = blockSize;
-    uint8x16_t      indexVec;
-    uint8x16_t      curExtremIdxVec;
-    mve_pred16_t    p0;
-
-
-    indexVec = vidupq_u8((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u8(0);
-
-    blkCnt = blockSize >> 4;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrbq_s8(pSrc);  
-        pSrc += 16;
+    int32_t        blkCnt;     /* loop counters */
+    q7x16_t        extremValVec = vdupq_n_s8(Q7_MIN);
+    q7_t           maxValue = Q7_MIN;
+    uint8x16_t     indexVec;
+    uint8x16_t     extremIdxVec;
+    mve_pred16_t   p0;
+    uint8_t        extremIdxArr[16];
+
+    indexVec = vidupq_u8(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp8q(blkCnt);
+        q7x16_t         extremIdxVal = vld1q_z(pSrc, p);
         /*
          * Get current max per lane and current index per lane
          * when a max is selected
          */
-        p0 = vcmpgeq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  16;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-   
-    
-    /*
-     * Get max value across the vector
-     */
-    maxValue = vmaxvq(maxValue, curExtremValVec);
-    /*
-     * set index for lower values to max possible index
-     */
-    p0 = vcmpgeq(curExtremValVec, maxValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u8(blockSize), p0);
-    /*
-     * Get min index which is thus for a max value
-     */
-    idx = vminvq(idx, indexVec);
-
-    /*
-     * tail
-     */
-    blkCnt = blockSize & 0xF;
-
-    while (blkCnt > 0U)
-    {
-      /* Initialize temp to the next consecutive values one by one */
-      temp = *pSrc++;
-  
-      /* compare for the maximum value */
-      if (maxValue < temp)
-      {
-        /* Update the maximum value and it's index */
-        maxValue = temp;
-        idx = blockSize - blkCnt;
-      }
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p(extremIdxArr, indexVec, p0);
+
+        indexVec += 16;
+        pSrc += 16;
+        blkCnt -= 16;
     }
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+    while (blkCnt > 0);
+
+
+    /* Get max value across the vector   */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* set index for lower values to max possible index   */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0);
+    *pIndex = vminvq_u8(blockSize - 1, indexVec);
     *pResult = maxValue;
 }
 
@@ -138,8 +105,9 @@ void arm_max_q7(
         uint32_t * pIndex)
 {
     int32_t   totalSize = blockSize;
+    const uint16_t sub_blk_sz = UINT8_MAX + 1;
 
-    if (totalSize <= UINT8_MAX)
+    if (totalSize <= sub_blk_sz)
     {
         arm_small_blk_max_q7(pSrc, blockSize, pResult, pIndex);
     }
@@ -152,11 +120,11 @@ void arm_max_q7(
         /*
          * process blocks of 255 elts
          */
-        while (totalSize >= UINT8_MAX)
+        while (totalSize >= sub_blk_sz)
         {
             const q7_t     *curSrc = pSrc;
 
-            arm_small_blk_max_q7(curSrc, UINT8_MAX, pResult, pIndex);
+            arm_small_blk_max_q7(curSrc, sub_blk_sz, pResult, pIndex);
             if (*pResult > curBlkExtr)
             {
                 /*
@@ -167,8 +135,8 @@ void arm_max_q7(
                 curBlkIdx = curIdx;
             }
             curIdx++;
-            pSrc += UINT8_MAX;
-            totalSize -= UINT8_MAX;
+            pSrc += sub_blk_sz;
+            totalSize -= sub_blk_sz;
         }
         /*
          * remainder
@@ -180,7 +148,7 @@ void arm_max_q7(
             curBlkPos = *pIndex;
             curBlkIdx = curIdx;
         }
-        *pIndex = curBlkIdx * UINT8_MAX + curBlkPos;
+        *pIndex = curBlkIdx * sub_blk_sz + curBlkPos;
         *pResult = curBlkExtr;
     }
 }

+ 31 - 62
CMSIS/DSP/Source/StatisticsFunctions/arm_min_q15.c

@@ -56,79 +56,48 @@ void arm_min_q15(
         q15_t * pResult,
         uint32_t * pIndex)
 {
-    uint32_t  blkCnt;           /* loop counters */
-    q15x8_t vecSrc;
-    q15x8_t curExtremValVec = vdupq_n_s16(Q15_MAX);
-    q15_t minValue = Q15_MAX,temp;
-    uint32_t  idx = blockSize;
-    uint16x8_t indexVec;
-    uint16x8_t curExtremIdxVec;
-    mve_pred16_t p0;
 
+    int32_t         blkCnt;     /* loop counters */
+    q15x8_t         extremValVec = vdupq_n_s16(Q15_MAX);
+    q15_t           minValue = Q15_MAX;
+    uint16x8_t      indexVec;
+    uint16x8_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint16_t        extremIdxArr[8];
 
-    indexVec = vidupq_u16((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u16(0);
+    indexVec = vidupq_u16(0U, 1);
 
-    blkCnt = blockSize >> 3;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrhq_s16(pSrc);  
-        pSrc += 8;
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp16q(blkCnt);
+        q15x8_t         extremIdxVal = vld1q_z(pSrc, p);
         /*
          * Get current min per lane and current index per lane
          * when a min is selected
          */
-        p0 = vcmpleq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpleq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  8;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-   
-    /*
-     * Get min value across the vector
-     */
-    minValue = vminvq(minValue, curExtremValVec);
-    /*
-     * set index for lower values to min possible index
-     */
-    p0 = vcmpleq(curExtremValVec, minValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
-    /*
-     * Get min index which is thus for a min value
-     */
-    idx = vminvq(idx, indexVec);
-
-    /*
-     * tail
-    */
-    blkCnt = blockSize & 7;
-    while (blkCnt > 0U)
-    {
-      /* Initialize minVal to the next consecutive values one by one */
-      temp = *pSrc++;
-  
-      /* compare for the minimum value */
-      if (minValue > temp)
-      {
-        /* Update the minimum value and it's index */
-        minValue = temp;
-        idx = blockSize - blkCnt;
-      }
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p(extremIdxArr, indexVec, p0);
+
+        indexVec += 8;
+        pSrc += 8;
+        blkCnt -= 8;
     }
+    while (blkCnt > 0);
+
+    /* Get min value across the vector   */
+    minValue = vminvq(minValue, extremValVec);
+
+    /* set index for lower values to min possible index   */
+    p0 = vcmpleq(extremValVec, minValue);
+    extremIdxVec = vld1q(extremIdxArr);
 
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+    indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
     *pResult = minValue;
+ 
 }
 #else
 void arm_min_q15(

+ 35 - 65
CMSIS/DSP/Source/StatisticsFunctions/arm_min_q31.c

@@ -56,79 +56,49 @@ void arm_min_q31(
         q31_t * pResult,
         uint32_t * pIndex)
 {
-    uint32_t  blkCnt;           /* loop counters */
-    q31x4_t vecSrc;
-    q31x4_t curExtremValVec = vdupq_n_s32(Q31_MAX);
-    q31_t minValue = Q31_MAX, temp;
-    uint32_t  idx = blockSize;
-    uint32x4_t indexVec;
-    uint32x4_t curExtremIdxVec;
-    mve_pred16_t p0;
-
-
-    indexVec = vidupq_u32((uint32_t)0, 1);
-    curExtremIdxVec = vdupq_n_u32(0);
-
-    /* Compute 4 outputs at a time */
-    blkCnt = blockSize >> 2U;
-    while (blkCnt > 0U)
-    {
-        vecSrc = vldrwq_s32(pSrc);  
-        pSrc += 4;
+    int32_t         blkCnt;     /* loop counters */
+    q31x4_t         extremValVec = vdupq_n_s32(Q31_MAX);
+    q31_t           minValue = Q31_MAX;
+    uint32x4_t      indexVec;
+    uint32x4_t      extremIdxVec;
+    mve_pred16_t    p0;
+    uint32_t        extremIdxArr[4];
+
+    indexVec = vidupq_u32(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t    p = vctp32q(blkCnt);
+        q31x4_t         extremIdxVal = vld1q_z(pSrc, p);
         /*
          * Get current min per lane and current index per lane
          * when a min is selected
          */
-        p0 = vcmpleq(vecSrc, curExtremValVec);
-        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
-        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+        p0 = vcmpleq_m(extremIdxVal, extremValVec, p);
 
-        indexVec = indexVec +  4;
-        /*
-         * Decrement the blockSize loop counter
-         */
-        blkCnt--;
-    }
-    
-    /*
-     * Get min value across the vector
-     */
-    minValue = vminvq(minValue, curExtremValVec);
-    /*
-     * set index for lower values to min possible index
-     */
-    p0 = vcmpleq(curExtremValVec, minValue);
-    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
-    /*
-     * Get min index which is thus for a min value
-     */
-    idx = vminvq(idx, indexVec);
-
-
-    /* Tail */
-    blkCnt = blockSize & 0x3;
-    while (blkCnt > 0U)
-    {
-      /* Initialize temp to the next consecutive values one by one */
-      temp = *pSrc++;
-  
-      /* compare for the minimum value */
-      if (minValue > temp)
-      {
-        /* Update the minimum value and it's index */
-        minValue = temp;
-        idx = blockSize - blkCnt;
-      }
-  
-      /* Decrement loop counter */
-      blkCnt--;
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p(extremIdxArr, indexVec, p0);
+
+        indexVec += 4;
+        pSrc += 4;
+        blkCnt -= 4;
     }
-    /*
-     * Save result
-     */
-    *pIndex = idx;
+    while (blkCnt > 0);
+
+
+    /* Get min value across the vector   */
+    minValue = vminvq(minValue, extremValVec);
+
+    /* set index for lower values to min possible index   */
+    p0 = vcmpleq(extremValVec, minValue);
+    extremIdxVec = vld1q(extremIdxArr);
+
+    indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
     *pResult = minValue;
 }
+
 #else
 void arm_min_q31(
   const q31_t * pSrc,