|
|
@@ -51,83 +51,50 @@
|
|
|
|
|
|
static void arm_small_blk_max_q7(
|
|
|
const q7_t * pSrc,
|
|
|
- uint8_t blockSize,
|
|
|
+ uint16_t blockSize,
|
|
|
q7_t * pResult,
|
|
|
uint32_t * pIndex)
|
|
|
{
|
|
|
- uint32_t blkCnt; /* loop counters */
|
|
|
- q7x16_t vecSrc;
|
|
|
- q7x16_t curExtremValVec = vdupq_n_s8( Q7_MIN);
|
|
|
- q7_t maxValue = Q7_MIN, temp;
|
|
|
- uint32_t idx = blockSize;
|
|
|
- uint8x16_t indexVec;
|
|
|
- uint8x16_t curExtremIdxVec;
|
|
|
- mve_pred16_t p0;
|
|
|
-
|
|
|
-
|
|
|
- indexVec = vidupq_u8((uint32_t)0, 1);
|
|
|
- curExtremIdxVec = vdupq_n_u8(0);
|
|
|
-
|
|
|
- blkCnt = blockSize >> 4;
|
|
|
- while (blkCnt > 0U)
|
|
|
- {
|
|
|
- vecSrc = vldrbq_s8(pSrc);
|
|
|
- pSrc += 16;
|
|
|
+ int32_t blkCnt; /* loop counters */
|
|
|
+ q7x16_t extremValVec = vdupq_n_s8(Q7_MIN);
|
|
|
+ q7_t maxValue = Q7_MIN;
|
|
|
+ uint8x16_t indexVec;
|
|
|
+ uint8x16_t extremIdxVec;
|
|
|
+ mve_pred16_t p0;
|
|
|
+ uint8_t extremIdxArr[16];
|
|
|
+
|
|
|
+ indexVec = vidupq_u8(0U, 1);
|
|
|
+
|
|
|
+ blkCnt = blockSize;
|
|
|
+ do {
|
|
|
+ mve_pred16_t p = vctp8q(blkCnt);
|
|
|
+ q7x16_t extremIdxVal = vld1q_z(pSrc, p);
|
|
|
/*
|
|
|
* Get current max per lane and current index per lane
|
|
|
* when a max is selected
|
|
|
*/
|
|
|
- p0 = vcmpgeq(vecSrc, curExtremValVec);
|
|
|
- curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
|
|
|
- curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
|
|
|
+ p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
|
|
|
|
|
|
- indexVec = indexVec + 16;
|
|
|
- /*
|
|
|
- * Decrement the blockSize loop counter
|
|
|
- */
|
|
|
- blkCnt--;
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- /*
|
|
|
- * Get max value across the vector
|
|
|
- */
|
|
|
- maxValue = vmaxvq(maxValue, curExtremValVec);
|
|
|
- /*
|
|
|
- * set index for lower values to max possible index
|
|
|
- */
|
|
|
- p0 = vcmpgeq(curExtremValVec, maxValue);
|
|
|
- indexVec = vpselq(curExtremIdxVec, vdupq_n_u8(blockSize), p0);
|
|
|
- /*
|
|
|
- * Get min index which is thus for a max value
|
|
|
- */
|
|
|
- idx = vminvq(idx, indexVec);
|
|
|
-
|
|
|
- /*
|
|
|
- * tail
|
|
|
- */
|
|
|
- blkCnt = blockSize & 0xF;
|
|
|
-
|
|
|
- while (blkCnt > 0U)
|
|
|
- {
|
|
|
- /* Initialize temp to the next consecutive values one by one */
|
|
|
- temp = *pSrc++;
|
|
|
-
|
|
|
- /* compare for the maximum value */
|
|
|
- if (maxValue < temp)
|
|
|
- {
|
|
|
- /* Update the maximum value and it's index */
|
|
|
- maxValue = temp;
|
|
|
- idx = blockSize - blkCnt;
|
|
|
- }
|
|
|
-
|
|
|
- /* Decrement loop counter */
|
|
|
- blkCnt--;
|
|
|
+ extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
|
|
|
+ /* store per-lane extrema indexes */
|
|
|
+ vst1q_p(extremIdxArr, indexVec, p0);
|
|
|
+
|
|
|
+ indexVec += 16;
|
|
|
+ pSrc += 16;
|
|
|
+ blkCnt -= 16;
|
|
|
}
|
|
|
- /*
|
|
|
- * Save result
|
|
|
- */
|
|
|
- *pIndex = idx;
|
|
|
+ while (blkCnt > 0);
|
|
|
+
|
|
|
+
|
|
|
+ /* Get max value across the vector */
|
|
|
+ maxValue = vmaxvq(maxValue, extremValVec);
|
|
|
+
|
|
|
+ /* set index for lower values to max possible index */
|
|
|
+ p0 = vcmpgeq(extremValVec, maxValue);
|
|
|
+ extremIdxVec = vld1q(extremIdxArr);
|
|
|
+
|
|
|
+ indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0);
|
|
|
+ *pIndex = vminvq_u8(blockSize - 1, indexVec);
|
|
|
*pResult = maxValue;
|
|
|
}
|
|
|
|
|
|
@@ -138,8 +105,9 @@ void arm_max_q7(
|
|
|
uint32_t * pIndex)
|
|
|
{
|
|
|
int32_t totalSize = blockSize;
|
|
|
+ const uint16_t sub_blk_sz = UINT8_MAX + 1;
|
|
|
|
|
|
- if (totalSize <= UINT8_MAX)
|
|
|
+ if (totalSize <= sub_blk_sz)
|
|
|
{
|
|
|
arm_small_blk_max_q7(pSrc, blockSize, pResult, pIndex);
|
|
|
}
|
|
|
@@ -152,11 +120,11 @@ void arm_max_q7(
|
|
|
/*
|
|
|
* process blocks of 255 elts
|
|
|
*/
|
|
|
- while (totalSize >= UINT8_MAX)
|
|
|
+ while (totalSize >= sub_blk_sz)
|
|
|
{
|
|
|
const q7_t *curSrc = pSrc;
|
|
|
|
|
|
- arm_small_blk_max_q7(curSrc, UINT8_MAX, pResult, pIndex);
|
|
|
+ arm_small_blk_max_q7(curSrc, sub_blk_sz, pResult, pIndex);
|
|
|
if (*pResult > curBlkExtr)
|
|
|
{
|
|
|
/*
|
|
|
@@ -167,8 +135,8 @@ void arm_max_q7(
|
|
|
curBlkIdx = curIdx;
|
|
|
}
|
|
|
curIdx++;
|
|
|
- pSrc += UINT8_MAX;
|
|
|
- totalSize -= UINT8_MAX;
|
|
|
+ pSrc += sub_blk_sz;
|
|
|
+ totalSize -= sub_blk_sz;
|
|
|
}
|
|
|
/*
|
|
|
* remainder
|
|
|
@@ -180,7 +148,7 @@ void arm_max_q7(
|
|
|
curBlkPos = *pIndex;
|
|
|
curBlkIdx = curIdx;
|
|
|
}
|
|
|
- *pIndex = curBlkIdx * UINT8_MAX + curBlkPos;
|
|
|
+ *pIndex = curBlkIdx * sub_blk_sz + curBlkPos;
|
|
|
*pResult = curBlkExtr;
|
|
|
}
|
|
|
}
|