
DSP: Cleanup of type mismatches (#1327)

The IAR compiler emits more warnings about type mismatches than the
other compilers do. This commit cleans a lot of them up.

Signed-off-by: TTornblom <thomas.tornblom@iar.com>
Thomas Törnblom, 4 years ago
parent
commit
97a91fdd38
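
The recurring fix below is to make conversions between equally sized MVE vector types explicit. IAR diagnoses passing, for example, an int16x8_t where an intrinsic prototype expects uint16x8_t, even though only the lane type changes and the 128 bits are reused as-is. A minimal sketch of the pattern (halve_offsets and its variables are illustrative names, not from this commit):

    #include <arm_mve.h>   /* MVE intrinsics; needs an MVE-capable target */

    int16x8_t halve_offsets(int16x8_t offs)
    {
        /* vshrq_n_u16 is prototyped for uint16x8_t, so passing a signed
           vector draws a type-mismatch warning on IAR. The C-style cast
           reinterprets the lanes without changing any bits. */
        uint16x8_t u = vshrq_n_u16((uint16x8_t)offs, 1);
        return (int16x8_t)u;
    }

The same reinterpretation can be spelled vreinterpretq_u16_s16()/vreinterpretq_s16_u16(); the commit uses C-style casts, which GCC, Clang, and IAR all accept for these vector types.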

+ 18 - 18
CMSIS/DSP/PrivateInclude/arm_vec_fft.h

@@ -150,20 +150,20 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
     bitRevTabOff = vldrhq_u16(pBitRevTab);
     pBitRevTab += 8;
 
-    bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-    bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-    bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-    bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+    bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+    bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+    bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+    bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
 
     blkCnt = (bitRevLen / 16);
     while (blkCnt > 0) {
         bitRevTabOff = vldrhq_u16(pBitRevTab);
         pBitRevTab += 8;
 
-        bitRevOff1Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff1High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff1Low = vshrq_n_u16(bitRevOff1Low, 3);
-        bitRevOff1High = vshrq_n_u16(bitRevOff1High, 3);
+        bitRevOff1Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff1High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff1Low = vshrq_n_u16((uint16x8_t)bitRevOff1Low, 3);
+        bitRevOff1High = vshrq_n_u16((uint16x8_t)bitRevOff1High, 3);
 
         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff0Low);
         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff0High);
@@ -175,10 +175,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         bitRevTabOff = vldrhq_u16(pBitRevTab);
         pBitRevTab += 8;
 
-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
 
         inLow = vldrwq_gather_shifted_offset_u32(src, bitRevOff1Low);
         inHigh = vldrwq_gather_shifted_offset_u32(src, bitRevOff1High);
@@ -209,10 +209,10 @@ __STATIC_INLINE void arm_bitreversal_16_inpl_mve(
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0Low, inHigh);
         vstrwq_scatter_shifted_offset_u32(src, bitRevOff0High, inLow);
 
-        bitRevOff0Low = vmullbq_int_u16(bitRevTabOff, one);
-        bitRevOff0High = vmulltq_int_u16(bitRevTabOff, one);
-        bitRevOff0Low = vshrq_n_u16(bitRevOff0Low, 3);
-        bitRevOff0High = vshrq_n_u16(bitRevOff0High, 3);
+        bitRevOff0Low = vmullbq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0High = vmulltq_int_u16((uint16x8_t)bitRevTabOff, one);
+        bitRevOff0Low = vshrq_n_u16((uint16x8_t)bitRevOff0Low, 3);
+        bitRevOff0High = vshrq_n_u16((uint16x8_t)bitRevOff0High, 3);
 
         inLow = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0Low, p);
         inHigh = vldrwq_gather_shifted_offset_z_u32(src, bitRevOff0High, p);
@@ -251,13 +251,13 @@ __STATIC_INLINE void arm_bitreversal_32_outpl_mve(void *pDst, void *pSrc, uint32
     while (blkCnt > 0) {
         uint64x2_t      vecIn;
 
-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs0);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs0);
         idxOffs0 = idxOffs0 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
         bitRevOffs0 = vbrsrq(idxOffs0, bitRevPos);
 
-        vecIn = vldrdq_gather_offset_u64(pSrc, (int64x2_t) bitRevOffs1);
+        vecIn = vldrdq_gather_offset_u64(pSrc, (uint64x2_t) bitRevOffs1);
         idxOffs1 = idxOffs1 + 16;
         vst1q(pDst32, (uint32x4_t) vecIn);
         pDst32 += 4;
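
Two distinct mismatches are fixed in this header. vmullbq_int_u16 and vmulltq_int_u16 widen the even and odd uint16 lanes into uint32 products, so the offset registers are handled under both uint16x8_t and uint32x4_t lane views, and the casts name that bitwise reinterpretation explicitly. Separately, vldrdq_gather_offset_u64 takes its offsets as uint64x2_t, so the earlier (int64x2_t) casts were themselves mismatched. A reinterpret between equally sized MVE vectors compiles to no code, only a type change (sketch, illustrative name):

    #include <arm_mve.h>

    /* Same 128 bits, different lane view; both spellings are bitwise. */
    uint16x8_t as_u16_lanes(uint32x4_t v)
    {
        return (uint16x8_t)v;            /* C-style cast, as in the commit */
        /* equivalently: vreinterpretq_u16_u32(v) */
    }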

+ 17 - 17
CMSIS/DSP/Source/FastMathFunctions/arm_vlog_f16.c

@@ -32,9 +32,9 @@
 #if defined(ARM_FLOAT16_SUPPORTED)
 
 /* Degree of the polynomial approximation */
-#define NB_DEG_LOGF16 3 
+#define NB_DEG_LOGF16 3
 
-/* 
+/*
 Related to the Log2 of the number of approximations.
 For instance, with 3 there are 1 + 2^3 polynomials
 */
@@ -55,7 +55,7 @@ nb = 3;
 deg = 3;
 lut = Table[
    MiniMaxApproximation[
-     Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0}, 
+     Log[x/2^nb + i], {x, {10^-6, 1.0/2^nb}, deg, 0},
      MaxIterations -> 1000][[2, 1]], {i, 1, 2, (1.0/2^nb)}];
 coefs = Chop@Flatten[CoefficientList[lut, x]];
 
@@ -75,8 +75,8 @@ static float16_t lut_logf16[NB_LUT_LOGF16]={
 float16_t logf16_scalar(float16_t x)
 {
     int16_t i =  arm_typecast_s16_f16(x);
-     
-    int32_t vecExpUnBiased = (i >> 10) - 15;  
+
+    int32_t vecExpUnBiased = (i >> 10) - 15;
     i = i - (vecExpUnBiased << 10);
     float16_t vecTmpFlt1 = arm_typecast_f16_s16(i);
 
@@ -85,7 +85,7 @@ float16_t logf16_scalar(float16_t x)
     float16_t tmp,v;
 
     tmp = ((_Float16)vecTmpFlt1 - 1.0f16) * (1 << NB_DIV_LOGF16);
-    n = floor((double)tmp);
+    n = (int)floor((double)tmp);
     v = (_Float16)tmp - (_Float16)n;
 
     lut = lut_logf16 + n * (1+NB_DEG_LOGF16);
@@ -110,9 +110,9 @@ float16_t logf16_scalar(float16_t x)
 
 float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 {
-    int16x8_t i =  vreinterpretq_s16_f16(vecIn);      
-     
-    int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15); 
+    int16x8_t i =  vreinterpretq_s16_f16(vecIn);
+
+    int16x8_t vecExpUnBiased = vsubq_n_s16(vshrq_n_s16(i,10), 15);
     i = vsubq_s16(i,vshlq_n_s16(vecExpUnBiased,10));
     float16x8_t vecTmpFlt1 = vreinterpretq_f16_s16(i);
 
@@ -132,12 +132,12 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
     offset = vmulq_n_s16(n,(1+NB_DEG_LOGF16));
     offset = vaddq_n_s16(offset,NB_DEG_LOGF16-1);
 
-    res = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+    res = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
     offset = vsubq_n_s16(offset,1);
 
     for(int j=NB_DEG_LOGF16-2; j >=0 ; j--)
     {
-       lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,offset);
+       lutV = vldrhq_gather_shifted_offset_f16(lut_logf16,(uint16x8_t)offset);
        res = vfmaq_f16(lutV,v,res);
        offset = vsubq_n_s16(offset,1);
 
@@ -150,7 +150,7 @@ float16x8_t vlogq_lut_f16(float16x8_t vecIn)
 
 }
 
-#endif 
+#endif
 
 /**
   @ingroup groupFastMath
@@ -175,9 +175,9 @@ void arm_vlog_f16(
         float16_t * pDst,
         uint32_t blockSize)
 {
-   uint32_t blkCnt; 
+   uint32_t blkCnt;
 
-#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)   
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
    f16x8_t src;
    f16x8_t dst;
 
@@ -203,10 +203,10 @@ void arm_vlog_f16(
    while (blkCnt > 0U)
    {
       /* C = log(A) */
-  
+
       /* Calculate log and store result in destination buffer. */
       *pDst++ = logf16_scalar(*pSrc++);
-  
+
       /* Decrement loop counter */
       blkCnt--;
    }
@@ -219,4 +219,4 @@ void arm_vlog_f16(
  */
 
 
-#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */ 
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
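
For context on the scalar path above: it peels the biased exponent off the raw float16 bits, remaps the mantissa into [1, 2), evaluates a small per-interval polynomial from lut_logf16, and adds the exponent contribution back as a multiple of log(2). A plain-float sketch of that decomposition, with frexpf standing in for the bit twiddling and logf for the polynomial LUT (illustrative, not the shipped kernel):

    #include <math.h>

    /* log(x) = e*log(2) + log(m), where x = m * 2^e and 0.5 <= m < 1. */
    static float log_by_decomposition(float x)
    {
        int e;
        float m = frexpf(x, &e);          /* split x into m * 2^e */
        return (float)e * 0.693147181f + logf(m);
    }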

+ 19 - 19
CMSIS/DSP/Source/FastMathFunctions/arm_vlog_q15.c

@@ -32,9 +32,9 @@
 
 #define LOG_Q15_ACCURACY 15
 
-/* Bit to represent the normalization factor 
+/* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q15_ACCURACY]] of the previous value.
-   The Log2 algorithm is assuming that the value x is 
+   The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
 
    But input value could be as small a 2^-LOG_Q15_ACCURACY
@@ -90,7 +90,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
 
 
    /* Compute the Log2. Result is in q11 instead of q16
-      because we know 0 <= y < 1.0 but 
+      because we know 0 <= y < 1.0 but
       we want a result allowing to do a
       product on int16 rather than having to go
       through int32
@@ -98,7 +98,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
    for(i = 0; i < LOG_Q15_ACCURACY ; i++)
    {
       x = (((int32_t)x*x)) >> (LOG_Q15_ACCURACY - 1);
-    
+
       if (x >= LOQ_Q15_THRESHOLD)
       {
          y += inc ;
@@ -108,10 +108,10 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
    }
 
 
-   /* 
+   /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-      
+
    */
 
    /* q11 */
@@ -120,7 +120,7 @@ static uint16_t arm_scalar_log_q15(uint16_t src)
 
    /* q4.11 */
    y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
-  
+
    return(y);
 
 }
@@ -146,11 +146,11 @@ q15x8_t vlogq_q15(q15x8_t src)
 
    /* q11 */
    uint16x8_t y = vdupq_n_u16(0);
-   
+
 
    /* q11 */
    int16x8_t vtmp;
-  
+
 
    mve_pred16_t p;
 
@@ -158,11 +158,11 @@ q15x8_t vlogq_q15(q15x8_t src)
 
 
    vtmp = vsubq_n_s16(c,1);
-   x = vshlq_u16(src,vtmp);
+   x = vshlq_u16((uint16x8_t)src,vtmp);
 
 
    /* Compute the Log2. Result is in q11 instead of q16
-      because we know 0 <= y < 1.0 but 
+      because we know 0 <= y < 1.0 but
       we want a result allowing to do a
       product on int16 rather than having to go
       through int32
@@ -172,7 +172,7 @@ q15x8_t vlogq_q15(q15x8_t src)
       x = vmulhq_u16(x,x);
       x = vshlq_n_u16(x,2);
 
-      
+
       p = vcmphiq_u16(x,vdupq_n_u16(LOQ_Q15_THRESHOLD));
       y = vaddq_m_n_u16(y, y,inc,p);
       x = vshrq_m_n_u16(x,x,1,p);
@@ -181,26 +181,26 @@ q15x8_t vlogq_q15(q15x8_t src)
    }
 
 
-   /* 
+   /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-      
+
    */
 
    /* q11 */
    // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
    vtmp = vshlq_n_s16(normalization,LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART);
-   vtmp = vsubq_s16(y,vtmp);
+   vtmp = vsubq_s16((int16x8_t)y,vtmp);
+
+
 
-   
-  
    /* q4.11 */
    // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
    vtmp = vqdmulhq_n_s16(vtmp,LOG_Q15_INVLOG2EXP);
 
    return(vtmp);
 }
-#endif 
+#endif
 
 /**
   @ingroup groupFastMath
@@ -248,7 +248,7 @@ void arm_vlog_q15(
   blkCnt = blockSize & 7;
   #else
   blkCnt = blockSize;
-  #endif 
+  #endif
 
   while (blkCnt > 0U)
   {
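
The loop above is the classic shift-and-square log2: with x normalized into [1, 2), squaring x doubles its log, so each iteration decides one fractional result bit from whether x*x crossed 2.0, renormalizing when it did. A portable scalar sketch of the same idea (plain C, illustrative; the kernel accumulates the bits with a decreasing inc instead of shifting y):

    #include <stdint.h>

    /* Fractional bits of log2(x) for x in [1.0, 2.0) held as Q15,
       i.e. 0x8000 <= x <= 0xFFFF. Returns a Q15 fraction. */
    static uint16_t log2_frac_q15(uint32_t x)
    {
        uint16_t y = 0;
        for (int i = 0; i < 15; i++) {
            x = (x * x) >> 15;           /* square: the log doubles   */
            y <<= 1;
            if (x >= (2u << 15)) {       /* crossed 2.0: emit a 1 bit */
                y |= 1u;
                x >>= 1;                 /* renormalize into [1, 2)   */
            }
        }
        return y;
    }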

+ 20 - 20
CMSIS/DSP/Source/FastMathFunctions/arm_vlog_q31.c

@@ -30,9 +30,9 @@
 
 #define LOG_Q31_ACCURACY 31
 
-/* Bit to represent the normalization factor 
+/* Bit to represent the normalization factor
    It is Ceiling[Log2[LOG_Q31_ACCURACY]] of the previous value.
-   The Log2 algorithm is assuming that the value x is 
+   The Log2 algorithm is assuming that the value x is
    1 <= x < 2.
 
    But input value could be as small a 2^-LOG_Q31_ACCURACY
@@ -55,7 +55,7 @@
 static uint32_t arm_scalar_log_q31(uint32_t src)
 {
    int32_t i;
-   
+
    int32_t c = __CLZ(src);
    int32_t normalization=0;
 
@@ -84,7 +84,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
    }
    normalization = c;
 
-   /* Compute the Log2. Result is in q26 
+   /* Compute the Log2. Result is in q26
       because we know 0 <= y < 1.0 but
       do not want to use q32 to allow
       following computation with less instructions.
@@ -101,10 +101,10 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
       inc = inc >> 1;
    }
 
-   /* 
+   /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-      
+
    */
 
    /* q26 */
@@ -114,7 +114,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
    /* q5.26 */
    y = ((int64_t)tmp * LOG_Q31_INVLOG2EXP) >> 31;
 
-  
+
 
    return(y);
 
@@ -125,7 +125,7 @@ static uint32_t arm_scalar_log_q31(uint32_t src)
 
 q31x4_t vlogq_q31(q31x4_t src)
 {
-    
+
    int32_t i;
 
    int32x4_t c = vclzq_s32(src);
@@ -141,11 +141,11 @@ q31x4_t vlogq_q31(q31x4_t src)
 
    /* q11 */
    uint32x4_t y = vdupq_n_u32(0);
-   
+
 
    /* q11 */
    int32x4_t vtmp;
-  
+
 
    mve_pred16_t p;
 
@@ -153,10 +153,10 @@ q31x4_t vlogq_q31(q31x4_t src)
 
 
    vtmp = vsubq_n_s32(c,1);
-   x = vshlq_u32(src,vtmp);
+   x = vshlq_u32((uint32x4_t)src,vtmp);
 
 
-    /* Compute the Log2. Result is in Q26 
+    /* Compute the Log2. Result is in Q26
       because we know 0 <= y < 1.0 but
       do not want to use Q32 to allow
       following computation with less instructions.
@@ -166,7 +166,7 @@ q31x4_t vlogq_q31(q31x4_t src)
       x = vmulhq_u32(x,x);
       x = vshlq_n_u32(x,2);
 
-      
+
       p = vcmphiq_u32(x,vdupq_n_u32(LOQ_Q31_THRESHOLD));
       y = vaddq_m_n_u32(y, y,inc,p);
       x = vshrq_m_n_u32(x,x,1,p);
@@ -175,19 +175,19 @@ q31x4_t vlogq_q31(q31x4_t src)
    }
 
 
-   /* 
+   /*
       Convert the Log2 to Log and apply normalization.
       We compute (y - normalisation) * (1 / Log2[e]).
-      
+
    */
 
    /* q11 */
    // tmp = (int16_t)y - (normalization << (LOG_Q15_ACCURACY - LOG_Q15_INTEGER_PART));
    vtmp = vshlq_n_s32(normalization,LOG_Q31_ACCURACY - LOG_Q31_INTEGER_PART);
-   vtmp = vsubq_s32(y,vtmp);
+   vtmp = vsubq_s32((int32x4_t)y,vtmp);
+
+
 
-   
-  
    /* q4.11 */
    // y = ((int32_t)tmp * LOG_Q15_INVLOG2EXP) >> 15;
    vtmp = vqdmulhq_n_s32(vtmp,LOG_Q31_INVLOG2EXP);
@@ -242,7 +242,7 @@ void arm_vlog_q31(
   blkCnt = blockSize & 3;
   #else
   blkCnt = blockSize;
-  #endif 
+  #endif
 
   while (blkCnt > 0U)
   {
@@ -250,7 +250,7 @@ void arm_vlog_q31(
 
      blkCnt--;
   }
- 
+
 }
 
 /**
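
One non-obvious equivalence in the vector path above: the scalar code computes ((int64_t)tmp * LOG_Q31_INVLOG2EXP) >> 31, while the vector code uses vqdmulhq_n_s32, which returns the high half of the doubled product. For inputs that do not saturate, the two agree exactly, since (2*a*b) >> 32 == (a*b) >> 31. A scalar model of one lane (illustrative):

    #include <stdint.h>

    /* Model of one lane of vqdmulhq_n_s32: (2*a*b) >> 32, saturated.
       The only saturating case is INT32_MIN * INT32_MIN. */
    static int32_t qdmulh_s32(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN)
            return INT32_MAX;
        return (int32_t)(((int64_t)a * b) >> 31);
    }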

+ 27 - 27
CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c

@@ -96,7 +96,7 @@ arm_status arm_mat_ldlt_f32(
 {
 
   arm_status status;                             /* status of matrix inverse */
- 
+
 
 #ifdef ARM_MATH_MATRIX_CHECK
 
@@ -142,7 +142,7 @@ arm_status arm_mat_ldlt_f32(
     {
         /* Find pivot */
         float32_t m=F32_MIN,a;
-        int j=k; 
+        int j=k;
 
 
         for(int r=k;r<n;r++)
@@ -204,25 +204,25 @@ arm_status arm_mat_ldlt_f32(
              //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
 
 
-             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
              vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
 
-             
+
              vecA = vldrwq_z_f32(&pA[(w + 0)*n+x],p0);
              vecA = vfmsq_m(vecA, vecW0, vecX, p0);
-             vstrwq_p(&pA[(w + 0)*n+x], vecA, p0);  
+             vstrwq_p(&pA[(w + 0)*n+x], vecA, p0);
 
              vecA = vldrwq_z_f32(&pA[(w + 1)*n+x],p0);
              vecA = vfmsq_m(vecA, vecW1, vecX, p0);
-             vstrwq_p(&pA[(w + 1)*n+x], vecA, p0);  
+             vstrwq_p(&pA[(w + 1)*n+x], vecA, p0);
 
              vecA = vldrwq_z_f32(&pA[(w + 2)*n+x],p0);
              vecA = vfmsq_m(vecA, vecW2, vecX, p0);
-             vstrwq_p(&pA[(w + 2)*n+x], vecA, p0);  
+             vstrwq_p(&pA[(w + 2)*n+x], vecA, p0);
 
              vecA = vldrwq_z_f32(&pA[(w + 3)*n+x],p0);
              vecA = vfmsq_m(vecA, vecW3, vecX, p0);
-             vstrwq_p(&pA[(w + 3)*n+x], vecA, p0);  
+             vstrwq_p(&pA[(w + 3)*n+x], vecA, p0);
 
              cnt -= 4;
           }
@@ -246,13 +246,13 @@ arm_status arm_mat_ldlt_f32(
              //pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * (pA[x*n+k] * invA);
 
              vecA = vldrwq_z_f32(&pA[w*n+x],p0);
-             
-             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], vecOffs, p0);
+
+             vecX = vldrwq_gather_shifted_offset_z_f32(&pA[x*n+k], (uint32x4_t)vecOffs, p0);
              vecX = vmulq_m_n_f32(vuninitializedq_f32(),vecX,invA,p0);
 
              vecA = vfmsq_m(vecA, vecW, vecX, p0);
 
-             vstrwq_p(&pA[w*n+x], vecA, p0);  
+             vstrwq_p(&pA[w*n+x], vecA, p0);
 
              cnt -= 4;
           }
@@ -263,7 +263,7 @@ arm_status arm_mat_ldlt_f32(
                pA[w*n+k] = pA[w*n+k] * invA;
         }
 
-        
+
 
     }
 
@@ -275,15 +275,15 @@ arm_status arm_mat_ldlt_f32(
       diag--;
       for(int row=0; row < n;row++)
       {
-        mve_pred16_t p0; 
+        mve_pred16_t p0;
         int cnt= n-k;
         f32x4_t zero=vdupq_n_f32(0.0f);
 
         for(int col=k; col < n;col += 4)
         {
            p0 = vctp32q(cnt);
-         
-           vstrwq_p(&pl->pData[row*n+col], zero, p0);  
+
+           vstrwq_p(&pl->pData[row*n+col], zero, p0);
 
            cnt -= 4;
         }
@@ -292,15 +292,15 @@ arm_status arm_mat_ldlt_f32(
 
     for(int row=0; row < n;row++)
     {
-       mve_pred16_t p0; 
+       mve_pred16_t p0;
        int cnt= n-row-1;
        f32x4_t zero=vdupq_n_f32(0.0f);
-       
+
        for(int col=row+1; col < n;col+=4)
        {
          p0 = vctp32q(cnt);
-         
-         vstrwq_p(&pl->pData[row*n+col], zero, p0);  
+
+         vstrwq_p(&pl->pData[row*n+col], zero, p0);
 
          cnt -= 4;
        }
@@ -311,12 +311,12 @@ arm_status arm_mat_ldlt_f32(
       pd->pData[d*n+d] = pl->pData[d*n+d];
       pl->pData[d*n+d] = 1.0;
     }
-  
+
     status = ARM_MATH_SUCCESS;
 
   }
 
-  
+
   /* Return to application */
   return (status);
 }
@@ -350,7 +350,7 @@ arm_status arm_mat_ldlt_f32(
   @addtogroup MatrixChol
   @{
  */
-  
+
 /**
    * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
    * @param[in]  pSrc   points to the instance of the input floating-point matrix structure.
@@ -373,7 +373,7 @@ arm_status arm_mat_ldlt_f32(
 {
 
   arm_status status;                             /* status of matrix inverse */
- 
+
 
 #ifdef ARM_MATH_MATRIX_CHECK
 
@@ -410,7 +410,7 @@ arm_status arm_mat_ldlt_f32(
     {
         /* Find pivot */
         float32_t m=F32_MIN,a;
-        int j=k; 
+        int j=k;
 
 
         int r;
@@ -457,7 +457,7 @@ arm_status arm_mat_ldlt_f32(
                pA[w*n+k] = pA[w*n+k] / a;
         }
 
-        
+
 
     }
 
@@ -491,12 +491,12 @@ arm_status arm_mat_ldlt_f32(
       pd->pData[d*n+d] = pl->pData[d*n+d];
       pl->pData[d*n+d] = 1.0;
     }
-  
+
     status = ARM_MATH_SUCCESS;
 
   }
 
-  
+
   /* Return to application */
   return (status);
 }
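
The zeroing loops in the MVE path above rely on tail predication: vctp32q(cnt) yields an all-true lane mask while cnt >= 4 and masks off the trailing lanes on the final pass, so the predicated stores handle row lengths that are not multiples of four without a scalar tail loop. A minimal sketch of the pattern (zero_f32 is an illustrative name):

    #include <arm_mve.h>

    /* Zero 'count' floats using predicated stores only. */
    static void zero_f32(float *p, int count)
    {
        float32x4_t zero = vdupq_n_f32(0.0f);
        while (count > 0) {
            mve_pred16_t p0 = vctp32q((uint32_t)count); /* mask the tail */
            vstrwq_p_f32(p, zero, p0);
            p += 4;
            count -= 4;
        }
    }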

+ 4 - 4
CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c

@@ -65,16 +65,16 @@ void arm_q15_to_float(
 
   q15x8_t vecDst;
   q15_t const *pSrcVec;
-  
+
   pSrcVec = (q15_t const *) pSrc;
   blkCnt = blockSize >> 2;
   while (blkCnt > 0U)
   {
       /* C = (float32_t) A / 32768 */
       /* convert from q15 to float and then store the results in the destination buffer */
-      vecDst = vldrhq_s32(pSrcVec); 
+      vecDst = vldrhq_s32(pSrcVec);
       pSrcVec += 4;
-      vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 15));  
+      vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 15));
       pDst += 4;
       /*
        * Decrement the blockSize loop counter
@@ -129,7 +129,7 @@ void arm_q15_to_float(
     outV = vcvtq_n_f32_s32(inV1,15);
     vst1q_f32(pDst, outV);
     pDst += 4;
-  
+
     /* Decrement the loop counter */
     blkCnt--;
   }
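
The cast here reconciles the declared type with the load: vldrhq_s32 widens four q15 values into an int32x4_t, while vecDst is declared q15x8_t, and vcvtq_n_f32_s32 then wants the int32x4_t view back. The conversion itself performs the Q15-to-float mapping from the C reference in one instruction: convert to float with 15 fractional bits. A scalar model of one lane (illustrative):

    #include <stdint.h>

    /* One lane of vcvtq_n_f32_s32(v, 15): Q15 integer -> float. */
    static float q15_lane_to_float(int32_t lane)
    {
        return (float)lane / 32768.0f;   /* lane * 2^-15 */
    }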

+ 1 - 1
CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c

@@ -72,7 +72,7 @@ void arm_q7_to_float(
         /* convert from q7 to float and then store the results in the destination buffer */
         vecDst = vldrbq_s32(pSrcVec);    
         pSrcVec += 4;
-        vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 7));   
+        vstrwq(pDst, vcvtq_n_f32_s32((int32x4_t)vecDst, 7));   
         pDst += 4;
         /*
          * Decrement the blockSize loop counter

+ 8 - 8
CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c

@@ -184,16 +184,16 @@ static void _arm_radix4_butterfly_q15_mve(
         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
 
         vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
 
         vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
 
         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
 
         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
 
         blkCnt--;
     }
@@ -419,16 +419,16 @@ static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S
         vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
 
         vecTmp0 = vhaddq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (int32x4_t) vecTmp0);
 
         vecTmp0 = vhsubq(vecSum0, vecSum1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (int32x4_t) vecTmp0);
 
         vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (int32x4_t) vecTmp0);
 
         vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
-        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (int32x4_t) vecTmp0);
 
         blkCnt--;
     }
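
The casts in this file are again bitwise: vstrwq_scatter_base_s32 is prototyped to store an int32x4_t, and each q15 complex pair (real, imaginary) already occupies one 32-bit word, so the q15x8_t payload is reinterpreted rather than converted. A sketch of the reinterpretation (illustrative name):

    #include <arm_mve.h>

    /* Four q15 complex pairs viewed as four 32-bit words; no data change. */
    int32x4_t view_pairs_as_s32(int16x8_t cplx)
    {
        return (int32x4_t)cplx;          /* or vreinterpretq_s32_s16(cplx) */
    }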