6 anni fa · 4e4b9ca8bb
--- a/ComputeLibrary/Include/NEMath.h
+++ b/ComputeLibrary/Include/NEMath.h
@@ -69,11 +69,11 @@ static inline float32x4_t vinvq_f32(float32x4_t x);
 
				 /** Perform a 7th degree polynomial approximation using Estrin's method.
			
 
				  *
			
 
				  * @param[in] x      Input vector value in F32 format.
			
 
				- * @param[in] coeffs Polynomial coefficients table.
			
 
				+ * @param[in] coeffs Polynomial coefficients table. (array of flattened float32x4_t vectors)
			
 
				  *
			
 
				  * @return The calculated approximation.
			
 
				  */
			
 
				-static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32x4_t *coeffs);
			
 
				+static inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs);
			
 
				 
			
 
				 /** Calculate exponential
			
 
				  *
			
@@ -180,21 +180,21 @@ static inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
 
				 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
			
 
				 
			
 
				 /** Exponent polynomial coefficients */
			
 
				-extern const float32x4_t exp_tab[8];
			
 
				+extern const float32_t exp_tab[4*8];
			
 
				 
			
 
				 
			
 
				 /** Logarithm polynomial coefficients */
			
 
				-extern const float32x4_t log_tab[8];
			
 
				+extern const float32_t log_tab[4*8];
			
 
				 
			
 
				 #ifndef DOXYGEN_SKIP_THIS
			
 
				 inline float32x4_t vfloorq_f32(float32x4_t val)
			
 
				 {
			
 
				-    static const float32x4_t CONST_1 = {1.f,1.f,1.f,1.f};
			
 
				+    static const float32_t CONST_1[4] = {1.f,1.f,1.f,1.f};
			
 
				 
			
 
				     const int32x4_t   z = vcvtq_s32_f32(val);
			
 
				     const float32x4_t r = vcvtq_f32_s32(z);
			
 
				 
			
 
				-    return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r);
			
 
				+    return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, vld1q_f32(CONST_1)), r);
			
 
				 }
			
 
				 
			
 
				 inline float32x2_t vinvsqrt_f32(float32x2_t x)
			
@@ -231,12 +231,12 @@ inline float32x4_t vinvq_f32(float32x4_t x)
 
				     return recip;
			
 
				 }
			
 
				 
			
 
				-inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32x4_t *coeffs)
			
 
				+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32_t *coeffs)
			
 
				 {
			
 
				-    float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
			
 
				-    float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
			
 
				-    float32x4_t C   = vmlaq_f32(coeffs[1], coeffs[5], x);
			
 
				-    float32x4_t D   = vmlaq_f32(coeffs[3], coeffs[7], x);
			
 
				+    float32x4_t A   = vmlaq_f32(vld1q_f32(&coeffs[4*0]), vld1q_f32(&coeffs[4*4]), x);
			
 
				+    float32x4_t B   = vmlaq_f32(vld1q_f32(&coeffs[4*2]), vld1q_f32(&coeffs[4*6]), x);
			
 
				+    float32x4_t C   = vmlaq_f32(vld1q_f32(&coeffs[4*1]), vld1q_f32(&coeffs[4*5]), x);
			
 
				+    float32x4_t D   = vmlaq_f32(vld1q_f32(&coeffs[4*3]), vld1q_f32(&coeffs[4*7]), x);
			
 
				     float32x4_t x2  = vmulq_f32(x, x);
			
 
				     float32x4_t x4  = vmulq_f32(x2, x2);
			
 
				     float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
			
@@ -245,54 +245,54 @@ inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const float32x4_t *coeffs)
 
				 
			
 
				 inline float32x4_t vexpq_f32(float32x4_t x)
			
 
				 {
			
 
				-    static const float32x4_t CONST_LN2          = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
			
 
				-    static const float32x4_t CONST_INV_LN2      = {1.4426950408f,1.4426950408f,1.4426950408f,1.4426950408f}; // 1/ln(2)
			
 
				-    static const float32x4_t CONST_0            = {0.f,0.f,0.f,0.f};
			
 
				-    static const int32x4_t   CONST_NEGATIVE_126 = {-126,-126,-126,-126};
			
 
				+    static const float32_t CONST_LN2[4]          = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
			
 
				+    static const float32_t CONST_INV_LN2[4]      = {1.4426950408f,1.4426950408f,1.4426950408f,1.4426950408f}; // 1/ln(2)
			
 
				+    static const float32_t CONST_0[4]            = {0.f,0.f,0.f,0.f};
			
 
				+    static const int32_t   CONST_NEGATIVE_126[4] = {-126,-126,-126,-126};
			
 
				 
			
 
				     // Perform range reduction [-log(2),log(2)]
			
 
				-    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
			
 
				-    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
			
 
				+    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, vld1q_f32(CONST_INV_LN2)));
			
 
				+    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2));
			
 
				 
			
 
				     // Polynomial Approximation
			
 
				     float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
			
 
				 
			
 
				     // Reconstruct
			
 
				     poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
			
 
				-    poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_126), CONST_0, poly);
			
 
				+    poly = vbslq_f32(vcltq_s32(m, vld1q_s32(CONST_NEGATIVE_126)), vld1q_f32(CONST_0), poly);
			
 
				 
			
 
				     return poly;
			
 
				 }
			
 
				 
			
 
				 inline float32x4_t vlogq_f32(float32x4_t x)
			
 
				 {
			
 
				-    static const int32x4_t   CONST_127 = {127,127,127,127};           // 127
			
 
				-    static const float32x4_t CONST_LN2 = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
			
 
				+    static const int32_t   CONST_127[4] = {127,127,127,127};           // 127
			
 
				+    static const float32_t CONST_LN2[4] = {0.6931471805f,0.6931471805f,0.6931471805f,0.6931471805f}; // ln(2)
			
 
				 
			
 
				     // Extract exponent
			
 
				-    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
			
 
				+    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), vld1q_s32(CONST_127));
			
 
				     float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
			
 
				 
			
 
				     // Polynomial Approximation
			
 
				     float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
			
 
				 
			
 
				     // Reconstruct
			
 
				-    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
			
 
				+    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), vld1q_f32(CONST_LN2));
			
 
				 
			
 
				     return poly;
			
 
				 }
			
 
				 
			
 
				 inline float32x4_t vtanhq_f32(float32x4_t val)
			
 
				 {
			
 
				-    static const float32x4_t CONST_1        = {1.f,1.f,1.f,1.f};
			
 
				-    static const float32x4_t CONST_2        = {2.f,2.f,2.f,2.f};
			
 
				-    static const float32x4_t CONST_MIN_TANH = {-10.f,-10.f,-10.f,-10.f};
			
 
				-    static const float32x4_t CONST_MAX_TANH = {10.f,10.f,10.f,10.f};
			
 
				-
			
 
				-    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
			
 
				-    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
			
 
				-    float32x4_t num   = vsubq_f32(exp2x, CONST_1);
			
 
				-    float32x4_t den   = vaddq_f32(exp2x, CONST_1);
			
 
				+    static const float32_t CONST_1[4]        = {1.f,1.f,1.f,1.f};
			
 
				+    static const float32_t CONST_2[4]        = {2.f,2.f,2.f,2.f};
			
 
				+    static const float32_t CONST_MIN_TANH[4] = {-10.f,-10.f,-10.f,-10.f};
			
 
				+    static const float32_t CONST_MAX_TANH[4] = {10.f,10.f,10.f,10.f};
			
 
				+
			
 
				+    float32x4_t x     = vminq_f32(vmaxq_f32(val, vld1q_f32(CONST_MIN_TANH)), vld1q_f32(CONST_MAX_TANH));
			
 
				+    float32x4_t exp2x = vexpq_f32(vmulq_f32(vld1q_f32(CONST_2), x));
			
 
				+    float32x4_t num   = vsubq_f32(exp2x, vld1q_f32(CONST_1));
			
 
				+    float32x4_t den   = vaddq_f32(exp2x, vld1q_f32(CONST_1));
			
 
				     float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
			
 
				     return tanh;
			
 
				 }
			
@@ -309,12 +309,12 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
 
				 #ifndef DOXYGEN_SKIP_THIS
			
 
				 inline float16x8_t vfloorq_f16(float16x8_t val)
			
 
				 {
			
 
				-    static const float16x8_t CONST_1 = {1.f,1.f,1.f,1.f};
			
 
				+    static const float16_t CONST_1[8] = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
			
 
				 
			
 
				     const int16x8_t   z = vcvtq_s16_f16(val);
			
 
				     const float16x8_t r = vcvtq_f16_s16(z);
			
 
				 
			
 
				-    return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, CONST_1), r);
			
 
				+    return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, vld1q_f16(CONST_1)), r);
			
 
				 }
			
 
				 inline float16x4_t vinvsqrt_f16(float16x4_t x)
			
 
				 {
			
@@ -350,15 +350,15 @@ inline float16x8_t vinvq_f16(float16x8_t x)
 
				 
			
 
				 inline float16x8_t vtanhq_f16(float16x8_t val)
			
 
				 {
			
 
				-    const float16x8_t CONST_1        = {1.f,1.f,1.f,1.f};
			
 
				-    const float16x8_t CONST_2        = {2.f,2.f,2.f,2.f};
			
 
				-    const float16x8_t CONST_MIN_TANH = {-10.f,-10.f,-10.f,-10.f};
			
 
				-    const float16x8_t CONST_MAX_TANH = {10.f,10.f,10.f,10.f};
			
 
				-
			
 
				-    const float16x8_t x     = vminq_f16(vmaxq_f16(val, CONST_MIN_TANH), CONST_MAX_TANH);
			
 
				-    const float16x8_t exp2x = vexpq_f16(vmulq_f16(CONST_2, x));
			
 
				-    const float16x8_t num   = vsubq_f16(exp2x, CONST_1);
			
 
				-    const float16x8_t den   = vaddq_f16(exp2x, CONST_1);
			
 
				+    const float16_t CONST_1[8]        = {1.f,1.f,1.f,1.f,1.f,1.f,1.f,1.f};
			
 
				+    const float16_t CONST_2[8]        = {2.f,2.f,2.f,2.f,2.f,2.f,2.f,2.f};
			
 
				+    const float16_t CONST_MIN_TANH[8] = {-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f,-10.f};
			
 
				+    const float16_t CONST_MAX_TANH[8] = {10.f,10.f,10.f,10.f,10.f,10.f,10.f,10.f};
			
 
				+
			
 
				+    const float16x8_t x     = vminq_f16(vmaxq_f16(val, vld1q_f16(CONST_MIN_TANH)), vld1q_f16(CONST_MAX_TANH));
			
 
				+    const float16x8_t exp2x = vexpq_f16(vmulq_f16(vld1q_f16(CONST_2), x));
			
 
				+    const float16x8_t num   = vsubq_f16(exp2x, vld1q_f16(CONST_1));
			
 
				+    const float16x8_t den   = vaddq_f16(exp2x, vld1q_f16(CONST_1));
			
 
				     const float16x8_t tanh  = vmulq_f16(num, vinvq_f16(den));
			
 
				     return tanh;
			
 
				 }
			
@@ -411,4 +411,4 @@ inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
 
				 #endif /* DOXYGEN_SKIP_THIS */
			
 
				 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
			
 
				 #endif
			
 
				-#endif /* __ARM_COMPUTE_NEMATH_H__ */
			
 
				+#endif /* __ARM_COMPUTE_NEMATH_H__ */
			
--- a/ComputeLibrary/Source/arm_cl_tables.c
+++ b/ComputeLibrary/Source/arm_cl_tables.c
@@ -27,29 +27,29 @@
 
				 #if defined(ARM_MATH_NEON)
			
 
				 
			
 
				 /** Exponent polynomial coefficients */
			
 
				-const float32x4_t exp_tab[8] =
			
 
				+const float32_t exp_tab[4*8] =
			
 
				 {
			
 
				-        {1.f,1.f,1.f,1.f},
			
 
				-        {0.0416598916054f,0.0416598916054f,0.0416598916054f,0.0416598916054f},
			
 
				-        {0.500000596046f,0.500000596046f,0.500000596046f,0.500000596046f},
			
 
				-        {0.0014122662833f,0.0014122662833f,0.0014122662833f,0.0014122662833f},
			
 
				-        {1.00000011921f,1.00000011921f,1.00000011921f,1.00000011921f},
			
 
				-        {0.00833693705499f,0.00833693705499f,0.00833693705499f,0.00833693705499f},
			
 
				-        {0.166665703058f,0.166665703058f,0.166665703058f,0.166665703058f},
			
 
				-        {0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f}
			
 
				+        1.f,1.f,1.f,1.f,
			
 
				+        0.0416598916054f,0.0416598916054f,0.0416598916054f,0.0416598916054f,
			
 
				+        0.500000596046f,0.500000596046f,0.500000596046f,0.500000596046f,
			
 
				+        0.0014122662833f,0.0014122662833f,0.0014122662833f,0.0014122662833f,
			
 
				+        1.00000011921f,1.00000011921f,1.00000011921f,1.00000011921f,
			
 
				+        0.00833693705499f,0.00833693705499f,0.00833693705499f,0.00833693705499f,
			
 
				+        0.166665703058f,0.166665703058f,0.166665703058f,0.166665703058f,
			
 
				+        0.000195780929062f,0.000195780929062f,0.000195780929062f,0.000195780929062f
			
 
				 };
			
 
				 
			
 
				 /** Logarithm polynomial coefficients */
			
 
				-const float32x4_t log_tab[8] =
			
 
				+const float32_t log_tab[4*8] =
			
 
				 {
			
 
				-        {-2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f},
			
 
				-        {-2.47071170807f,-2.47071170807f,-2.47071170807f,-2.47071170807f},
			
 
				-        {-5.68692588806f,-5.68692588806f,-5.68692588806f,-5.68692588806f},
			
 
				-        {-0.165253549814f,-0.165253549814f,-0.165253549814f,-0.165253549814f},
			
 
				-        {5.17591238022f,5.17591238022f,5.17591238022f,5.17591238022f},
			
 
				-        {0.844007015228f,0.844007015228f,0.844007015228f,0.844007015228f},
			
 
				-        {4.58445882797f,4.58445882797f,4.58445882797f,4.58445882797f},
			
 
				-        {0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f},
			
 
				+        -2.29561495781f,-2.29561495781f,-2.29561495781f,-2.29561495781f,
			
 
				+        -2.47071170807f,-2.47071170807f,-2.47071170807f,-2.47071170807f,
			
 
				+        -5.68692588806f,-5.68692588806f,-5.68692588806f,-5.68692588806f,
			
 
				+        -0.165253549814f,-0.165253549814f,-0.165253549814f,-0.165253549814f,
			
 
				+        5.17591238022f,5.17591238022f,5.17591238022f,5.17591238022f,
			
 
				+        0.844007015228f,0.844007015228f,0.844007015228f,0.844007015228f,
			
 
				+        4.58445882797f,4.58445882797f,4.58445882797f,4.58445882797f,
			
 
				+        0.0141278216615f,0.0141278216615f,0.0141278216615f,0.0141278216615f
			
 
				 };
			
 
				 
			
 
				-#endif
			
 
				+#endif
			
--- a/Include/arm_math.h
+++ b/Include/arm_math.h
@@ -93,8 +93,10 @@
 
				    * Toolchain Support
			
 
				    * ------------
			
 
				    *
			
 
				-   * The library has been developed and tested with MDK version 5.14.0.0
			
 
				-   * The library is being tested in GCC and IAR toolchains and updates on this activity will be made available shortly.
			
 
				+   * The library is now tested on Fast Models building with cmake.
			
 
				+   * Core M0, M7, A5 are tested.
			
 
				+   * 
			
 
				+   * 
			
 
				    *
			
 
				    * Building the Library
			
 
				    * ------------
			
@@ -140,6 +142,23 @@
 
				    * of some DSP functions. Experimental Neon versions currently do not have better
			
 
				    * performances than the scalar versions.
			
 
				    *
			
 
				+   * - ARM_MATH_HELIUM:
			
 
				+   *
			
 
				+   * It implies the flags ARM_MATH_MVEF and ARM_MATH_MVEI and ARM_MATH_FLOAT16.
			
 
				+   *
			
 
				+   * - ARM_MATH_MVEF:
			
 
				+   *
			
 
				+   * Select Helium versions of the f32 algorithms.
			
 
				+   * It implies ARM_MATH_FLOAT16 and ARM_MATH_MVEI.
			
 
				+   *
			
 
				+   * - ARM_MATH_MVEI:
			
 
				+   *
			
 
				+   * Select Helium versions of the int and fixed point algorithms.
			
 
				+   *
			
 
				+   * - ARM_MATH_FLOAT16:
			
 
				+   *
			
 
				+   * Float16 implementations of some algorithms (Requires MVE extension).
			
 
				+   *
			
 
				    * <hr>
			
 
				    * CMSIS-DSP in ARM::CMSIS Pack
			
 
				    * -----------------------------
			
--- a/Platforms/FVP/ARMCA5/LinkScripts/GCC/mem_ARMCA5.h
+++ b/Platforms/FVP/ARMCA5/LinkScripts/GCC/mem_ARMCA5.h
@@ -44,7 +44,7 @@
 
				 // </h>
			
 
				  *----------------------------------------------------------------------------*/
			
 
				 #define __ROM_BASE       0x80000000
			
 
				-#define __ROM_SIZE       0x00200000
			
 
				+#define __ROM_SIZE       0x00400000
			
 
				 
			
 
				 /*--------------------- RAM Configuration -----------------------------------
			
 
				 // <h> RAM Configuration
			
@@ -68,7 +68,7 @@
 
				 //   </h>
			
 
				 // </h>
			
 
				  *----------------------------------------------------------------------------*/
			
 
				-#define __RAM_BASE       0x80200000
			
 
				+#define __RAM_BASE       0x80400000
			
 
				 #define __RAM_SIZE       0x00300000
			
 
				 
			
 
				 #define __RW_DATA_SIZE   0x00100000
			
@@ -94,7 +94,7 @@
 
				 //   <o1> TTB Size (in Bytes) <0x0-0xFFFFFFFF:8>
			
 
				 // </h>
			
 
				  *----------------------------------------------------------------------------*/
			
 
				-#define __TTB_BASE       0x80500000
			
 
				+#define __TTB_BASE       0x80800000
			
 
				 #define __TTB_SIZE       0x00005000
			
 
				 
			
 
				 #endif /* __MEM_ARMCA5_H */
			
--- a/Platforms/FVP/ARMCA5/Startup/AC5/startup_ARMCA5.c
+++ b/Platforms/FVP/ARMCA5/Startup/AC5/startup_ARMCA5.c
@@ -41,8 +41,8 @@
 
				 /*----------------------------------------------------------------------------
			
 
				   Internal References
			
 
				  *----------------------------------------------------------------------------*/
			
 
				-void Vectors       (void) __attribute__ ((naked, section("RESET")));
			
 
				-void Reset_Handler (void) __attribute__ ((naked));
			
 
				+void Vectors       (void) __attribute__ ((section("RESET")));
			
 
				+void Reset_Handler (void);
			
 
				 
			
 
				 /*----------------------------------------------------------------------------
			
 
				   Exception / Interrupt Handler
			
@@ -59,14 +59,14 @@ void FIQ_Handler   (void) __attribute__ ((weak, alias("Default_Handler")));
 
				  *----------------------------------------------------------------------------*/
			
 
				 void Vectors(void) {
			
 
				   __ASM volatile(
			
 
				-  "LDR    PC, =Reset_Handler                        \n"
			
 
				-  "LDR    PC, =Undef_Handler                        \n"
			
 
				-  "LDR    PC, =SVC_Handler                          \n"
			
 
				-  "LDR    PC, =PAbt_Handler                         \n"
			
 
				-  "LDR    PC, =DAbt_Handler                         \n"
			
 
				+  "LDR    __current_pc, =Reset_Handler                        \n"
			
 
				+  "LDR    __current_pc, =Undef_Handler                        \n"
			
 
				+  "LDR    __current_pc, =SVC_Handler                          \n"
			
 
				+  "LDR    __current_pc, =PAbt_Handler                         \n"
			
 
				+  "LDR    __current_pc, =DAbt_Handler                         \n"
			
 
				   "NOP                                              \n"
			
 
				-  "LDR    PC, =IRQ_Handler                          \n"
			
 
				-  "LDR    PC, =FIQ_Handler                          \n"
			
 
				+  "LDR    __current_pc, =IRQ_Handler                          \n"
			
 
				+  "LDR    __current_pc, =FIQ_Handler                          \n"
			
 
				   );
			
 
				 }
			
 
				 
			
--- a/Platforms/FVP/ARMCM0/LinkScripts/AC5/lnk.sct
+++ b/Platforms/FVP/ARMCM0/LinkScripts/AC5/lnk.sct
@@ -14,7 +14,7 @@
 
				 ; </h>
			
 
				  *----------------------------------------------------------------------------*/
			
 
				 #define __ROM_BASE     0x00000000
			
 
				-#define __ROM_SIZE     0x00200000
			
 
				+#define __ROM_SIZE     0x00300000
			
 
				 
			
 
				 /*--------------------- Embedded RAM Configuration ---------------------------
			
 
				 ; <h> RAM Configuration
			
--- a/Platforms/FVP/ARMCM0/LinkScripts/GCC/lnk.ld
+++ b/Platforms/FVP/ARMCM0/LinkScripts/GCC/lnk.ld
@@ -0,0 +1,296 @@
 
				+/******************************************************************************
			
 
				+ * @file     gcc_arm.ld
			
 
				+ * @brief    GNU Linker Script for Cortex-M based device
			
 
				+ * @version  V2.0.0
			
 
				+ * @date     21. May 2019
			
 
				+ ******************************************************************************/
			
 
				+/*
			
 
				+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
			
 
				+ *
			
 
				+ * SPDX-License-Identifier: Apache-2.0
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the License); you may
			
 
				+ * not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ * www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
			
 
				+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+#include "mem_ARMCM0.h" 
			
 
				+/*
			
 
				+ *-------- <<< Use Configuration Wizard in Context Menu >>> -------------------
			
 
				+ */
			
 
				+
			
 
				+/*---------------------- Flash Configuration ----------------------------------
			
 
				+  <h> Flash Configuration
			
 
				+    <o0> Flash Base Address <0x0-0xFFFFFFFF:8>
			
 
				+    <o1> Flash Size (in Bytes) <0x0-0xFFFFFFFF:8>
			
 
				+  </h>
			
 
				+  -----------------------------------------------------------------------------*/
			
 
				+__ROM_BASE = 0x00000000;
			
 
				+__ROM_SIZE = 0x00400000;
			
 
				+
			
 
				+/*--------------------- Embedded RAM Configuration ----------------------------
			
 
				+  <h> RAM Configuration
			
 
				+    <o0> RAM Base Address    <0x0-0xFFFFFFFF:8>
			
 
				+    <o1> RAM Size (in Bytes) <0x0-0xFFFFFFFF:8>
			
 
				+  </h>
			
 
				+ -----------------------------------------------------------------------------*/
			
 
				+__RAM_BASE = 0x20000000;
			
 
				+__RAM_SIZE = 0x00300000;
			
 
				+
			
 
				+/*--------------------- Stack / Heap Configuration ----------------------------
			
 
				+  <h> Stack / Heap Configuration
			
 
				+    <o0> Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
			
 
				+    <o1> Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
			
 
				+  </h>
			
 
				+  -----------------------------------------------------------------------------*/
			
 
				+__STACK_SIZE = STACK_SIZE;
			
 
				+__HEAP_SIZE  = HEAP_SIZE;
			
 
				+
			
 
				+/*
			
 
				+ *-------------------- <<< end of configuration section >>> -------------------
			
 
				+ */
			
 
				+
			
 
				+MEMORY
			
 
				+{
			
 
				+  FLASH (rx)  : ORIGIN = __ROM_BASE, LENGTH = __ROM_SIZE
			
 
				+  RAM   (rwx) : ORIGIN = __RAM_BASE, LENGTH = __RAM_SIZE
			
 
				+}
			
 
				+
			
 
				+/* Linker script to place sections and symbol values. Should be used together
			
 
				+ * with other linker script that defines memory regions FLASH and RAM.
			
 
				+ * It references following symbols, which must be defined in code:
			
 
				+ *   Reset_Handler : Entry of reset handler
			
 
				+ *
			
 
				+ * It defines following symbols, which code can use without definition:
			
 
				+ *   __exidx_start
			
 
				+ *   __exidx_end
			
 
				+ *   __copy_table_start__
			
 
				+ *   __copy_table_end__
			
 
				+ *   __zero_table_start__
			
 
				+ *   __zero_table_end__
			
 
				+ *   __etext
			
 
				+ *   __data_start__
			
 
				+ *   __preinit_array_start
			
 
				+ *   __preinit_array_end
			
 
				+ *   __init_array_start
			
 
				+ *   __init_array_end
			
 
				+ *   __fini_array_start
			
 
				+ *   __fini_array_end
			
 
				+ *   __data_end__
			
 
				+ *   __bss_start__
			
 
				+ *   __bss_end__
			
 
				+ *   __end__
			
 
				+ *   end
			
 
				+ *   __HeapLimit
			
 
				+ *   __StackLimit
			
 
				+ *   __StackTop
			
 
				+ *   __stack
			
 
				+ */
			
 
				+ENTRY(Reset_Handler)
			
 
				+
			
 
				+SECTIONS
			
 
				+{
			
 
				+  .text :
			
 
				+  {
			
 
				+    KEEP(*(.vectors))
			
 
				+    *(.text*)
			
 
				+
			
 
				+    KEEP(*(.init))
			
 
				+    KEEP(*(.fini))
			
 
				+
			
 
				+    /* .ctors */
			
 
				+    *crtbegin.o(.ctors)
			
 
				+    *crtbegin?.o(.ctors)
			
 
				+    *(EXCLUDE_FILE(*crtend?.o *crtend.o) .ctors)
			
 
				+    *(SORT(.ctors.*))
			
 
				+    *(.ctors)
			
 
				+
			
 
				+    /* .dtors */
			
 
				+    *crtbegin.o(.dtors)
			
 
				+    *crtbegin?.o(.dtors)
			
 
				+    *(EXCLUDE_FILE(*crtend?.o *crtend.o) .dtors)
			
 
				+    *(SORT(.dtors.*))
			
 
				+    *(.dtors)
			
 
				+
			
 
				+    *(.rodata*)
			
 
				+
			
 
				+    KEEP(*(.eh_frame*))
			
 
				+  } > FLASH
			
 
				+
			
 
				+  /*
			
 
				+   * SG veneers:
			
 
				+   * All SG veneers are placed in the special output section .gnu.sgstubs. Its start address
			
 
				+   * must be set, either with the command line option ‘--section-start’ or in a linker script,
			
 
				+   * to indicate where to place these veneers in memory.
			
 
				+   */
			
 
				+/*
			
 
				+  .gnu.sgstubs :
			
 
				+  {
			
 
				+    . = ALIGN(32);
			
 
				+  } > FLASH
			
 
				+*/
			
 
				+  .ARM.extab :
			
 
				+  {
			
 
				+    *(.ARM.extab* .gnu.linkonce.armextab.*)
			
 
				+  } > FLASH
			
 
				+
			
 
				+  __exidx_start = .;
			
 
				+  .ARM.exidx :
			
 
				+  {
			
 
				+    *(.ARM.exidx* .gnu.linkonce.armexidx.*)
			
 
				+  } > FLASH
			
 
				+  __exidx_end = .;
			
 
				+
			
 
				+  .copy.table :
			
 
				+  {
			
 
				+    . = ALIGN(4);
			
 
				+    __copy_table_start__ = .;
			
 
				+    LONG (__etext)
			
 
				+    LONG (__data_start__)
			
 
				+    LONG (__data_end__ - __data_start__)
			
 
				+    /* Add each additional data section here */
			
 
				+/*
			
 
				+    LONG (__etext2)
			
 
				+    LONG (__data2_start__)
			
 
				+    LONG (__data2_end__ - __data2_start__)
			
 
				+*/
			
 
				+    __copy_table_end__ = .;
			
 
				+  } > FLASH
			
 
				+
			
 
				+  .zero.table :
			
 
				+  {
			
 
				+    . = ALIGN(4);
			
 
				+    __zero_table_start__ = .;
			
 
				+    /* Add each additional bss section here */
			
 
				+/*
			
 
				+    LONG (__bss2_start__)
			
 
				+    LONG (__bss2_end__ - __bss2_start__)
			
 
				+*/
			
 
				+    __zero_table_end__ = .;
			
 
				+  } > FLASH
			
 
				+
			
 
				+  /**
			
 
				+   * Location counter can end up 2byte aligned with narrow Thumb code but
			
 
				+   * __etext is assumed by startup code to be the LMA of a section in RAM
			
 
				+   * which must be 4byte aligned 
			
 
				+   */
			
 
				+  __etext = ALIGN (4);
			
 
				+
			
 
				+  .data : AT (__etext)
			
 
				+  {
			
 
				+    __data_start__ = .;
			
 
				+    *(vtable)
			
 
				+    *(.data)
			
 
				+    *(.data.*)
			
 
				+
			
 
				+    . = ALIGN(4);
			
 
				+    /* preinit data */
			
 
				+    PROVIDE_HIDDEN (__preinit_array_start = .);
			
 
				+    KEEP(*(.preinit_array))
			
 
				+    PROVIDE_HIDDEN (__preinit_array_end = .);
			
 
				+
			
 
				+    . = ALIGN(4);
			
 
				+    /* init data */
			
 
				+    PROVIDE_HIDDEN (__init_array_start = .);
			
 
				+    KEEP(*(SORT(.init_array.*)))
			
 
				+    KEEP(*(.init_array))
			
 
				+    PROVIDE_HIDDEN (__init_array_end = .);
			
 
				+
			
 
				+
			
 
				+    . = ALIGN(4);
			
 
				+    /* finit data */
			
 
				+    PROVIDE_HIDDEN (__fini_array_start = .);
			
 
				+    KEEP(*(SORT(.fini_array.*)))
			
 
				+    KEEP(*(.fini_array))
			
 
				+    PROVIDE_HIDDEN (__fini_array_end = .);
			
 
				+
			
 
				+    KEEP(*(.jcr*))
			
 
				+    . = ALIGN(4);
			
 
				+    /* All data end */
			
 
				+    __data_end__ = .;
			
 
				+
			
 
				+  } > RAM
			
 
				+
			
 
				+  /*
			
 
				+   * Secondary data section, optional
			
 
				+   *
			
 
				+   * Remember to add each additional data section
			
 
				+   * to the .copy.table above to asure proper
			
 
				+   * initialization during startup.
			
 
				+   */
			
 
				+/*
			
 
				+  __etext2 = ALIGN (4);
			
 
				+
			
 
				+  .data2 : AT (__etext2)
			
 
				+  {
			
 
				+    . = ALIGN(4);
			
 
				+    __data2_start__ = .;
			
 
				+    *(.data2)
			
 
				+    *(.data2.*)
			
 
				+    . = ALIGN(4);
			
 
				+    __data2_end__ = .;
			
 
				+
			
 
				+  } > RAM2
			
 
				+*/
			
 
				+
			
 
				+  .bss :
			
 
				+  {
			
 
				+    . = ALIGN(4);
			
 
				+    __bss_start__ = .;
			
 
				+    *(.bss)
			
 
				+    *(.bss.*)
			
 
				+    *(COMMON)
			
 
				+    . = ALIGN(4);
			
 
				+    __bss_end__ = .;
			
 
				+  } > RAM AT > RAM
			
 
				+
			
 
				+  /*
			
 
				+   * Secondary bss section, optional
			
 
				+   *
			
 
				+   * Remember to add each additional bss section
			
 
				+   * to the .zero.table above to asure proper
			
 
				+   * initialization during startup.
			
 
				+   */
			
 
				+/*
			
 
				+  .bss2 :
			
 
				+  {
			
 
				+    . = ALIGN(4);
			
 
				+    __bss2_start__ = .;
			
 
				+    *(.bss2)
			
 
				+    *(.bss2.*)
			
 
				+    . = ALIGN(4);
			
 
				+    __bss2_end__ = .;
			
 
				+  } > RAM2 AT > RAM2
			
 
				+*/
			
 
				+
			
 
				+  .heap (COPY) :
			
 
				+  {
			
 
				+    . = ALIGN(8);
			
 
				+    __end__ = .;
			
 
				+    __HeapBase = .;
			
 
				+    PROVIDE(end = .);
			
 
				+    . = . + __HEAP_SIZE;
			
 
				+    . = ALIGN(8);
			
 
				+    __HeapLimit = .;
			
 
				+  } > RAM
			
 
				+
			
 
				+  .stack (ORIGIN(RAM) + LENGTH(RAM) - __STACK_SIZE) (COPY) :
			
 
				+  {
			
 
				+    . = ALIGN(8);
			
 
				+    __StackLimit = .;
			
 
				+    . = . + __STACK_SIZE;
			
 
				+    . = ALIGN(8);
			
 
				+    __StackTop = .;
			
 
				+  } > RAM
			
 
				+  PROVIDE(__stack = __StackTop);
			
 
				+
			
 
				+  /* Check if data + heap + stack exceeds RAM limit */
			
 
				+  ASSERT(__StackLimit >= __HeapLimit, "region RAM overflowed with stack")
			
 
				+}
			
--- a/Platforms/FVP/ARMCM0/LinkScripts/GCC/mem_ARMCM0.h
+++ b/Platforms/FVP/ARMCM0/LinkScripts/GCC/mem_ARMCM0.h
@@ -0,0 +1,38 @@
 
				+/**************************************************************************//**
			
 
				+ * @file     mem_ARMCM0.h
			
 
				+ * @brief    Memory base and size definitions (used in scatter file)
			
 
				+ * @version  V1.1.0
			
 
				+ * @date     15. May 2019
			
 
				+ *
			
 
				+ * @note
			
 
				+ *
			
 
				+ ******************************************************************************/
			
 
				+/*
			
 
				+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
			
 
				+ *
			
 
				+ * SPDX-License-Identifier: Apache-2.0
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the License); you may
			
 
				+ * not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ * www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
			
 
				+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __MEM_ARMCM0_H
			
 
				+#define __MEM_ARMCM0_H
			
 
				+
			
 
				+
			
 
				+
			
 
				+#define STACK_SIZE     0x00003000
			
 
				+#define HEAP_SIZE      0x00100000
			
 
				+
			
 
				+
			
 
				+
			
 
				+#endif /* __MEM_ARMCM7_H */
			
--- a/Platforms/FVP/ARMCM0/Startup/GCC/startup_ARMCM0.S
+++ b/Platforms/FVP/ARMCM0/Startup/GCC/startup_ARMCM0.S
@@ -0,0 +1,179 @@
 
				+/**************************************************************************//**
			
 
				+ * @file     startup_ARMCM0.S
			
 
				+ * @brief    CMSIS-Core(M) Device Startup File for Cortex-M0 Device
			
 
				+ * @version  V2.0.1
			
 
				+ * @date     23. July 2019
			
 
				+ ******************************************************************************/
			
 
				+/*
			
 
				+ * Copyright (c) 2009-2019 Arm Limited. All rights reserved.
			
 
				+ *
			
 
				+ * SPDX-License-Identifier: Apache-2.0
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the License); you may
			
 
				+ * not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ * www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
			
 
				+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+                .syntax  unified
			
 
				+                .arch    armv6-m
			
 
				+
			
 
				+                .section .vectors
			
 
				+                .align   2
			
 
				+                .globl   __Vectors
			
 
				+                .globl   __Vectors_End
			
 
				+                .globl   __Vectors_Size
			
 
				+__Vectors:
			
 
				+                .long    __StackTop                         /*     Top of Stack */
			
 
				+                .long    Reset_Handler                      /*     Reset Handler */
			
 
				+                .long    NMI_Handler                        /* -14 NMI Handler */
			
 
				+                .long    HardFault_Handler                  /* -13 Hard Fault Handler */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    SVC_Handler                        /*  -5 SVCall Handler */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    0                                  /*     Reserved */
			
 
				+                .long    PendSV_Handler                     /*  -2 PendSV Handler */
			
 
				+                .long    SysTick_Handler                    /*  -1 SysTick Handler */
			
 
				+
			
 
				+                /* Interrupts */
			
 
				+                .long    Interrupt0_Handler                 /*   0 Interrupt 0 */
			
 
				+                .long    Interrupt1_Handler                 /*   1 Interrupt 1 */
			
 
				+                .long    Interrupt2_Handler                 /*   2 Interrupt 2 */
			
 
				+                .long    Interrupt3_Handler                 /*   3 Interrupt 3 */
			
 
				+                .long    Interrupt4_Handler                 /*   4 Interrupt 4 */
			
 
				+                .long    Interrupt5_Handler                 /*   5 Interrupt 5 */
			
 
				+                .long    Interrupt6_Handler                 /*   6 Interrupt 6 */
			
 
				+                .long    Interrupt7_Handler                 /*   7 Interrupt 7 */
			
 
				+                .long    Interrupt8_Handler                 /*   8 Interrupt 8 */
			
 
				+                .long    Interrupt9_Handler                 /*   9 Interrupt 9 */
			
 
				+
			
 
				+                .space   ( 22 * 4)                          /* Interrupts 10 .. 31 are left out */
			
 
				+__Vectors_End:
			
 
				+                .equ     __Vectors_Size, __Vectors_End - __Vectors
			
 
				+                .size    __Vectors, . - __Vectors
			
 
				+
			
 
				+
			
 
				+                .thumb
			
 
				+                .section .text
			
 
				+                .align   2
			
 
				+
			
 
				+                .thumb_func
			
 
				+                .type    Reset_Handler, %function
			
 
				+                .globl   Reset_Handler
			
 
				+                .fnstart
			
 
				+Reset_Handler:
			
 
				+                bl       SystemInit
			
 
				+
			
 
				+                ldr      r4, =__copy_table_start__
			
 
				+                ldr      r5, =__copy_table_end__
			
 
				+
			
 
				+.L_loop0:
			
 
				+                cmp      r4, r5
			
 
				+                bge      .L_loop0_done
			
 
				+                ldr      r1, [r4]
			
 
				+                ldr      r2, [r4, #4]
			
 
				+                ldr      r3, [r4, #8]
			
 
				+
			
 
				+.L_loop0_0:
			
 
				+                subs     r3, #4
			
 
				+                blt      .L_loop0_0_done
			
 
				+                ldr      r0, [r1, r3]
			
 
				+                str      r0, [r2, r3]
			
 
				+                b        .L_loop0_0
			
 
				+
			
 
				+.L_loop0_0_done:
			
 
				+                adds     r4, #12
			
 
				+                b        .L_loop0
			
 
				+
			
 
				+.L_loop0_done:
			
 
				+
			
 
				+                ldr      r3, =__zero_table_start__
			
 
				+                ldr      r4, =__zero_table_end__
			
 
				+
			
 
				+.L_loop2:
			
 
				+                cmp      r3, r4
			
 
				+                bge      .L_loop2_done
			
 
				+                ldr      r1, [r3]
			
 
				+                ldr      r2, [r3, #4]
			
 
				+                movs     r0, 0
			
 
				+
			
 
				+.L_loop2_0:
			
 
				+                subs     r2, #4
			
 
				+                blt      .L_loop2_0_done
			
 
				+                str      r0, [r1, r2]
			
 
				+                b        .L_loop2_0
			
 
				+.L_loop2_0_done:
			
 
				+
			
 
				+                adds     r3, #8
			
 
				+                b        .L_loop2
			
 
				+.L_loop2_done:
			
 
				+
			
 
				+                bl       _start
			
 
				+
			
 
				+                .fnend
			
 
				+                .size    Reset_Handler, . - Reset_Handler
			
 
				+
			
 
				+/* The default macro is not used for HardFault_Handler
			
 
				+ * because this results in a poor debug illusion.
			
 
				+ */
			
 
				+                .thumb_func
			
 
				+                .type    HardFault_Handler, %function
			
 
				+                .weak    HardFault_Handler
			
 
				+                .fnstart
			
 
				+HardFault_Handler:
			
 
				+                b        .
			
 
				+                .fnend
			
 
				+                .size    HardFault_Handler, . - HardFault_Handler
			
 
				+
			
 
				+                .thumb_func
			
 
				+                .type    Default_Handler, %function
			
 
				+                .weak    Default_Handler
			
 
				+                .fnstart
			
 
				+Default_Handler:
			
 
				+                b        .
			
 
				+                .fnend
			
 
				+                .size    Default_Handler, . - Default_Handler
			
 
				+
			
 
				+/* Macro to define default exception/interrupt handlers.
			
 
				+ * Default handler are weak symbols with an endless loop.
			
 
				+ * They can be overwritten by real handlers.
			
 
				+ */
			
 
				+                .macro   Set_Default_Handler  Handler_Name
			
 
				+                .weak    \Handler_Name
			
 
				+                .set     \Handler_Name, Default_Handler
			
 
				+                .endm
			
 
				+
			
 
				+
			
 
				+/* Default exception/interrupt handler */
			
 
				+
			
 
				+                Set_Default_Handler  NMI_Handler
			
 
				+                Set_Default_Handler  SVC_Handler
			
 
				+                Set_Default_Handler  PendSV_Handler
			
 
				+                Set_Default_Handler  SysTick_Handler
			
 
				+
			
 
				+                Set_Default_Handler  Interrupt0_Handler
			
 
				+                Set_Default_Handler  Interrupt1_Handler
			
 
				+                Set_Default_Handler  Interrupt2_Handler
			
 
				+                Set_Default_Handler  Interrupt3_Handler
			
 
				+                Set_Default_Handler  Interrupt4_Handler
			
 
				+                Set_Default_Handler  Interrupt5_Handler
			
 
				+                Set_Default_Handler  Interrupt6_Handler
			
 
				+                Set_Default_Handler  Interrupt7_Handler
			
 
				+                Set_Default_Handler  Interrupt8_Handler
			
 
				+                Set_Default_Handler  Interrupt9_Handler
			
 
				+
			
 
				+
			
 
				+                .end
			
--- a/Platforms/FVP/ARMCM0/Startup/GCC/support.c
+++ b/Platforms/FVP/ARMCM0/Startup/GCC/support.c
@@ -0,0 +1,36 @@
 
				+
			
 
				+#ifdef   __cplusplus
			
 
				+extern "C"
			
 
				+{
			
 
				+#endif
			
 
				+    
			
 
				+char * _sbrk(int incr);
			
 
				+
			
 
				+void __malloc_lock() ;
			
 
				+void __malloc_unlock();
			
 
				+
			
 
				+char __HeapBase, __HeapLimit;  // make sure to define these symbols in linker command file
			
 
				+#ifdef   __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+static int totalBytesProvidedBySBRK = 0;
			
 
				+
			
 
				+//! sbrk/_sbrk version supporting reentrant newlib (depends upon above symbols defined by linker control file).
			
 
				+char * sbrk(int incr) {
			
 
				+    static char *currentHeapEnd = &__HeapBase;
			
 
				+    char *previousHeapEnd = currentHeapEnd;
			
 
				+    if (currentHeapEnd + incr > &__HeapLimit) {
			
 
				+        return (char *)-1; // the malloc-family routine that called sbrk will return 0
			
 
				+    }
			
 
				+    currentHeapEnd += incr;
			
 
				+    
			
 
				+    totalBytesProvidedBySBRK += incr;
			
 
				+    
			
 
				+    return (char *) previousHeapEnd;
			
 
				+}
			
 
				+//! Synonym for sbrk.
			
 
				+char * _sbrk(int incr) { return sbrk(incr); };
			
 
				+
			
 
				+void __malloc_lock()     {       };
			
 
				+void __malloc_unlock()   {  };
			
--- a/Platforms/FVP/ARMCM7/LinkScripts/AC5/lnk.sct
+++ b/Platforms/FVP/ARMCM7/LinkScripts/AC5/lnk.sct
@@ -14,7 +14,7 @@
 
				 ; </h>
			
 
				  *----------------------------------------------------------------------------*/
			
 
				 #define __ROM_BASE     0x00000000
			
 
				-#define __ROM_SIZE     0x00200000
			
 
				+#define __ROM_SIZE     0x00300000
			
 
				 
			
 
				 /*--------------------- Embedded RAM Configuration ---------------------------
			
 
				 ; <h> RAM Configuration
			
--- a/Platforms/FVP/ARMCM7/LinkScripts/GCC/lnk.ld
+++ b/Platforms/FVP/ARMCM7/LinkScripts/GCC/lnk.ld
@@ -33,7 +33,7 @@
 
				   </h>
			
 
				   -----------------------------------------------------------------------------*/
			
 
				 __ROM_BASE = 0x00000000;
			
 
				-__ROM_SIZE = 0x00300000;
			
 
				+__ROM_SIZE = 0x00400000;
			
 
				 
			
 
				 /*--------------------- Embedded RAM Configuration ----------------------------
			
 
				   <h> RAM Configuration
			
--- a/Source/BasicMathFunctions/arm_dot_prod_f32.c
+++ b/Source/BasicMathFunctions/arm_dot_prod_f32.c
@@ -131,7 +131,8 @@ void arm_dot_prod_f32(
 
				 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
			
 
				     f32x4_t vec1;
			
 
				     f32x4_t vec2;
			
 
				-    f32x4_t accum = vdupq_n_f32(0);    
			
 
				+    f32x4_t accum = vdupq_n_f32(0);   
			
 
				+    f32x2_t tmp = vdup_n_f32(0);    
			
 
				 
			
 
				     /* Compute 4 outputs at a time */
			
 
				     blkCnt = blockSize >> 2U;
			
@@ -160,7 +161,9 @@ void arm_dot_prod_f32(
 
				 #if __aarch64__
			
 
				     sum = vpadds_f32(vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)));
			
 
				 #else
			
 
				-    sum = (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[0] + (vpadd_f32(vget_low_f32(accum), vget_high_f32(accum)))[1];
			
 
				+    tmp = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
			
 
				+    sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
			
 
				+
			
 
				 #endif    
			
 
				 
			
 
				     /* Tail */
			
--- a/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
+++ b/Source/BayesFunctions/arm_gaussian_naive_bayes_predict_f32.c
@@ -228,10 +228,10 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
 
				            vecBlkCnt--;
			
 
				         }
			
 
				         tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
			
 
				-        tmp += tmpV2[0] + tmpV2[1];
			
 
				-
			
 
				+        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
			
 
				+         
			
 
				         tmpV2 = vpadd_f32(vget_low_f32(tmpV1),vget_high_f32(tmpV1));
			
 
				-        tmp1 += tmpV2[0] + tmpV2[1];
			
 
				+        tmp1 += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
			
 
				 
			
 
				         vecBlkCnt = S->vectorDimension & 3;
			
 
				         while(vecBlkCnt > 0)
			
@@ -300,7 +300,7 @@ uint32_t arm_gaussian_naive_bayes_predict_f32(const arm_gaussian_naive_bayes_ins
 
				            vecBlkCnt--;
			
 
				         }
			
 
				         tmpV2 = vpadd_f32(vget_low_f32(tmpV),vget_high_f32(tmpV));
			
 
				-        tmp += tmpV2[0] + tmpV2[1];
			
 
				+        tmp += vget_lane_f32(tmpV2, 0) + vget_lane_f32(tmpV2, 1);
			
 
				 
			
 
				         vecBlkCnt = S->vectorDimension & 3;
			
 
				         while(vecBlkCnt > 0)
			
--- a/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
+++ b/Source/ComplexMathFunctions/arm_cmplx_dot_prod_f32.c
@@ -169,7 +169,7 @@ void arm_cmplx_dot_prod_f32(
 
				 	/* C = (A[0]+jA[1])*(B[0]+jB[1]) + ...  */
			
 
				         /* Calculate dot product and then store the result in a temporary buffer. */
			
 
				 
			
 
				-	vec1 = vld2q_f32(pSrcA);
			
 
				+	      vec1 = vld2q_f32(pSrcA);
			
 
				         vec2 = vld2q_f32(pSrcB);
			
 
				 
			
 
				 	/* Increment pointers */
			
@@ -204,10 +204,10 @@ void arm_cmplx_dot_prod_f32(
 
				     }
			
 
				 
			
 
				     accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR));
			
 
				-    real_sum += accum[0] + accum[1];
			
 
				+    real_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				     accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI));
			
 
				-    imag_sum += accum[0] + accum[1];
			
 
				+    imag_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				     /* Tail */
			
 
				     blkCnt = numSamples & 0x7;
			
--- a/Source/DistanceFunctions/arm_boolean_distance_template.h
+++ b/Source/DistanceFunctions/arm_boolean_distance_template.h
@@ -351,16 +351,16 @@ void FUNC(EXT)(const uint32_t *pA
 
				     }
			
 
				 
			
 
				 #ifdef TT
			
 
				-    _ctt += tmp4tt[0] + tmp4tt[1];
			
 
				+    _ctt += vgetq_lane_u64(tmp4tt, 0) + vgetq_lane_u64(tmp4tt, 1);
			
 
				 #endif
			
 
				 #ifdef FF
			
 
				-    _cff += tmp4ff[0] + tmp4ff[1];
			
 
				+    _cff +=vgetq_lane_u64(tmp4ff, 0) + vgetq_lane_u64(tmp4ff, 1);
			
 
				 #endif
			
 
				 #ifdef TF
			
 
				-    _ctf += tmp4tf[0] + tmp4tf[1];
			
 
				+    _ctf += vgetq_lane_u64(tmp4tf, 0) + vgetq_lane_u64(tmp4tf, 1);
			
 
				 #endif
			
 
				 #ifdef FT
			
 
				-    _cft += tmp4ft[0] + tmp4ft[1];
			
 
				+    _cft += vgetq_lane_u64(tmp4ft, 0) + vgetq_lane_u64(tmp4ft, 1);
			
 
				 #endif
			
 
				 
			
 
				     nbBoolBlock = numberOfBools & 0x7F;
			
--- a/Source/DistanceFunctions/arm_braycurtis_distance_f32.c
+++ b/Source/DistanceFunctions/arm_braycurtis_distance_f32.c
@@ -144,10 +144,10 @@ float32_t arm_braycurtis_distance_f32(const float32_t *pA,const float32_t *pB, u
 
				         blkCnt --;
			
 
				    }
			
 
				    accumV2 = vpadd_f32(vget_low_f32(accumDiffV),vget_high_f32(accumDiffV));
			
 
				-   accumDiff = accumV2[0] + accumV2[1];
			
 
				+   accumDiff = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
			
 
				 
			
 
				    accumV2 = vpadd_f32(vget_low_f32(accumSumV),vget_high_f32(accumSumV));
			
 
				-   accumSum = accumV2[0] + accumV2[1];
			
 
				+   accumSum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
			
 
				 
			
 
				    blkCnt = blockSize & 3;
			
 
				    while(blkCnt > 0)
			
--- a/Source/DistanceFunctions/arm_canberra_distance_f32.c
+++ b/Source/DistanceFunctions/arm_canberra_distance_f32.c
@@ -135,7 +135,7 @@ float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uin
 
				    uint32_t blkCnt;
			
 
				    float32x4_t a,b,c,accumV;
			
 
				    float32x2_t accumV2;
			
 
				-   int32x4_t   isZeroV;
			
 
				+   uint32x4_t   isZeroV;
			
 
				    float32x4_t zeroV = vdupq_n_f32(0.0f);
			
 
				 
			
 
				    accumV = vdupq_n_f32(0.0f);
			
@@ -162,7 +162,7 @@ float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uin
 
				          * Force result of a division by 0 to 0. It the behavior of the
			
 
				          * sklearn canberra function.
			
 
				          */
			
 
				-        a = vbicq_s32(a,isZeroV);
			
 
				+        a = vreinterpretq_f32_s32(vbicq_s32(vreinterpretq_s32_f32(a),vreinterpretq_s32_u32(isZeroV)));
			
 
				         c = vmulq_f32(c,a);
			
 
				         accumV = vaddq_f32(accumV,c);
			
 
				 
			
@@ -171,7 +171,7 @@ float32_t arm_canberra_distance_f32(const float32_t *pA,const float32_t *pB, uin
 
				         blkCnt --;
			
 
				    }
			
 
				    accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
			
 
				-   accum = accumV2[0] + accumV2[1];
			
 
				+   accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
			
 
				 
			
 
				 
			
 
				    blkCnt = blockSize & 3;
			
--- a/Source/DistanceFunctions/arm_chebyshev_distance_f32.c
+++ b/Source/DistanceFunctions/arm_chebyshev_distance_f32.c
@@ -158,7 +158,7 @@ float32_t arm_chebyshev_distance_f32(const float32_t *pA,const float32_t *pB, ui
 
				       }
			
 
				       maxValV2 = vpmax_f32(vget_low_f32(maxValV),vget_high_f32(maxValV));
			
 
				       maxValV2 = vpmax_f32(maxValV2,maxValV2);
			
 
				-      maxVal = maxValV2[0];
			
 
				+      maxVal = vget_lane_f32(maxValV2,0);
			
 
				 
			
 
				   
			
 
				       blkCnt = blockSize & 3;
			
--- a/Source/DistanceFunctions/arm_cityblock_distance_f32.c
+++ b/Source/DistanceFunctions/arm_cityblock_distance_f32.c
@@ -115,7 +115,7 @@ float32_t arm_cityblock_distance_f32(const float32_t *pA,const float32_t *pB, ui
 
				    }
			
 
				    accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
			
 
				    accumV2 = vpadd_f32(accumV2,accumV2);
			
 
				-   accum = accumV2[0];
			
 
				+   accum = vget_lane_f32(accumV2,0);
			
 
				    
			
 
				 
			
 
				    blkCnt = blockSize & 3;
			
--- a/Source/DistanceFunctions/arm_euclidean_distance_f32.c
+++ b/Source/DistanceFunctions/arm_euclidean_distance_f32.c
@@ -114,7 +114,7 @@ float32_t arm_euclidean_distance_f32(const float32_t *pA,const float32_t *pB, ui
 
				         blkCnt --;
			
 
				    }
			
 
				    accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
			
 
				-   accum = accumV2[0] + accumV2[1];
			
 
				+   accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
			
 
				 
			
 
				    blkCnt = blockSize & 3;
			
 
				    while(blkCnt > 0)
			
--- a/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c
+++ b/Source/DistanceFunctions/arm_jensenshannon_distance_f32.c
@@ -168,7 +168,7 @@ float32_t arm_jensenshannon_distance_f32(const float32_t *pA,const float32_t *pB
 
				     }
			
 
				 
			
 
				     accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
			
 
				-    accum = accumV2[0] + accumV2[1];
			
 
				+    accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
			
 
				 
			
 
				     blkCnt = blockSize & 3;
			
 
				     while(blkCnt > 0)
			
--- a/Source/DistanceFunctions/arm_minkowski_distance_f32.c
+++ b/Source/DistanceFunctions/arm_minkowski_distance_f32.c
@@ -126,7 +126,7 @@ float32_t arm_minkowski_distance_f32(const float32_t *pA,const float32_t *pB, in
 
				     }
			
 
				 
			
 
				     sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
			
 
				-    sum = sumV2[0] + sumV2[1];
			
 
				+    sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
			
 
				 
			
 
				     blkCnt = blockSize & 3;
			
 
				     while(blkCnt > 0)
			
--- a/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
@@ -370,19 +370,20 @@ void arm_biquad_cascade_df1_f32(
 
				   while (stage > 0U)
			
 
				   {
			
 
				     /* Reading the coefficients */
			
 
				-    Xn = vdupq_n_f32(0.0);
			
 
				-    Xn[2] = pState[0];
			
 
				-    Xn[3] = pState[1];
			
 
				-    Yn[0] = pState[2];
			
 
				-    Yn[1] = pState[3];
			
 
				+    Xn = vdupq_n_f32(0.0f);
			
 
				 
			
 
				+    Xn = vsetq_lane_f32(pState[0],Xn,2);
			
 
				+    Xn = vsetq_lane_f32(pState[1],Xn,3);
			
 
				+    Yn = vset_lane_f32(pState[2],Yn,0);
			
 
				+    Yn = vset_lane_f32(pState[3],Yn,1);
			
 
				+  
			
 
				     b = vld1q_f32(pCoeffs);
			
 
				     b = vrev64q_f32(b);  
			
 
				     b = vcombine_f32(vget_high_f32(b), vget_low_f32(b));
			
 
				 
			
 
				     a = vld1_f32(pCoeffs + 3);
			
 
				     a = vrev64_f32(a);
			
 
				-    b[0] = 0.0;
			
 
				+    b = vsetq_lane_f32(0.0f,b,0);
			
 
				     pCoeffs += 5;
			
 
				     
			
 
				     /* Reading the pState values */
			
@@ -460,7 +461,11 @@ void arm_biquad_cascade_df1_f32(
 
				       Xns = *pIn++;
			
 
				 
			
 
				       /* acc =  b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
			
 
				-      acc =  (b[1] * Xn[2]) + (b[2] * Xn[3]) + (b[3] * Xns) + (a[0] * Yn[0]) + (a[1] * Yn[1]);
			
 
				+      acc =  (vgetq_lane_f32(b, 1) * vgetq_lane_f32(Xn, 2)) 
			
 
				+      + (vgetq_lane_f32(b, 2) * vgetq_lane_f32(Xn, 3)) 
			
 
				+      + (vgetq_lane_f32(b, 3) * Xns) 
			
 
				+      + (vget_lane_f32(a, 0) * vget_lane_f32(Yn, 0)) 
			
 
				+      + (vget_lane_f32(a, 1) * vget_lane_f32(Yn, 1));
			
 
				 
			
 
				       /* Store the result in the accumulator in the destination buffer. */
			
 
				       *pOut++ = acc;
			
@@ -471,10 +476,10 @@ void arm_biquad_cascade_df1_f32(
 
				       /* Xn1 = Xn    */
			
 
				       /* Yn2 = Yn1   */
			
 
				       /* Yn1 = acc   */
			
 
				-      Xn[2] = Xn[3];
			
 
				-      Xn[3] = Xns;
			
 
				-      Yn[0] = Yn[1];
			
 
				-      Yn[1] = acc;
			
 
				+      Xn = vsetq_lane_f32(vgetq_lane_f32(Xn, 3),Xn,2);
			
 
				+      Xn = vsetq_lane_f32(Xns,Xn,3);
			
 
				+      Yn = vset_lane_f32(vget_lane_f32(Yn, 1),Yn,0);
			
 
				+      Yn = vset_lane_f32(acc,Yn,1);
			
 
				 
			
 
				       /* Decrement the loop counter */
			
 
				       sample--;
			
--- a/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
+++ b/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
@@ -268,7 +268,7 @@ void arm_biquad_cascade_df2T_f32(
 
				          dV.val[1] = vmulq_f32(s, b2V);
			
 
				          dV.val[1] = vmlaq_f32(dV.val[1], YnV, a2V);
			
 
				 
			
 
				-         *pOut++ = YnV[3];
			
 
				+         *pOut++ = vgetq_lane_f32(YnV, 3) ;
			
 
				 
			
 
				          sample--;
			
 
				       }
			
--- a/Source/FilteringFunctions/arm_conv_f32.c
+++ b/Source/FilteringFunctions/arm_conv_f32.c
@@ -263,7 +263,8 @@ void arm_conv_f32(
 
				         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
			
 
				         uint32_t j, k, count, blkCnt;                  /* Loop counters */
			
 
				 
			
 
				-#if defined (ARM_MATH_LOOPUNROLL)
			
 
				+
			
 
				+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
			
 
				         float32_t acc0, acc1, acc2, acc3, c0;              /* Accumulators */
			
 
				 #if !defined(ARM_MATH_NEON)
			
 
				         float32_t x0, x1, x2, x3;                  /* Temporary variables to hold state and coefficient values */
			
--- a/Source/FilteringFunctions/arm_conv_partial_f32.c
+++ b/Source/FilteringFunctions/arm_conv_partial_f32.c
@@ -142,7 +142,7 @@ arm_status arm_conv_partial_f32(
 
				     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
			
 
				     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
			
 
				     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
			
 
				-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
			
 
				+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0;
			
 
				     blockSize2 = ((int32_t) check - blockSize3) - (blockSize1 + (int32_t) firstIndex);
			
 
				     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
			
 
				 
			
--- a/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
+++ b/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
@@ -118,7 +118,7 @@ arm_status arm_conv_partial_fast_q31(
 
				     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
			
 
				     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
			
 
				     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
			
 
				-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
			
 
				+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  numPoints) : 0;
			
 
				     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
			
 
				     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
			
 
				 
			
--- a/Source/FilteringFunctions/arm_conv_partial_q15.c
+++ b/Source/FilteringFunctions/arm_conv_partial_q15.c
@@ -119,7 +119,7 @@ arm_status arm_conv_partial_q15(
 
				     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
			
 
				     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
			
 
				     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
			
 
				-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
			
 
				+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  numPoints) : 0;
			
 
				     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
			
 
				     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
			
 
				 
			
--- a/Source/FilteringFunctions/arm_conv_partial_q31.c
+++ b/Source/FilteringFunctions/arm_conv_partial_q31.c
@@ -121,7 +121,7 @@ arm_status arm_conv_partial_q31(
 
				     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
			
 
				     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
			
 
				     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
			
 
				-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
			
 
				+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 :  numPoints) : 0;
			
 
				     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
			
 
				     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
			
 
				 
			
--- a/Source/FilteringFunctions/arm_conv_partial_q7.c
+++ b/Source/FilteringFunctions/arm_conv_partial_q7.c
@@ -123,7 +123,7 @@ arm_status arm_conv_partial_q7(
 
				     blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
			
 
				     blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
			
 
				     blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
			
 
				-    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
			
 
				+    blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : numPoints) : 0;
			
 
				     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
			
 
				     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
			
 
				 
			
--- a/Source/FilteringFunctions/arm_correlate_f32.c
+++ b/Source/FilteringFunctions/arm_correlate_f32.c
@@ -320,7 +320,7 @@ void arm_correlate_f32(
 
				         uint32_t outBlockSize;                         /* Loop counter */
			
 
				         int32_t inc = 1;                               /* Destination address modifier */
			
 
				 
			
 
				-#if defined (ARM_MATH_LOOPUNROLL) 
			
 
				+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
			
 
				     float32_t acc0, acc1, acc2, acc3,c0;                    /* Accumulators */
			
 
				 #if !defined(ARM_MATH_NEON)
			
 
				     float32_t x0, x1, x2, x3;                        /* temporary variables for holding input and coefficient values */
			
--- a/Source/FilteringFunctions/arm_fir_decimate_f32.c
+++ b/Source/FilteringFunctions/arm_fir_decimate_f32.c
@@ -219,16 +219,16 @@ void arm_fir_decimate_f32(
 
				     }
			
 
				 
			
 
				     temp = vpadd_f32(vget_low_f32(acc0v),vget_high_f32(acc0v));
			
 
				-    accv[0] = temp[0] + temp[1];
			
 
				+    accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,0);
			
 
				 
			
 
				     temp = vpadd_f32(vget_low_f32(acc1v),vget_high_f32(acc1v));
			
 
				-    accv[1] = temp[0] + temp[1];
			
 
				+    accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,1);
			
 
				 
			
 
				     temp = vpadd_f32(vget_low_f32(acc2v),vget_high_f32(acc2v));
			
 
				-    accv[2] = temp[0] + temp[1];
			
 
				+    accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,2);
			
 
				 
			
 
				     temp = vpadd_f32(vget_low_f32(acc3v),vget_high_f32(acc3v));
			
 
				-    accv[3] = temp[0] + temp[1];
			
 
				+    accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,3);
			
 
				 
			
 
				     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
			
 
				     tapCnt = numTaps % 0x4U;
			
@@ -245,10 +245,10 @@ void arm_fir_decimate_f32(
 
				       x3 = *(px3++);
			
 
				 
			
 
				       /* Perform the multiply-accumulate */
			
 
				-      accv[0] += x0 * c0;
			
 
				-      accv[1] += x1 * c0;
			
 
				-      accv[2] += x2 * c0;
			
 
				-      accv[3] += x3 * c0;
			
 
				+      accv = vsetq_lane_f32(vgetq_lane_f32(accv, 0) + x0 * c0,accv,0);
			
 
				+      accv = vsetq_lane_f32(vgetq_lane_f32(accv, 1) + x1 * c0,accv,1);
			
 
				+      accv = vsetq_lane_f32(vgetq_lane_f32(accv, 2) + x2 * c0,accv,2);
			
 
				+      accv = vsetq_lane_f32(vgetq_lane_f32(accv, 3) + x3 * c0,accv,3);
			
 
				 
			
 
				       /* Decrement the loop counter */
			
 
				       tapCnt--;
			
@@ -306,7 +306,7 @@ void arm_fir_decimate_f32(
 
				     }
			
 
				 
			
 
				     temp = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
			
 
				-    sum0 = temp[0] + temp[1];
			
 
				+    sum0 = vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1);
			
 
				 
			
 
				     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
			
 
				     tapCnt = numTaps % 0x4U;
			
--- a/Source/FilteringFunctions/arm_fir_f32.c
+++ b/Source/FilteringFunctions/arm_fir_f32.c
@@ -767,26 +767,26 @@ uint32_t blockSize)
 
				          b = vld1q_f32(pb);
			
 
				          xa = x0;
			
 
				          xb = x1;
			
 
				-         accv0 = vmlaq_n_f32(accv0,xa,b[0]);
			
 
				-         accv1 = vmlaq_n_f32(accv1,xb,b[0]);
			
 
				+         accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 0));
			
 
				+         accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 0));
			
 
				 
			
 
				          xa = vextq_f32(x0,x1,1);
			
 
				          xb = vextq_f32(x1,x2,1);
			
 
				-         
			
 
				-	 accv0 = vmlaq_n_f32(accv0,xa,b[1]);
			
 
				-         accv1 = vmlaq_n_f32(accv1,xb,b[1]);
			
 
				+        
			
 
				+         accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1));
			
 
				+         accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1));
			
 
				 
			
 
				-	 xa = vextq_f32(x0,x1,2);
			
 
				+         xa = vextq_f32(x0,x1,2);
			
 
				          xb = vextq_f32(x1,x2,2);
			
 
				 
			
 
				-         accv0 = vmlaq_n_f32(accv0,xa,b[2]);
			
 
				-         accv1 = vmlaq_n_f32(accv1,xb,b[2]);
			
 
				+         accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 2));
			
 
				+         accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 2));
			
 
				 
			
 
				-	 xa = vextq_f32(x0,x1,3);
			
 
				-	 xb = vextq_f32(x1,x2,3);
			
 
				+         xa = vextq_f32(x0,x1,3);
			
 
				+         xb = vextq_f32(x1,x2,3);
			
 
				          
			
 
				- 	 accv0 = vmlaq_n_f32(accv0,xa,b[3]);
			
 
				-         accv1 = vmlaq_n_f32(accv1,xb,b[3]);
			
 
				+         accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3));
			
 
				+         accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3));
			
 
				 
			
 
				          pb += 4;
			
 
				          x0 = x1;
			
@@ -810,8 +810,8 @@ uint32_t blockSize)
 
				 
			
 
				            pb++;
			
 
				 
			
 
				-	   xa = vextq_f32(x0,x1,1);
			
 
				-	   xb = vextq_f32(x1,x2,1);
			
 
				+           xa = vextq_f32(x0,x1,1);
			
 
				+           xb = vextq_f32(x1,x2,1);
			
 
				 
			
 
				            accv0 = vmlaq_n_f32(accv0,xa,*pb);
			
 
				            accv1 = vmlaq_n_f32(accv1,xb,*pb);
			
@@ -821,7 +821,7 @@ uint32_t blockSize)
 
				            xa = vextq_f32(x0,x1,2);
			
 
				            xb = vextq_f32(x1,x2,2);
			
 
				            
			
 
				-	   accv0 = vmlaq_n_f32(accv0,xa,*pb);
			
 
				+           accv0 = vmlaq_n_f32(accv0,xa,*pb);
			
 
				            accv1 = vmlaq_n_f32(accv1,xb,*pb);
			
 
				 
			
 
				          }
			
@@ -836,7 +836,7 @@ uint32_t blockSize)
 
				            xa = vextq_f32(x0,x1,1);
			
 
				            xb = vextq_f32(x1,x2,1);
			
 
				            
			
 
				-	   accv0 = vmlaq_n_f32(accv0,xa,*pb);
			
 
				+           accv0 = vmlaq_n_f32(accv0,xa,*pb);
			
 
				            accv1 = vmlaq_n_f32(accv1,xb,*pb);
			
 
				 
			
 
				          }
			
--- a/Source/FilteringFunctions/arm_fir_interpolate_f32.c
+++ b/Source/FilteringFunctions/arm_fir_interpolate_f32.c
@@ -314,15 +314,15 @@ void arm_fir_interpolate_f32(
 
				       }
			
 
				 
			
 
				       /* The result is in the accumulator, store in the destination buffer. */
			
 
				-      *pDst = accV0[0];
			
 
				-      *(pDst + S->L) = accV0[1];
			
 
				-      *(pDst + 2 * S->L) = accV0[2];
			
 
				-      *(pDst + 3 * S->L) = accV0[3];
			
 
				+      *pDst = vgetq_lane_f32(accV0, 0);
			
 
				+      *(pDst + S->L) = vgetq_lane_f32(accV0, 1);
			
 
				+      *(pDst + 2 * S->L) = vgetq_lane_f32(accV0, 2);
			
 
				+      *(pDst + 3 * S->L) = vgetq_lane_f32(accV0, 3);
			
 
				 
			
 
				-      *(pDst + 4 * S->L) = accV1[0];
			
 
				-      *(pDst + 5 * S->L) = accV1[1];
			
 
				-      *(pDst + 6 * S->L) = accV1[2];
			
 
				-      *(pDst + 7 * S->L) = accV1[3];
			
 
				+      *(pDst + 4 * S->L) = vgetq_lane_f32(accV1, 0);
			
 
				+      *(pDst + 5 * S->L) = vgetq_lane_f32(accV1, 1);
			
 
				+      *(pDst + 6 * S->L) = vgetq_lane_f32(accV1, 2);
			
 
				+      *(pDst + 7 * S->L) = vgetq_lane_f32(accV1, 3);
			
 
				 
			
 
				       pDst++;
			
 
				 
			
@@ -375,7 +375,7 @@ void arm_fir_interpolate_f32(
 
				       while (tapCnt > 0U)
			
 
				       {
			
 
				         /* Read the coefficient */
			
 
				-        x1v[0] = *(ptr2);
			
 
				+        x1v = vsetq_lane_f32(*(ptr2),x1v,0);
			
 
				 
			
 
				         /* Upsampling is done by stuffing L-1 zeros between each sample.
			
 
				          * So instead of multiplying zeros with coefficients,
			
@@ -387,19 +387,19 @@ void arm_fir_interpolate_f32(
 
				         ptr1 += 4;
			
 
				 
			
 
				         /* Read the coefficient */
			
 
				-        x1v[1] = *(ptr2);
			
 
				+        x1v = vsetq_lane_f32(*(ptr2),x1v,1);
			
 
				 
			
 
				         /* Increment the coefficient pointer by interpolation factor times. */
			
 
				         ptr2 += S->L;
			
 
				 
			
 
				         /* Read the coefficient */
			
 
				-        x1v[2] = *(ptr2);
			
 
				+        x1v = vsetq_lane_f32(*(ptr2),x1v,2);
			
 
				 
			
 
				         /* Increment the coefficient pointer by interpolation factor times. */
			
 
				         ptr2 += S->L;
			
 
				 
			
 
				         /* Read the coefficient */
			
 
				-        x1v[3] = *(ptr2);
			
 
				+        x1v = vsetq_lane_f32(*(ptr2),x1v,3);
			
 
				 
			
 
				         /* Increment the coefficient pointer by interpolation factor times. */
			
 
				         ptr2 += S->L;
			
@@ -411,7 +411,7 @@ void arm_fir_interpolate_f32(
 
				       }
			
 
				 
			
 
				       tempV = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
			
 
				-      sum0 = tempV[0] + tempV[1];
			
 
				+      sum0 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
			
 
				 
			
 
				       /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
			
 
				       tapCnt = phaseLen % 0x4U;
			
--- a/Source/FilteringFunctions/arm_lms_f32.c
+++ b/Source/FilteringFunctions/arm_lms_f32.c
@@ -225,7 +225,7 @@ void arm_lms_f32(
 
				       tapCnt--;
			
 
				     }
			
 
				     tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
			
 
				-    sum = tempV2[0] + tempV2[1];
			
 
				+    sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);
			
 
				 
			
 
				 
			
 
				     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
			
--- a/Source/FilteringFunctions/arm_lms_norm_f32.c
+++ b/Source/FilteringFunctions/arm_lms_norm_f32.c
@@ -233,7 +233,7 @@ void arm_lms_norm_f32(
 
				       tapCnt--;
			
 
				     }
			
 
				     tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
			
 
				-    sum = tempV2[0] + tempV2[1];
			
 
				+    sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);
			
 
				 
			
 
				     /* If the filter length is not a multiple of 4, compute the remaining filter taps */
			
 
				     tapCnt = numTaps % 0x4U;
			
--- a/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
@@ -938,20 +938,21 @@ arm_status arm_mat_cmplx_mult_f32(
 
				           pIn1 += 8;
			
 
				           pIn1B += 8;
			
 
				 
			
 
				-          tempR[0] = *pIn2;
			
 
				-          tempI[0] = *(pIn2 + 1U);
			
 
				+          tempR = vsetq_lane_f32(*pIn2,tempR,0);
			
 
				+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,0);
			
 
				           pIn2 += 2 * numColsB;
			
 
				 
			
 
				-          tempR[1] = *pIn2;
			
 
				-          tempI[1] = *(pIn2 + 1U);
			
 
				+
			
 
				+          tempR = vsetq_lane_f32(*pIn2,tempR,1);
			
 
				+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,1);
			
 
				           pIn2 += 2 * numColsB;
			
 
				 
			
 
				-          tempR[2] = *pIn2;
			
 
				-          tempI[2] = *(pIn2 + 1U);
			
 
				+          tempR = vsetq_lane_f32(*pIn2,tempR,2);
			
 
				+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,2);
			
 
				           pIn2 += 2 * numColsB;
			
 
				 
			
 
				-          tempR[3] = *pIn2;
			
 
				-          tempI[3] = *(pIn2 + 1U);
			
 
				+          tempR = vsetq_lane_f32(*pIn2,tempR,3);
			
 
				+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,3);
			
 
				           pIn2 += 2 * numColsB;
			
 
				 
			
 
				           accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
			
@@ -971,16 +972,16 @@ arm_status arm_mat_cmplx_mult_f32(
 
				         }
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
			
 
				-        sumReal1 += accum[0] + accum[1];
			
 
				+        sumReal1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
			
 
				-        sumImag1 += accum[0] + accum[1];
			
 
				+        sumImag1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(accR1), vget_high_f32(accR1));
			
 
				-        sumReal1B += accum[0] + accum[1];
			
 
				+        sumReal1B += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(accI1), vget_high_f32(accI1));
			
 
				-        sumImag1B += accum[0] + accum[1];
			
 
				+        sumImag1B += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
			
 
				          ** No loop unrolling is used. */
			
@@ -1088,20 +1089,20 @@ arm_status arm_mat_cmplx_mult_f32(
 
				           a0V = vld2q_f32(pIn1);  // load & separate real/imag pSrcA (de-interleave 2)
			
 
				           pIn1 += 8;
			
 
				 
			
 
				-          tempR[0] = *pIn2;
			
 
				-          tempI[0] = *(pIn2 + 1U);
			
 
				+          tempR = vsetq_lane_f32(*pIn2,tempR,0);
			
 
				+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,0);
			
 
				           pIn2 += 2 * numColsB;
			
 
				 
			
 
				-          tempR[1] = *pIn2;
			
 
				-          tempI[1] = *(pIn2 + 1U);
			
 
				+          tempR = vsetq_lane_f32(*pIn2,tempR,1);
			
 
				+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,1);
			
 
				           pIn2 += 2 * numColsB;
			
 
				 
			
 
				-          tempR[2] = *pIn2;
			
 
				-          tempI[2] = *(pIn2 + 1U);
			
 
				+          tempR = vsetq_lane_f32(*pIn2,tempR,2);
			
 
				+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,2);
			
 
				           pIn2 += 2 * numColsB;
			
 
				 
			
 
				-          tempR[3] = *pIn2;
			
 
				-          tempI[3] = *(pIn2 + 1U);
			
 
				+          tempR = vsetq_lane_f32(*pIn2,tempR,3);
			
 
				+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,3);
			
 
				           pIn2 += 2 * numColsB;
			
 
				 
			
 
				           accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
			
@@ -1115,10 +1116,10 @@ arm_status arm_mat_cmplx_mult_f32(
 
				         }
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
			
 
				-        sumReal1 += accum[0] + accum[1];
			
 
				+        sumReal1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
			
 
				-        sumImag1 += accum[0] + accum[1];
			
 
				+        sumImag1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
			
 
				          ** No loop unrolling is used. */
			
--- a/Source/MatrixFunctions/arm_mat_inverse_f32.c
+++ b/Source/MatrixFunctions/arm_mat_inverse_f32.c
@@ -774,7 +774,7 @@ arm_status arm_mat_inverse_f32(
 
				 
			
 
				       /* Pivot element of the row */
			
 
				       in = *pPivotRowIn;
			
 
				-      tmpV = vdupq_n_f32(1.0/in);
			
 
				+      tmpV = vdupq_n_f32(1.0f/in);
			
 
				 
			
 
				       /* Loop over number of columns
			
 
				        * to the right of the pilot element */
			
--- a/Source/MatrixFunctions/arm_mat_mult_f32.c
+++ b/Source/MatrixFunctions/arm_mat_mult_f32.c
@@ -633,7 +633,7 @@ arm_status arm_mat_mult_f32(
 
				           a6V = vld1q_f32(pIn1G); 
			
 
				           a7V = vld1q_f32(pIn1H); 
			
 
				 
			
 
				-	  pIn1 += 4;
			
 
				+	      pIn1 += 4;
			
 
				           pIn1B += 4;
			
 
				           pIn1C += 4;
			
 
				           pIn1D += 4;
			
@@ -642,13 +642,13 @@ arm_status arm_mat_mult_f32(
 
				           pIn1G += 4;
			
 
				           pIn1H += 4;
			
 
				           
			
 
				-          temp[0] = *pIn2;
			
 
				+          temp = vsetq_lane_f32(*pIn2,temp,0);
			
 
				           pIn2 += numColsB;
			
 
				-          temp[1] = *pIn2;
			
 
				+          temp = vsetq_lane_f32(*pIn2,temp,1);
			
 
				           pIn2 += numColsB;
			
 
				-          temp[2] = *pIn2;
			
 
				+          temp = vsetq_lane_f32(*pIn2,temp,2);
			
 
				           pIn2 += numColsB;
			
 
				-          temp[3] = *pIn2;
			
 
				+          temp = vsetq_lane_f32(*pIn2,temp,3);
			
 
				           pIn2 += numColsB;
			
 
				 
			
 
				           acc0 = vmlaq_f32(acc0,a0V,temp);
			
@@ -665,28 +665,28 @@ arm_status arm_mat_mult_f32(
 
				         }
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
			
 
				-        sum0 += accum[0] + accum[1];
			
 
				+        sum0 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
			
 
				-        sum1 += accum[0] + accum[1];
			
 
				+        sum1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
			
 
				-        sum2 += accum[0] + accum[1];
			
 
				+        sum2 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
			
 
				-        sum3 += accum[0] + accum[1];
			
 
				+        sum3 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
			
 
				-        sum4 += accum[0] + accum[1];
			
 
				+        sum4 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5));
			
 
				-        sum5 += accum[0] + accum[1];
			
 
				+        sum5 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6));
			
 
				-        sum6 += accum[0] + accum[1];
			
 
				+        sum6 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7));
			
 
				-        sum7 += accum[0] + accum[1];
			
 
				+        sum7 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
			
 
				          ** No loop unrolling is used. */
			
@@ -782,13 +782,13 @@ arm_status arm_mat_mult_f32(
 
				           a0V = vld1q_f32(pIn1);  // load & separate real/imag pSrcA (de-interleave 2)
			
 
				           pIn1 += 4;
			
 
				           
			
 
				-          temp[0] = *pIn2;
			
 
				+          temp = vsetq_lane_f32(*pIn2,temp,0);
			
 
				           pIn2 += numColsB;
			
 
				-          temp[1] = *pIn2;
			
 
				+          temp = vsetq_lane_f32(*pIn2,temp,1);
			
 
				           pIn2 += numColsB;
			
 
				-          temp[2] = *pIn2;
			
 
				+          temp = vsetq_lane_f32(*pIn2,temp,2);
			
 
				           pIn2 += numColsB;
			
 
				-          temp[3] = *pIn2;
			
 
				+          temp = vsetq_lane_f32(*pIn2,temp,3);
			
 
				           pIn2 += numColsB;
			
 
				 
			
 
				           acc0 = vmlaq_f32(acc0,a0V,temp);
			
@@ -798,7 +798,7 @@ arm_status arm_mat_mult_f32(
 
				         }
			
 
				 
			
 
				         accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
			
 
				-        sum += accum[0] + accum[1];
			
 
				+        sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
			
 
				 
			
 
				         /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
			
 
				          ** No loop unrolling is used. */
			
--- a/Source/SVMFunctions/arm_svm_linear_predict_f32.c
+++ b/Source/SVMFunctions/arm_svm_linear_predict_f32.c
@@ -344,25 +344,25 @@ void arm_svm_linear_predict_f32(
 
				             blkCnt -- ;
			
 
				         }
			
 
				         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
			
 
				-        dotV[0] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
			
 
				-        dotV[1] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
			
 
				-        dotV[2] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
			
 
				-        dotV[3] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
			
 
				 
			
 
				 
			
 
				         blkCnt = S->vectorDimension & 3;
			
 
				         while (blkCnt > 0U)
			
 
				         {
			
 
				-            dotV[0] = dotV[0] + *pIn * *pSupporta++;
			
 
				-            dotV[1] = dotV[1] + *pIn * *pSupportb++;
			
 
				-            dotV[2] = dotV[2] + *pIn * *pSupportc++;
			
 
				-            dotV[3] = dotV[3] + *pIn * *pSupportd++;
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
			
 
				 
			
 
				             pIn++;
			
 
				 
			
@@ -374,7 +374,7 @@ void arm_svm_linear_predict_f32(
 
				 
			
 
				         accum = vmulq_f32(vec1,dotV);
			
 
				         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
			
 
				-        sum += accum2[0] + accum2[1];
			
 
				+        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
			
 
				 
			
 
				         pSupporta += 3*S->vectorDimension;
			
 
				         pSupportb += 3*S->vectorDimension;
			
@@ -406,7 +406,7 @@ void arm_svm_linear_predict_f32(
 
				             blkCnt -- ;
			
 
				         }
			
 
				         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
			
 
				-        dot = accum2[0] + accum2[1];
			
 
				+        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
			
 
				 
			
 
				 
			
 
				         blkCnt = S->vectorDimension & 3;
			
--- a/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
+++ b/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
@@ -370,25 +370,25 @@ void arm_svm_polynomial_predict_f32(
 
				             blkCnt -- ;
			
 
				         }
			
 
				         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
			
 
				-        dotV[0] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
			
 
				-        dotV[1] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
			
 
				-        dotV[2] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
			
 
				-        dotV[3] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
			
 
				 
			
 
				 
			
 
				         blkCnt = S->vectorDimension & 3;
			
 
				         while (blkCnt > 0U)
			
 
				         {
			
 
				-            dotV[0] = dotV[0] + *pIn * *pSupporta++;
			
 
				-            dotV[1] = dotV[1] + *pIn * *pSupportb++;
			
 
				-            dotV[2] = dotV[2] + *pIn * *pSupportc++;
			
 
				-            dotV[3] = dotV[3] + *pIn * *pSupportd++;
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
			
 
				 
			
 
				             pIn++;
			
 
				 
			
@@ -406,7 +406,7 @@ void arm_svm_polynomial_predict_f32(
 
				 
			
 
				         accum = vmulq_f32(vec1,dotV);
			
 
				         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
			
 
				-        sum += accum2[0] + accum2[1];
			
 
				+        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
			
 
				 
			
 
				         pSupporta += 3*S->vectorDimension;
			
 
				         pSupportb += 3*S->vectorDimension;
			
@@ -439,7 +439,7 @@ void arm_svm_polynomial_predict_f32(
 
				             blkCnt -- ;
			
 
				         }
			
 
				         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
			
 
				-        dot = accum2[0] + accum2[1];
			
 
				+        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
			
 
				 
			
 
				 
			
 
				         blkCnt = S->vectorDimension & 3;
			
--- a/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
+++ b/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
@@ -396,25 +396,26 @@ void arm_svm_rbf_predict_f32(
 
				             blkCnt -- ;
			
 
				         }
			
 
				         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
			
 
				-        dotV[0] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
			
 
				-        dotV[1] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
			
 
				-        dotV[2] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
			
 
				-        dotV[3] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
			
 
				 
			
 
				 
			
 
				         blkCnt = S->vectorDimension & 3;
			
 
				         while (blkCnt > 0U)
			
 
				         {
			
 
				-            dotV[0] = dotV[0] + SQ(*pIn - *pSupporta);
			
 
				-            dotV[1] = dotV[1] + SQ(*pIn - *pSupportb);
			
 
				-            dotV[2] = dotV[2] + SQ(*pIn - *pSupportc);
			
 
				-            dotV[3] = dotV[3] + SQ(*pIn - *pSupportd);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + SQ(*pIn - *pSupporta), dotV,0);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + SQ(*pIn - *pSupportb), dotV,1);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + SQ(*pIn - *pSupportc), dotV,2);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + SQ(*pIn - *pSupportd), dotV,3);
			
 
				+
			
 
				             pSupporta++;
			
 
				             pSupportb++;
			
 
				             pSupportc++;
			
@@ -434,7 +435,7 @@ void arm_svm_rbf_predict_f32(
 
				 
			
 
				         accum = vmulq_f32(vec1,dotV);
			
 
				         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
			
 
				-        sum += accum2[0] + accum2[1];
			
 
				+        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
			
 
				 
			
 
				         pSupporta += 3*S->vectorDimension;
			
 
				         pSupportb += 3*S->vectorDimension;
			
@@ -468,7 +469,7 @@ void arm_svm_rbf_predict_f32(
 
				             blkCnt -- ;
			
 
				         }
			
 
				         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
			
 
				-        dot = accum2[0] + accum2[1];
			
 
				+        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
			
 
				 
			
 
				 
			
 
				         blkCnt = S->vectorDimension & 3;
			
--- a/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
+++ b/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
@@ -368,25 +368,25 @@ void arm_svm_sigmoid_predict_f32(
 
				             blkCnt -- ;
			
 
				         }
			
 
				         accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
			
 
				-        dotV[0] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
			
 
				-        dotV[1] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
			
 
				-        dotV[2] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
			
 
				 
			
 
				         accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
			
 
				-        dotV[3] = accum2[0] + accum2[1];
			
 
				+        dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
			
 
				 
			
 
				 
			
 
				         blkCnt = S->vectorDimension & 3;
			
 
				         while (blkCnt > 0U)
			
 
				         {
			
 
				-            dotV[0] = dotV[0] + *pIn * *pSupporta++;
			
 
				-            dotV[1] = dotV[1] + *pIn * *pSupportb++;
			
 
				-            dotV[2] = dotV[2] + *pIn * *pSupportc++;
			
 
				-            dotV[3] = dotV[3] + *pIn * *pSupportd++;
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
			
 
				+            dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
			
 
				 
			
 
				             pIn++;
			
 
				 
			
@@ -404,7 +404,7 @@ void arm_svm_sigmoid_predict_f32(
 
				 
			
 
				         accum = vmulq_f32(vec1,dotV);
			
 
				         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
			
 
				-        sum += accum2[0] + accum2[1];
			
 
				+        sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
			
 
				 
			
 
				         pSupporta += 3*S->vectorDimension;
			
 
				         pSupportb += 3*S->vectorDimension;
			
@@ -437,7 +437,7 @@ void arm_svm_sigmoid_predict_f32(
 
				             blkCnt -- ;
			
 
				         }
			
 
				         accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
			
 
				-        dot = accum2[0] + accum2[1];
			
 
				+        dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
			
 
				 
			
 
				 
			
 
				         blkCnt = S->vectorDimension & 3;
			
--- a/Source/StatisticsFunctions/arm_entropy_f32.c
+++ b/Source/StatisticsFunctions/arm_entropy_f32.c
@@ -125,7 +125,8 @@ float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize)
 
				     }
			
 
				 
			
 
				     accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
			
 
				-    accum = accumV2[0] + accumV2[1];
			
 
				+    accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
			
 
				+    
			
 
				 
			
 
				     blkCnt = blockSize & 3;
			
 
				     while(blkCnt > 0)
			
--- a/Source/StatisticsFunctions/arm_kullback_leibler_f32.c
+++ b/Source/StatisticsFunctions/arm_kullback_leibler_f32.c
@@ -142,7 +142,7 @@ float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSr
 
				     }
			
 
				 
			
 
				     accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
			
 
				-    accum = accumV2[0] + accumV2[1];
			
 
				+    accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
			
 
				 
			
 
				     blkCnt = blockSize & 3;
			
 
				     while(blkCnt > 0)
			
--- a/Source/StatisticsFunctions/arm_logsumexp_f32.c
+++ b/Source/StatisticsFunctions/arm_logsumexp_f32.c
@@ -173,7 +173,7 @@ float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
 
				 
			
 
				       accumV2 = vpmax_f32(vget_low_f32(maxValV),vget_high_f32(maxValV));
			
 
				       accumV2 = vpmax_f32(accumV2,accumV2);
			
 
				-      maxVal = accumV2[0];
			
 
				+      maxVal = vget_lane_f32(accumV2, 0) ;
			
 
				 
			
 
				       blkCnt = (blockSize - 4) & 3;
			
 
				 
			
@@ -211,7 +211,7 @@ float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
 
				     
			
 
				     }
			
 
				     accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
			
 
				-    accum = accumV2[0] + accumV2[1];
			
 
				+    accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
			
 
				 
			
 
				     blkCnt = blockSize & 0x3;
			
 
				     while(blkCnt > 0)
			
--- a/Source/StatisticsFunctions/arm_max_f32.c
+++ b/Source/StatisticsFunctions/arm_max_f32.c
@@ -150,20 +150,26 @@ void arm_max_f32(
 
				   uint32_t * pIndex)
			
 
				 {
			
 
				   float32_t maxVal1, out;               /* Temporary variables to store the output value. */
			
 
				-  uint32_t blkCnt, outIndex, count;              /* loop counter */
			
 
				+  uint32_t blkCnt, outIndex;              /* loop counter */
			
 
				 
			
 
				   float32x4_t outV, srcV;
			
 
				   float32x2_t outV2;
			
 
				 
			
 
				   uint32x4_t idxV;
			
 
				-  uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX};
			
 
				-  uint32x4_t index={4,5,6,7};
			
 
				-  uint32x4_t delta={4,4,4,4};
			
 
				-  uint32x4_t countV={0,1,2,3};
			
 
				+  uint32x4_t maxIdx;
			
 
				+  static const uint32_t indexInit[4]={4,5,6,7};
			
 
				+  static const uint32_t countVInit[4]={0,1,2,3};
			
 
				+
			
 
				+  uint32x4_t index;
			
 
				+  uint32x4_t delta;
			
 
				+  uint32x4_t countV;
			
 
				   uint32x2_t countV2;
			
 
				 
			
 
				-  /* Initialise the count value. */
			
 
				-  count = 0U;
			
 
				+  maxIdx = vdupq_n_u32(ULONG_MAX);
			
 
				+  delta = vdupq_n_u32(4);
			
 
				+  index = vld1q_u32(indexInit);
			
 
				+  countV = vld1q_u32(countVInit);
			
 
				+
			
 
				 
			
 
				   /* Initialise the index value to zero. */
			
 
				   outIndex = 0U;
			
@@ -217,14 +223,14 @@ void arm_max_f32(
 
				     
			
 
				       outV2 = vpmax_f32(vget_low_f32(outV),vget_high_f32(outV));
			
 
				       outV2 = vpmax_f32(outV2,outV2);
			
 
				-      out = outV2[0];
			
 
				+      out = vget_lane_f32(outV2, 0);
			
 
				     
			
 
				       idxV = vceqq_f32(outV, vdupq_n_f32(out));
			
 
				       countV = vbslq_u32(idxV, countV,maxIdx);
			
 
				       
			
 
				       countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
			
 
				       countV2 = vpmin_u32(countV2,countV2);
			
 
				-      outIndex = countV2[0];
			
 
				+      outIndex = vget_lane_u32(countV2,0); 
			
 
				     
			
 
				       /* if (blockSize - 1U) is not multiple of 4 */
			
 
				       blkCnt = (blockSize - 4 ) % 4U;
			
--- a/Source/StatisticsFunctions/arm_max_no_idx_f32.c
+++ b/Source/StatisticsFunctions/arm_max_no_idx_f32.c
@@ -135,4 +135,4 @@ void arm_max_no_idx_f32(
 
				 
			
 
				 /**
			
 
				   @} end of Max group
			
 
				- */
			
 
				+ */
			
--- a/Source/StatisticsFunctions/arm_min_f32.c
+++ b/Source/StatisticsFunctions/arm_min_f32.c
@@ -152,20 +152,24 @@ void arm_min_f32(
 
				   uint32_t * pIndex)
			
 
				 {
			
 
				   float32_t maxVal1, out;               /* Temporary variables to store the output value. */
			
 
				-  uint32_t blkCnt, outIndex, count;              /* loop counter */
			
 
				+  uint32_t blkCnt, outIndex;              /* loop counter */
			
 
				 
			
 
				   float32x4_t outV, srcV;
			
 
				   float32x2_t outV2;
			
 
				 
			
 
				   uint32x4_t idxV;
			
 
				-  uint32x4_t maxIdx={ULONG_MAX,ULONG_MAX,ULONG_MAX,ULONG_MAX};
			
 
				-  uint32x4_t index={4,5,6,7};
			
 
				-  uint32x4_t delta={4,4,4,4};
			
 
				-  uint32x4_t countV={0,1,2,3};
			
 
				+  static const uint32_t indexInit[4]={4,5,6,7};
			
 
				+  static const uint32_t countVInit[4]={0,1,2,3};
			
 
				+  uint32x4_t maxIdx;
			
 
				+  uint32x4_t index;
			
 
				+  uint32x4_t delta;
			
 
				+  uint32x4_t countV;
			
 
				   uint32x2_t countV2;
			
 
				 
			
 
				-  /* Initialise the count value. */
			
 
				-  count = 0U;
			
 
				+  maxIdx = vdupq_n_u32(ULONG_MAX);
			
 
				+  delta = vdupq_n_u32(4);
			
 
				+  index = vld1q_u32(indexInit);
			
 
				+  countV = vld1q_u32(countVInit);
			
 
				 
			
 
				   /* Initialise the index value to zero. */
			
 
				   outIndex = 0U;
			
@@ -219,14 +223,14 @@ void arm_min_f32(
 
				     
			
 
				       outV2 = vpmin_f32(vget_low_f32(outV),vget_high_f32(outV));
			
 
				       outV2 = vpmin_f32(outV2,outV2);
			
 
				-      out = outV2[0];
			
 
				+      out = vget_lane_f32(outV2,0); 
			
 
				     
			
 
				       idxV = vceqq_f32(outV, vdupq_n_f32(out));
			
 
				       countV = vbslq_u32(idxV, countV,maxIdx);
			
 
				       
			
 
				       countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
			
 
				       countV2 = vpmin_u32(countV2,countV2);
			
 
				-      outIndex = countV2[0];
			
 
				+      outIndex = vget_lane_u32(countV2,0); 
			
 
				     
			
 
				       /* if (blockSize - 1U) is not multiple of 4 */
			
 
				       blkCnt = (blockSize - 4 ) % 4U;
			
--- a/Source/StatisticsFunctions/arm_power_f32.c
+++ b/Source/StatisticsFunctions/arm_power_f32.c
@@ -136,7 +136,7 @@ void arm_power_f32(
 
				     blkCnt--;
			
 
				   }
			
 
				   sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
			
 
				-  sum = sumV2[0] + sumV2[1];
			
 
				+  sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
			
 
				 
			
 
				   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
			
 
				    ** No loop unrolling is used. */
			
--- a/Source/StatisticsFunctions/arm_rms_f32.c
+++ b/Source/StatisticsFunctions/arm_rms_f32.c
@@ -103,7 +103,7 @@ void arm_rms_f32(
 
				   }
			
 
				 
			
 
				   sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
			
 
				-  sum = sumV2[0] + sumV2[1];
			
 
				+  sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
			
 
				 
			
 
				   /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
			
 
				    ** No loop unrolling is used. */
			
--- a/Source/SupportFunctions/arm_bitonic_sort_f32.c
+++ b/Source/SupportFunctions/arm_bitonic_sort_f32.c
@@ -834,47 +834,41 @@ static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
 
				     arm_bitonic_merge_128_f32(pSrc+128, dir);
			
 
				 }
			
 
				 
			
 
				+#define SWAP(a,i,j)                            \
			
 
				+    temp = vgetq_lane_f32(a, j);                   \
			
 
				+    a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\
			
 
				+    a = vsetq_lane_f32(temp, a, i);
			
 
				+
			
 
				 static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
			
 
				 {
			
 
				     float32_t temp;
			
 
				 
			
 
				-    if( dir==(a[0]>a[1]) )
			
 
				+
			
 
				+    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
			
 
				     {
			
 
				-    temp = a[1];
			
 
				-    a[1] = a[0];
			
 
				-    a[0] = temp;
			
 
				+        SWAP(a,0,1);
			
 
				     }
			
 
				-    if( dir==(a[2]>a[3]) )
			
 
				+    if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
			
 
				     {
			
 
				-    temp = a[3];
			
 
				-    a[3] = a[2];
			
 
				-    a[2] = temp;
			
 
				+       SWAP(a,2,3);
			
 
				     }
			
 
				 
			
 
				-    if( dir==(a[0]>a[3]) )
			
 
				+    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
			
 
				     {
			
 
				-    temp = a[3];
			
 
				-    a[3] = a[0];
			
 
				-    a[0] = temp;
			
 
				+      SWAP(a,0,3);
			
 
				     }
			
 
				-    if( dir==(a[1]>a[2]) )
			
 
				+    if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
			
 
				     {
			
 
				-    temp = a[2];
			
 
				-    a[2] = a[1];
			
 
				-    a[1] = temp;
			
 
				+      SWAP(a,1,2);
			
 
				     }
			
 
				 
			
 
				-    if( dir==(a[0]>a[1]) )
			
 
				+    if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
			
 
				     {
			
 
				-    temp = a[1];
			
 
				-    a[1] = a[0];
			
 
				-    a[0] = temp;
			
 
				+      SWAP(a,0,1);
			
 
				     }
			
 
				-    if( dir==(a[2]>a[3]) )
			
 
				+    if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) )
			
 
				     {
			
 
				-    temp = a[3];
			
 
				-    a[3] = a[2];
			
 
				-    a[2] = temp;
			
 
				+      SWAP(a,2,3);
			
 
				     }
			
 
				 
			
 
				     return a;
			
--- a/Source/SupportFunctions/arm_spline_interp_f32.c
+++ b/Source/SupportFunctions/arm_spline_interp_f32.c
@@ -157,6 +157,11 @@ void arm_spline_f32(
 
				         float32_t * pDst,
			
 
				 	uint32_t blockSize)
			
 
				 {
			
 
				+    /* 
			
 
				+
			
 
				+    As explained in arm_spline_interp_init_f32.c, this must be > 1
			
 
				+    
			
 
				+    */
			
 
				     int32_t n = S->n_x;
			
 
				     arm_spline_type type = S->type;
			
 
				 
			
@@ -169,6 +174,9 @@ void arm_spline_f32(
 
				     float32_t bi, di;
			
 
				     float32_t x_sc;
			
 
				 
			
 
				+    bi = 0.0f;
			
 
				+    di = 0.0f;
			
 
				+
			
 
				     const float32_t * pXq = xq;
			
 
				 
			
 
				     int32_t blkCnt = (int32_t)blockSize;
			
--- a/Source/SupportFunctions/arm_spline_interp_init_f32.c
+++ b/Source/SupportFunctions/arm_spline_interp_init_f32.c
@@ -40,7 +40,7 @@
 
				 /**
			
 
				   * @brief Initialization function for the floating-point cubic spline interpolation.
			
 
				   * @param[in,out] S        points to an instance of the floating-point spline structure.
			
 
				-  * @param[in]     n        number of known data points.
			
 
				+  * @param[in]     n        number of known data points (must > 1).
			
 
				   * @param[in]     type     type of cubic spline interpolation (boundary conditions)
			
 
				   */
			
 
				 
			
--- a/Source/SupportFunctions/arm_weighted_sum_f32.c
+++ b/Source/SupportFunctions/arm_weighted_sum_f32.c
@@ -136,10 +136,10 @@ float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uin
 
				     }
			
 
				 
			
 
				     tempV = vpadd_f32(vget_low_f32(accum1V),vget_high_f32(accum1V));
			
 
				-    accum1 = tempV[0] + tempV[1];
			
 
				+    accum1 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
			
 
				 
			
 
				     tempV = vpadd_f32(vget_low_f32(accum2V),vget_high_f32(accum2V));
			
 
				-    accum2 = tempV[0] + tempV[1];
			
 
				+    accum2 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
			
 
				 
			
 
				     blkCnt = blockSize & 3;
			
 
				     while(blkCnt > 0)
			
--- a/Testing/FrameworkSource/Error.cpp
+++ b/Testing/FrameworkSource/Error.cpp
@@ -25,9 +25,10 @@
 
				  * See the License for the specific language governing permissions and
			
 
				  * limitations under the License.
			
 
				  */
			
 
				-#include "arm_math.h"
			
 
				-#include "Error.h"
			
 
				 #include <stdlib.h> 
			
 
				+#include <stdio.h>
			
 
				+#include "Error.h"
			
 
				+#include "arm_math.h"
			
 
				 
			
 
				 namespace Client {
			
 
				 
			
--- a/Testing/FrameworkSource/IORunner.cpp
+++ b/Testing/FrameworkSource/IORunner.cpp
@@ -33,6 +33,7 @@
 
				 #include <string>
			
 
				 #include <cstddef>
			
 
				 #include <stdlib.h>
			
 
				+#include <stdio.h>
			
 
				 #include "IORunner.h"
			
 
				 #include "Error.h"
			
 
				 #include "Timing.h"
			
--- a/Testing/RTE_Components.h
+++ b/Testing/RTE_Components.h
@@ -2,4 +2,4 @@
 
				 #define RTE_COMPONENTS_H
			
 
				 
			
 
				 
			
 
				-#endif /* RTE_COMPONENTS_H */
			
 
				+#endif /* RTE_COMPONENTS_H */
			
--- a/Testing/Source/Tests/BIQUADF32.cpp
+++ b/Testing/Source/Tests/BIQUADF32.cpp
@@ -1,4 +1,5 @@
 
				 #include "BIQUADF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 98
			
@@ -31,10 +32,7 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				         arm_biquad_mod_coef_f32 *coefsmodp = (arm_biquad_mod_coef_f32*)vecCoefs.ptr();
			
 
				         #endif
			
 
				 
			
 
				-        int i,j;
			
 
				         int blockSize;
			
 
				-        int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         
			
 
				 
			
@@ -48,7 +46,6 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				         */
			
 
				         
			
 
				            blockSize = inputs.nbSamples() >> 1;
			
 
				-           numTaps = coefs.nbSamples();
			
 
				 
			
 
				            /*
			
 
				 
			
@@ -107,10 +104,7 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				         const float32_t *inputp = inputs.ptr();
			
 
				         float32_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				         int blockSize;
			
 
				-        int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         
			
 
				 
			
@@ -124,7 +118,6 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				         */
			
 
				         
			
 
				            blockSize = inputs.nbSamples() >> 1;
			
 
				-           numTaps = coefs.nbSamples();
			
 
				 
			
 
				            /*
			
 
				 
			
@@ -189,7 +182,6 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				 
			
 
				         int blockSize;
			
 
				         int numStages;
			
 
				-        int nb=0;
			
 
				         int i;
			
 
				 
			
 
				         
			
@@ -267,7 +259,6 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				 
			
 
				         int blockSize;
			
 
				         int numStages;
			
 
				-        int nb=0;
			
 
				 
			
 
				         int i;
			
 
				 
			
@@ -350,7 +341,6 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				 
			
 
				         int blockSize;
			
 
				         int numStages;
			
 
				-        int nb=0;
			
 
				 
			
 
				         int i;
			
 
				 
			
@@ -410,8 +400,6 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				     void BIQUADF32::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
			
 
				     {
			
 
				       
			
 
				-       Testing::nbSamples_t nb=MAX_NB_SAMPLES; 
			
 
				-
			
 
				        
			
 
				        switch(id)
			
 
				        {
			
--- a/Testing/Source/Tests/BIQUADF64.cpp
+++ b/Testing/Source/Tests/BIQUADF64.cpp
@@ -1,4 +1,5 @@
 
				 #include "BIQUADF64.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 98
			
@@ -28,10 +29,7 @@ static __ALIGNED(8) float64_t coeffArray[64];
 
				         float64_t *inputp = inputs.ptr();
			
 
				         float64_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				         int blockSize;
			
 
				-        int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         
			
 
				 
			
@@ -45,7 +43,6 @@ static __ALIGNED(8) float64_t coeffArray[64];
 
				         */
			
 
				         
			
 
				            blockSize = inputs.nbSamples() >> 1;
			
 
				-           numTaps = coefs.nbSamples();
			
 
				 
			
 
				            /*
			
 
				 
			
@@ -98,7 +95,6 @@ static __ALIGNED(8) float64_t coeffArray[64];
 
				 
			
 
				         int blockSize;
			
 
				         int numStages;
			
 
				-        int nb=0;
			
 
				 
			
 
				         int i;
			
 
				 
			
@@ -158,8 +154,6 @@ static __ALIGNED(8) float64_t coeffArray[64];
 
				     void BIQUADF64::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
			
 
				     {
			
 
				       
			
 
				-       Testing::nbSamples_t nb=MAX_NB_SAMPLES; 
			
 
				-
			
 
				        
			
 
				        switch(id)
			
 
				        {
			
--- a/Testing/Source/Tests/BIQUADQ15.cpp
+++ b/Testing/Source/Tests/BIQUADQ15.cpp
@@ -1,4 +1,5 @@
 
				 #include "BIQUADQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 
			
@@ -26,10 +27,7 @@ static __ALIGNED(8) q15_t coeffArray[32];
 
				         const q15_t *inputp = inputs.ptr();
			
 
				         q15_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				         int blockSize;
			
 
				-        int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         
			
 
				 
			
@@ -43,7 +41,6 @@ static __ALIGNED(8) q15_t coeffArray[32];
 
				         */
			
 
				         
			
 
				            blockSize = inputs.nbSamples() >> 1;
			
 
				-           numTaps = coefs.nbSamples();
			
 
				 
			
 
				            /*
			
 
				 
			
@@ -85,7 +82,6 @@ static __ALIGNED(8) q15_t coeffArray[32];
 
				     void BIQUADQ15::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
			
 
				     {
			
 
				       
			
 
				-       Testing::nbSamples_t nb=MAX_NB_SAMPLES; 
			
 
				 
			
 
				        
			
 
				        switch(id)
			
--- a/Testing/Source/Tests/BIQUADQ31.cpp
+++ b/Testing/Source/Tests/BIQUADQ31.cpp
@@ -1,4 +1,5 @@
 
				 #include "BIQUADQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 
			
@@ -24,10 +25,7 @@ static __ALIGNED(8) q31_t coeffArray[32];
 
				         const q31_t *inputp = inputs.ptr();
			
 
				         q31_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				         int blockSize;
			
 
				-        int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         
			
 
				 
			
@@ -41,7 +39,6 @@ static __ALIGNED(8) q31_t coeffArray[32];
 
				         */
			
 
				         
			
 
				            blockSize = inputs.nbSamples() >> 1;
			
 
				-           numTaps = coefs.nbSamples();
			
 
				 
			
 
				            /*
			
 
				 
			
@@ -87,10 +84,7 @@ static __ALIGNED(8) q31_t coeffArray[32];
 
				         q31_t *inputp = inputs.ptr();
			
 
				         q31_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				         int blockSize;
			
 
				-        int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         
			
 
				 
			
@@ -104,7 +98,6 @@ static __ALIGNED(8) q31_t coeffArray[32];
 
				         */
			
 
				         
			
 
				            blockSize = inputs.nbSamples() >> 1;
			
 
				-           numTaps = coefs.nbSamples();
			
 
				 
			
 
				            /*
			
 
				 
			
@@ -145,8 +138,6 @@ static __ALIGNED(8) q31_t coeffArray[32];
 
				     void BIQUADQ31::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
			
 
				     {
			
 
				       
			
 
				-       Testing::nbSamples_t nb=MAX_NB_SAMPLES; 
			
 
				-
			
 
				        
			
 
				        switch(id)
			
 
				        {
			
--- a/Testing/Source/Tests/BasicTestsF32.cpp
+++ b/Testing/Source/Tests/BasicTestsF32.cpp
@@ -1,4 +1,5 @@
 
				 #include "BasicTestsF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 120
			
--- a/Testing/Source/Tests/BasicTestsQ15.cpp
+++ b/Testing/Source/Tests/BasicTestsQ15.cpp
@@ -1,4 +1,5 @@
 
				 #include "BasicTestsQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 70
			
--- a/Testing/Source/Tests/BasicTestsQ31.cpp
+++ b/Testing/Source/Tests/BasicTestsQ31.cpp
@@ -1,4 +1,5 @@
 
				 #include "BasicTestsQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 100
			
--- a/Testing/Source/Tests/BasicTestsQ7.cpp
+++ b/Testing/Source/Tests/BasicTestsQ7.cpp
@@ -1,4 +1,5 @@
 
				 #include "BasicTestsQ7.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 20
			
--- a/Testing/Source/Tests/BayesF32.cpp
+++ b/Testing/Source/Tests/BayesF32.cpp
@@ -1,9 +1,9 @@
 
				 #include "BayesF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				 
			
 
				 
			
 
				     void BayesF32::test_gaussian_naive_bayes_predict_f32()
			
--- a/Testing/Source/Tests/BinaryTestsF32.cpp
+++ b/Testing/Source/Tests/BinaryTestsF32.cpp
@@ -1,4 +1,5 @@
 
				 #include "BinaryTestsF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 120
			
--- a/Testing/Source/Tests/BinaryTestsQ15.cpp
+++ b/Testing/Source/Tests/BinaryTestsQ15.cpp
@@ -1,4 +1,5 @@
 
				 #include "BinaryTestsQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 70
			
--- a/Testing/Source/Tests/BinaryTestsQ31.cpp
+++ b/Testing/Source/Tests/BinaryTestsQ31.cpp
@@ -1,4 +1,5 @@
 
				 #include "BinaryTestsQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 100
			
--- a/Testing/Source/Tests/ComplexTestsF32.cpp
+++ b/Testing/Source/Tests/ComplexTestsF32.cpp
@@ -1,4 +1,5 @@
 
				 #include "ComplexTestsF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 120
			
--- a/Testing/Source/Tests/ComplexTestsQ15.cpp
+++ b/Testing/Source/Tests/ComplexTestsQ15.cpp
@@ -1,4 +1,5 @@
 
				 #include "ComplexTestsQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 25
			
--- a/Testing/Source/Tests/ComplexTestsQ31.cpp
+++ b/Testing/Source/Tests/ComplexTestsQ31.cpp
@@ -1,4 +1,5 @@
 
				 #include "ComplexTestsQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 100
			
--- a/Testing/Source/Tests/DistanceTestsF32.cpp
+++ b/Testing/Source/Tests/DistanceTestsF32.cpp
@@ -1,9 +1,9 @@
 
				 #include "DistanceTestsF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				 
			
 
				 
			
 
				     void DistanceTestsF32::test_braycurtis_distance_f32()
			
--- a/Testing/Source/Tests/DistanceTestsU32.cpp
+++ b/Testing/Source/Tests/DistanceTestsU32.cpp
@@ -1,10 +1,9 @@
 
				 #include "DistanceTestsU32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				-
			
 
				 #define ERROR_THRESHOLD 1e-8
			
 
				 
			
 
				     void DistanceTestsU32::test_dice_distance()
			
--- a/Testing/Source/Tests/ExampleCategoryF32.cpp
+++ b/Testing/Source/Tests/ExampleCategoryF32.cpp
@@ -1,4 +1,5 @@
 
				 #include "ExampleCategoryF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 /*
			
--- a/Testing/Source/Tests/ExampleCategoryQ15.cpp
+++ b/Testing/Source/Tests/ExampleCategoryQ15.cpp
@@ -1,4 +1,5 @@
 
				 #include "ExampleCategoryQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 70
			
--- a/Testing/Source/Tests/ExampleCategoryQ31.cpp
+++ b/Testing/Source/Tests/ExampleCategoryQ31.cpp
@@ -1,4 +1,5 @@
 
				 #include "ExampleCategoryQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 100
			
--- a/Testing/Source/Tests/ExampleCategoryQ7.cpp
+++ b/Testing/Source/Tests/ExampleCategoryQ7.cpp
@@ -1,4 +1,5 @@
 
				 #include "ExampleCategoryQ7.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 20
			
--- a/Testing/Source/Tests/FIRF32.cpp
+++ b/Testing/Source/Tests/FIRF32.cpp
@@ -1,4 +1,5 @@
 
				 #include "FIRF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 120
			
@@ -27,10 +28,12 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				         const float32_t *inputp = inputs.ptr();
			
 
				         float32_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				+        int i;
			
 
				+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
			
 
				+        int j;
			
 
				+#endif
			
 
				         int blockSize;
			
 
				         int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         
			
 
				 
			
@@ -93,7 +96,6 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				            configp += 2;
			
 
				            orgcoefsp += numTaps;
			
 
				 
			
 
				-           nb += 2*blockSize;
			
 
				         }
			
 
				 
			
 
				 
			
@@ -109,7 +111,6 @@ static __ALIGNED(8) float32_t coeffArray[32];
 
				     void FIRF32::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
			
 
				     {
			
 
				       
			
 
				-       Testing::nbSamples_t nb=MAX_NB_SAMPLES; 
			
 
				 
			
 
				        
			
 
				        switch(id)
			
--- a/Testing/Source/Tests/FIRQ15.cpp
+++ b/Testing/Source/Tests/FIRQ15.cpp
@@ -1,4 +1,5 @@
 
				 #include "FIRQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 59
			
@@ -21,10 +22,12 @@ static __ALIGNED(8) q15_t coeffArray[32];
 
				         const q15_t *inputp = inputs.ptr();
			
 
				         q15_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				+        int i;
			
 
				+#if defined(ARM_MATH_MVEI)
			
 
				+        int j;
			
 
				+#endif
			
 
				         int blockSize;
			
 
				         int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         /*
			
 
				 
			
@@ -86,7 +89,6 @@ static __ALIGNED(8) q15_t coeffArray[32];
 
				            configp += 2;
			
 
				            orgcoefsp += numTaps;
			
 
				 
			
 
				-           nb += 2*blockSize;
			
 
				 
			
 
				         }
			
 
				 
			
@@ -102,8 +104,6 @@ static __ALIGNED(8) q15_t coeffArray[32];
 
				     void FIRQ15::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
			
 
				     {
			
 
				       
			
 
				-       Testing::nbSamples_t nb=MAX_NB_SAMPLES; 
			
 
				-
			
 
				        
			
 
				        switch(id)
			
 
				        {
			
--- a/Testing/Source/Tests/FIRQ31.cpp
+++ b/Testing/Source/Tests/FIRQ31.cpp
@@ -1,4 +1,5 @@
 
				 #include "FIRQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 100
			
@@ -20,10 +21,12 @@ static __ALIGNED(8) q31_t coeffArray[32];
 
				         const q31_t *inputp = inputs.ptr();
			
 
				         q31_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				+        int i;
			
 
				+#if defined(ARM_MATH_MVEI)
			
 
				+        int j;
			
 
				+#endif
			
 
				         int blockSize;
			
 
				         int numTaps;
			
 
				-        int nb=0;
			
 
				 
			
 
				         /*
			
 
				 
			
@@ -83,7 +86,6 @@ static __ALIGNED(8) q31_t coeffArray[32];
 
				            configp += 2;
			
 
				            orgcoefsp += numTaps;
			
 
				 
			
 
				-           nb += 2*blockSize;
			
 
				 
			
 
				         }
			
 
				 
			
@@ -99,8 +101,6 @@ static __ALIGNED(8) q31_t coeffArray[32];
 
				     void FIRQ31::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
			
 
				     {
			
 
				       
			
 
				-       Testing::nbSamples_t nb=MAX_NB_SAMPLES; 
			
 
				-
			
 
				        
			
 
				        switch(id)
			
 
				        {
			
--- a/Testing/Source/Tests/FIRQ7.cpp
+++ b/Testing/Source/Tests/FIRQ7.cpp
@@ -1,4 +1,5 @@
 
				 #include "FIRQ7.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 #define SNR_THRESHOLD 10
			
@@ -21,7 +22,10 @@ static __ALIGNED(8) q7_t coeffArray[32];
 
				         const q7_t *inputp = inputs.ptr();
			
 
				         q7_t *outp = output.ptr();
			
 
				 
			
 
				-        int i,j;
			
 
				+        int i;
			
 
				+#if defined(ARM_MATH_MVEI)
			
 
				+        int j;
			
 
				+#endif
			
 
				         int blockSize;
			
 
				         int numTaps;
			
 
				 
			
@@ -97,8 +101,6 @@ static __ALIGNED(8) q7_t coeffArray[32];
 
				     void FIRQ7::setUp(Testing::testID_t id,std::vector<Testing::param_t>& params,Client::PatternMgr *mgr)
			
 
				     {
			
 
				       
			
 
				-       Testing::nbSamples_t nb=MAX_NB_SAMPLES; 
			
 
				-
			
 
				        
			
 
				        switch(id)
			
 
				        {
			
--- a/Testing/Source/Tests/FastMathF32.cpp
+++ b/Testing/Source/Tests/FastMathF32.cpp
@@ -1,10 +1,10 @@
 
				 #include "FastMathF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "arm_vec_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				 
			
 
				 #define SNR_THRESHOLD 120
			
 
				 /* 
			
--- a/Testing/Source/Tests/FastMathQ15.cpp
+++ b/Testing/Source/Tests/FastMathQ15.cpp
@@ -1,9 +1,9 @@
 
				 #include "FastMathQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				 
			
 
				 #define SNR_THRESHOLD 70
			
 
				 /* 
			
--- a/Testing/Source/Tests/FastMathQ31.cpp
+++ b/Testing/Source/Tests/FastMathQ31.cpp
@@ -1,9 +1,9 @@
 
				 #include "FastMathQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				 
			
 
				 #define SNR_THRESHOLD 100
			
 
				 /* 
			
--- a/Testing/Source/Tests/FullyConnected.cpp
+++ b/Testing/Source/Tests/FullyConnected.cpp
@@ -1,10 +1,11 @@
 
				 #include "FullyConnected.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_nnfunctions.h"
			
 
				 #include "Test.h"
			
 
				 #include "stdio.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				+
			
 
				 
			
 
				 void printPattern(char *s,Client::AnyPattern<q7_t> pat)
			
 
				 {
			
--- a/Testing/Source/Tests/MISCF32.cpp
+++ b/Testing/Source/Tests/MISCF32.cpp
@@ -1,11 +1,10 @@
 
				 #include "MISCF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "arm_vec_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				-
			
 
				 #define SNR_THRESHOLD 120
			
 
				 /* 
			
 
				 
			
--- a/Testing/Source/Tests/MISCQ15.cpp
+++ b/Testing/Source/Tests/MISCQ15.cpp
@@ -1,11 +1,10 @@
 
				 #include "MISCQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "arm_vec_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				-
			
 
				 #define SNR_THRESHOLD 70
			
 
				 /* 
			
 
				 
			
--- a/Testing/Source/Tests/MISCQ31.cpp
+++ b/Testing/Source/Tests/MISCQ31.cpp
@@ -1,11 +1,10 @@
 
				 #include "MISCQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "arm_vec_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				-
			
 
				 #define SNR_THRESHOLD 100
			
 
				 /* 
			
 
				 
			
--- a/Testing/Source/Tests/MISCQ7.cpp
+++ b/Testing/Source/Tests/MISCQ7.cpp
@@ -1,11 +1,10 @@
 
				 #include "MISCQ7.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "arm_vec_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				-
			
 
				 #define SNR_THRESHOLD 15
			
 
				 /* 
			
 
				 
			
--- a/Testing/Source/Tests/NNSupport.cpp
+++ b/Testing/Source/Tests/NNSupport.cpp
@@ -1,10 +1,9 @@
 
				 #include "NNSupport.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_nnfunctions.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				-
			
 
				 
			
 
				 
			
 
				     void NNSupport::test_nn_elementwise_add_s8()
			
--- a/Testing/Source/Tests/Pooling.cpp
+++ b/Testing/Source/Tests/Pooling.cpp
@@ -1,10 +1,9 @@
 
				 #include "Pooling.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_nnfunctions.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				-
			
 
				 
			
 
				     void Pooling::test_avgpool_s8()
			
 
				     {
			
--- a/Testing/Source/Tests/SVMF32.cpp
+++ b/Testing/Source/Tests/SVMF32.cpp
@@ -1,4 +1,5 @@
 
				 #include "SVMF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 
			
 
				 
			
--- a/Testing/Source/Tests/Softmax.cpp
+++ b/Testing/Source/Tests/Softmax.cpp
@@ -1,10 +1,9 @@
 
				 #include "Softmax.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_nnfunctions.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				-
			
 
				 /*
			
 
				 
			
 
				 Tests have shown that, compared to a float32 implementation
			
--- a/Testing/Source/Tests/StatsTestsF32.cpp
+++ b/Testing/Source/Tests/StatsTestsF32.cpp
@@ -1,9 +1,9 @@
 
				 #include "StatsTestsF32.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "arm_math.h"
			
 
				 #include "Test.h"
			
 
				 
			
 
				-#include <cstdio>
			
 
				 
			
 
				 #define SNR_THRESHOLD 120
			
 
				 /* 
			
--- a/Testing/Source/Tests/StatsTestsQ15.cpp
+++ b/Testing/Source/Tests/StatsTestsQ15.cpp
@@ -1,5 +1,6 @@
 
				 #include "arm_math.h"
			
 
				 #include "StatsTestsQ15.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "Test.h"
			
 
				 
			
--- a/Testing/Source/Tests/StatsTestsQ31.cpp
+++ b/Testing/Source/Tests/StatsTestsQ31.cpp
@@ -1,5 +1,6 @@
 
				 #include "arm_math.h"
			
 
				 #include "StatsTestsQ31.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "Test.h"
			
 
				 
			
--- a/Testing/Source/Tests/StatsTestsQ7.cpp
+++ b/Testing/Source/Tests/StatsTestsQ7.cpp
@@ -1,5 +1,6 @@
 
				 #include "arm_math.h"
			
 
				 #include "StatsTestsQ7.h"
			
 
				+#include <stdio.h>
			
 
				 #include "Error.h"
			
 
				 #include "Test.h"