6 år sedan · fc9c681e50
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -165,15 +165,70 @@ void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t b
 
				  */
			
 
				 void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);
			
 
				 
			
 
				+/**
			
 
				+  @brief         Read 2 q15 elements (copied as 4 bytes) into one q31 word and post increment pointer.
			
 
				+  @param[in,out] in_q15   Pointer to pointer that holds address of input; the held pointer is advanced by 2 elements.
			
 
				+  @return        q31 value holding the 4 bytes copied from the input
			
 
				+ */
			
 
				+__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15)
			
 
				+{
			
 
				+  q31_t val;
			
 
				+
			
 
				+  memcpy(&val, *in_q15, 4); /* copy the bytes rather than dereference a cast pointer */
			
 
				+  *in_q15 += 2; /* post increment: step the caller's pointer past the 2 elements just read */
			
 
				+
			
 
				+  return (val);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+  @brief         Read 4 q7 elements (copied as 4 bytes) into one q31 word and post increment pointer.
			
 
				+  @param[in,out] in_q7       Pointer to pointer that holds address of input; the held pointer is advanced by 4 elements.
			
 
				+  @return        q31 value holding the 4 bytes copied from the input
			
 
				+ */
			
 
				+__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
			
 
				+{
			
 
				+  q31_t val;
			
 
				+  memcpy(&val, *in_q7, 4); /* copy the bytes rather than dereference a cast pointer */
			
 
				+  *in_q7 += 4; /* post increment: step the caller's pointer past the 4 elements just read */
			
 
				+
			
 
				+  return (val);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+  @brief         Read 2 q15 elements (copied as 4 bytes) into one q31 word without moving the pointer.
			
 
				+  @param[in]     in_q15   Pointer to the first of the 2 input elements; passed by value, so the caller's pointer is unchanged.
			
 
				+  @return        q31 value holding the 4 bytes copied from the input
			
 
				+ */
			
 
				+__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
			
 
				+{
			
 
				+  q31_t val;
			
 
				+  memcpy(&val, in_q15, 4); /* copy the bytes rather than dereference a cast pointer */
			
 
				+
			
 
				+  return (val);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+  @brief         Read 4 q7 values (copied as 4 bytes) into one q31 word without moving the pointer.
			
 
				+  @param[in]     in_q7       Pointer to the first of the 4 input elements; passed by value, so the caller's pointer is unchanged.
			
 
				+  @return        q31 value holding the 4 bytes copied from the input
			
 
				+ */
			
 
				+__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
			
 
				+{
			
 
				+  q31_t val;
			
 
				+  memcpy(&val, in_q7, 4); /* copy the bytes rather than dereference a cast pointer */
			
 
				+
			
 
				+  return (val);
			
 
				+}
			
 
				+
			
 
				 #if defined (ARM_MATH_DSP)
			
 
				 
			
 
				 /**
			
 
				  * @brief read and expand one q7 word into two q15 words
			
 
				  */
			
 
				 
			
 
				-__STATIC_FORCEINLINE void *read_and_pad(void *source, q31_t * out1, q31_t * out2)
			
 
				+__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t * out1, q31_t * out2)
			
 
				 {
			
 
				-        q31_t     inA = *__SIMD32(source)++;
			
 
				+        q31_t     inA = arm_nn_read_q7x4_ia(&source);
			
 
				         q31_t     inAbuf1 = __SXTB16(__ROR(inA, 8));
			
 
				         q31_t     inAbuf2 = __SXTB16(inA);
			
 
				 
			
@@ -192,9 +247,9 @@ __STATIC_FORCEINLINE void *read_and_pad(void *source, q31_t * out1, q31_t * out2
 
				  * @brief read and expand one q7 word into two q15 words with reordering
			
 
				  */
			
 
				 
			
 
				-__STATIC_FORCEINLINE q7_t *read_and_pad_reordered(q7_t *source, q31_t * out1, q31_t * out2)
			
 
				+__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t * out1, q31_t * out2)
			
 
				 {
			
 
				-        q31_t     inA = read_q7x4_ia(&source);
			
 
				+        q31_t     inA = arm_nn_read_q7x4_ia(&source);
			
 
				 #ifndef ARM_MATH_BIG_ENDIAN
			
 
				         *out2 = __SXTB16(__ROR(inA, 8));
			
 
				         *out1 = __SXTB16(inA);
			
@@ -209,9 +264,9 @@ __STATIC_FORCEINLINE q7_t *read_and_pad_reordered(q7_t *source, q31_t * out1, q3
 
				 /**
			
 
				  * @brief read and expand one q7 word into two q15 words with reordering and add an offset
			
 
				  */
			
 
				-__STATIC_FORCEINLINE q7_t *read_and_pad_reordered_with_offset(q7_t *source, q31_t * out1, q31_t * out2, q31_t offset)
			
 
				+__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered_with_offset(const q7_t *source, q31_t * out1, q31_t * out2, q31_t offset)
			
 
				 {
			
 
				-        q31_t     inA = read_q7x4_ia(&source);
			
 
				+        q31_t     inA = arm_nn_read_q7x4_ia(&source);
			
 
				 
			
 
				 #ifndef ARM_MATH_BIG_ENDIAN
			
 
				         *out2 = __SXTB16(__ROR(inA, 8));
			
@@ -226,7 +281,6 @@ __STATIC_FORCEINLINE q7_t *read_and_pad_reordered_with_offset(q7_t *source, q31_
 
				         return source;
			
 
				 }
			
 
				 
			
 
				-
			
 
				 #endif
			
 
				 
			
 
				 
			
@@ -369,61 +423,6 @@ __STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multip
 
				                                        RIGHT_SHIFT(shift));
			
 
				 }
			
 
				 
			
 
				-/**
			
 
				-  @brief         Read 2 q15 elements and post increment pointer.
			
 
				-  @param[in]     in_q15   Pointer to pointer that holds address of input.
			
 
				-  @return        q31 value
			
 
				- */
			
 
				-__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15)
			
 
				-{
			
 
				-  q31_t val;
			
 
				-
			
 
				-  memcpy(&val, *in_q15, 4);
			
 
				-  *in_q15 += 2;
			
 
				-
			
 
				-  return (val);
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				-  @brief         Read 4 q7 from q7 pointer and post increment pointer.
			
 
				-  @param[in]     in_q7       Pointer to pointer that holds address of input.
			
 
				-  @return        q31 value
			
 
				- */
			
 
				-__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
			
 
				-{
			
 
				-  q31_t val;
			
 
				-  memcpy(&val, *in_q7, 4);
			
 
				-  *in_q7 += 4;
			
 
				-
			
 
				-  return (val);
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				-  @brief         Read 2 q15 from q15 pointer.
			
 
				-  @param[in]     in_q15   pointer to address of input.
			
 
				-  @return        q31 value
			
 
				- */
			
 
				-__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
			
 
				-{
			
 
				-  q31_t val;
			
 
				-  memcpy(&val, in_q15, 4);
			
 
				-
			
 
				-  return (val);
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				-  @brief         Read 4 q7 values.
			
 
				-  @param[in]     in_q7       pointer to address of input.
			
 
				-  @return        q31 value
			
 
				- */
			
 
				-__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
			
 
				-{
			
 
				-  q31_t val;
			
 
				-  memcpy(&val, in_q7, 4);
			
 
				-
			
 
				-  return (val);
			
 
				-}
			
 
				-
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
+++ b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
@@ -66,7 +66,7 @@ void arm_relu_q15(q15_t *data, uint16_t size)
 
				 
			
 
				     while (i)
			
 
				     {
			
 
				-        in = arm_nn_read_q15x2_ia((const q15_t **)&input);
			
 
				+        in = read_q15x2_ia(&input);
			
 
				 
			
 
				         /* extract the first bit */
			
 
				         buf = __ROR(in & 0x80008000, 15);
			
--- a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
+++ b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q7.c
@@ -66,7 +66,7 @@ void arm_relu_q7(q7_t *data, uint16_t size)
 
				 
			
 
				     while (i)
			
 
				     {
			
 
				-        in = arm_nn_read_q7x4(input);
			
 
				+        in = read_q7x4_ia(&input);
			
 
				 
			
 
				         /* extract the first bit */
			
 
				         buf = __ROR(in & 0x80808080, 7);
			
--- a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
+++ b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_add_s8.c
@@ -92,8 +92,8 @@ arm_elementwise_add_s8(const int8_t *input_1_vect,
 
				   {
			
 
				     /* 4 outputs are calculated in one loop. The order of calculation is follows the order of output sign extension
			
 
				        intrinsic */
			
 
				-    input_1_vect = read_and_pad_reordered((q7_t *)input_1_vect, &b_1, &a_1);
			
 
				-    input_2_vect = read_and_pad_reordered((q7_t *)input_2_vect, &b_2, &a_2);
			
 
				+    input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
			
 
				+    input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
			
 
				 
			
 
				     a_1 = __SADD16(a_1, offset_1_packed);
			
 
				     b_1 = __SADD16(b_1, offset_1_packed);
			
--- a/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
+++ b/CMSIS/NN/Source/BasicMathFunctions/arm_elementwise_mul_s8.c
@@ -83,8 +83,8 @@ arm_elementwise_mul_s8(const int8_t *input_1_vect,
 
				   {
			
 
				     /* 4 outputs are calculated in one loop. The order of calculation is follows the order of output sign extension
			
 
				        intrinsic */
			
 
				-    input_1_vect = read_and_pad_reordered((q7_t *)input_1_vect, &b_1, &a_1);
			
 
				-    input_2_vect = read_and_pad_reordered((q7_t *)input_2_vect, &b_2, &a_2);
			
 
				+    input_1_vect = read_and_pad_reordered(input_1_vect, &b_1, &a_1);
			
 
				+    input_2_vect = read_and_pad_reordered(input_2_vect, &b_2, &a_2);
			
 
				 
			
 
				     a_1 = __SADD16(a_1, offset_1_packed);
			
 
				     b_1 = __SADD16(b_1, offset_1_packed);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_HWC_q7_fast_nonsquare.c
@@ -157,7 +157,7 @@ arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
 
				                 q31_t     inA1, inA2;
			
 
				                 q31_t     inB1, inB2;
			
 
				 
			
 
				-                pA = (const q7_t *)read_and_pad_reordered((void *)pA, &inA1, &inA2);
			
 
				+                pA = read_and_pad_reordered(pA, &inA1, &inA2);
			
 
				 
			
 
				                 inB1 = *__SIMD32(pB)++;
			
 
				                 sum = __SMLAD(inA1, inB1, sum);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c
@@ -127,18 +127,18 @@ arm_status arm_convolve_1x1_s8_fast(const q7_t *input,
 
				             q31_t sum = bias[i_ch_out];
			
 
				 
			
 
				             /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
			
 
				-            q15_t *ip_as_col = buffer_a;
			
 
				+            const q15_t *ip_as_col = buffer_a;
			
 
				             uint16_t col_count = (input_ch * DIM_KER_X * DIM_KER_Y) >> 2;
			
 
				 
			
 
				             while (col_count)
			
 
				             {
			
 
				                 q31_t ker_a1, ker_a2;
			
 
				                 q31_t in_b1, in_b2;
			
 
				-                ker_a = (const q7_t *)read_and_pad_reordered((void *)ker_a, &ker_a1, &ker_a2);
			
 
				+                ker_a = read_and_pad_reordered(ker_a, &ker_a1, &ker_a2);
			
 
				 
			
 
				-                in_b1 = arm_nn_read_q15x2_ia((const q15_t **)&ip_as_col);
			
 
				+                in_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
			
 
				                 sum = __SMLAD(ker_a1, in_b1, sum);
			
 
				-                in_b2 = arm_nn_read_q15x2_ia((const q15_t **)&ip_as_col);
			
 
				+                in_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
			
 
				                 sum = __SMLAD(ker_a2, in_b2, sum);
			
 
				 
			
 
				                 col_count--;
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_RGB.c
@@ -204,7 +204,7 @@ arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
 
				                 q31_t     inA1, inA2;
			
 
				                 q31_t     inB1, inB2;
			
 
				 
			
 
				-                pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2);
			
 
				+                pA = read_and_pad(pA, &inA1, &inA2);
			
 
				 
			
 
				                 inB1 = *__SIMD32(pB)++;
			
 
				                 sum = __SMLAD(inA1, inB1, sum);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic.c
@@ -161,7 +161,7 @@ arm_convolve_HWC_q7_basic(const q7_t * Im_in,
 
				                 q31_t     inA1, inA2;
			
 
				                 q31_t     inB1, inB2;
			
 
				 
			
 
				-                pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2);
			
 
				+                pA = read_and_pad(pA, &inA1, &inA2);
			
 
				 
			
 
				                 inB1 = *__SIMD32(pB)++;
			
 
				                 sum = __SMLAD(inA1, inB1, sum);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_basic_nonsquare.c
@@ -159,7 +159,7 @@ arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
 
				                 q31_t     inA1, inA2;
			
 
				                 q31_t     inB1, inB2;
			
 
				 
			
 
				-                pA = (q7_t *) read_and_pad((void *)pA, &inA1, &inA2);
			
 
				+                pA = read_and_pad(pA, &inA1, &inA2);
			
 
				 
			
 
				                 inB1 = *__SIMD32(pB)++;
			
 
				                 sum = __SMLAD(inA1, inB1, sum);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast.c
@@ -330,7 +330,7 @@ arm_convolve_HWC_q7_fast(const q7_t * Im_in,
 
				                 q31_t     inA1, inA2;
			
 
				                 q31_t     inB1, inB2;
			
 
				 
			
 
				-                pA = (q7_t *) read_and_pad_reordered((void *)pA, &inA1, &inA2);
			
 
				+                pA = read_and_pad_reordered(pA, &inA1, &inA2);
			
 
				 
			
 
				                 inB1 = *__SIMD32(pB)++;
			
 
				                 sum = __SMLAD(inA1, inB1, sum);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_HWC_q7_fast_nonsquare.c
@@ -302,7 +302,7 @@ arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
 
				                 q31_t     inA1, inA2;
			
 
				                 q31_t     inB1, inB2;
			
 
				 
			
 
				-                pA = (const q7_t *)read_and_pad_reordered((void *)pA, &inA1, &inA2);
			
 
				+                pA = read_and_pad_reordered(pA, &inA1, &inA2);
			
 
				 
			
 
				                 inB1 = *__SIMD32(pB)++;
			
 
				                 sum = __SMLAD(inA1, inB1, sum);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c
@@ -137,7 +137,7 @@ arm_status arm_convolve_s8(const q7_t *input,
 
				             q31_t sum = bias[i];
			
 
				 
			
 
				             /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
			
 
				-            q15_t *ip_as_col = buffer_a;
			
 
				+            const q15_t *ip_as_col = buffer_a;
			
 
				 
			
 
				             /* 4 multiply and accumulates are done in one loop. */
			
 
				             uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
			
@@ -147,11 +147,11 @@ arm_status arm_convolve_s8(const q7_t *input,
 
				                 q31_t ker_a1, ker_a2;
			
 
				                 q31_t ip_b1, ip_b2;
			
 
				 
			
 
				-                ker_a = (q7_t *)read_and_pad((void *)ker_a, &ker_a1, &ker_a2);
			
 
				+                ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
			
 
				 
			
 
				-                ip_b1 = arm_nn_read_q15x2_ia((const q15_t **)&ip_as_col);
			
 
				+                ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
			
 
				                 sum = __SMLAD(ker_a1, ip_b1, sum);
			
 
				-                ip_b2 = arm_nn_read_q15x2_ia((const q15_t **)&ip_as_col);
			
 
				+                ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
			
 
				                 sum = __SMLAD(ker_a2, ip_b2, sum);
			
 
				 
			
 
				                 col_count--;
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15.c
@@ -76,8 +76,8 @@ q7_t     *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
 
				             q31_t     inB1 = *__SIMD32(pB)++;
			
 
				             q31_t     inB2 = *__SIMD32(pB2)++;
			
 
				 
			
 
				-            pA = (q7_t *) read_and_pad((void *)pA, &inA11, &inA12);
			
 
				-            pA2 = (q7_t *) read_and_pad((void *)pA2, &inA21, &inA22);
			
 
				+            pA = read_and_pad(pA, &inA11, &inA12);
			
 
				+            pA2 = read_and_pad(pA2, &inA21, &inA22);
			
 
				 
			
 
				             sum = __SMLAD(inA11, inB1, sum);
			
 
				             sum2 = __SMLAD(inA11, inB2, sum2);
			
@@ -136,7 +136,7 @@ q7_t     *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
 
				             q31_t     inB1 = *__SIMD32(pB)++;
			
 
				             q31_t     inB2 = *__SIMD32(pB2)++;
			
 
				 
			
 
				-            pA = (q7_t *) read_and_pad((void *)pA, &inA11, &inA12);
			
 
				+            pA = read_and_pad(pA, &inA11, &inA12);
			
 
				 
			
 
				             sum = __SMLAD(inA11, inB1, sum);
			
 
				             sum2 = __SMLAD(inA11, inB2, sum2);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_q7_q15_reordered.c
@@ -76,8 +76,8 @@ q7_t     *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
 
				             q31_t     inB1 = *__SIMD32(pB)++;
			
 
				             q31_t     inB2 = *__SIMD32(pB2)++;
			
 
				 
			
 
				-            pA = (q7_t *) read_and_pad_reordered((void *)pA, &inA11, &inA12);
			
 
				-            pA2 = (q7_t *) read_and_pad_reordered((void *)pA2, &inA21, &inA22);
			
 
				+            pA = read_and_pad_reordered(pA, &inA11, &inA12);
			
 
				+            pA2 = read_and_pad_reordered(pA2, &inA21, &inA22);
			
 
				 
			
 
				             sum = __SMLAD(inA11, inB1, sum);
			
 
				             sum2 = __SMLAD(inA11, inB2, sum2);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16.c
@@ -79,8 +79,8 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
 
				             q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
			
 
				             q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
			
 
				 
			
 
				-            ip_a0 = (q7_t *)read_and_pad((void *)ip_a0, &a01, &a02);
			
 
				-            ip_a1 = (q7_t *)read_and_pad((void *)ip_a1, &a11, &a12);
			
 
				+            ip_a0 = read_and_pad(ip_a0, &a01, &a02);
			
 
				+            ip_a1 = read_and_pad(ip_a1, &a11, &a12);
			
 
				 
			
 
				             ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
			
 
				             ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
			
@@ -163,7 +163,7 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
 
				             q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
			
 
				             q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
			
 
				 
			
 
				-            ip_a0 = (q7_t *)read_and_pad((void *)ip_a0, &a01, &a02);
			
 
				+            ip_a0 = read_and_pad(ip_a0, &a01, &a02);
			
 
				 
			
 
				             ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
			
 
				             ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
			
--- a/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
+++ b/CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c
@@ -83,8 +83,8 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
 
				             q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
			
 
				             q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
			
 
				 
			
 
				-            ip_a0 = (q7_t *)read_and_pad_reordered((void *)ip_a0, &a01, &a02);
			
 
				-            ip_a1 = (q7_t *)read_and_pad_reordered((void *)ip_a1, &a11, &a12);
			
 
				+            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
			
 
				+            ip_a1 = read_and_pad_reordered(ip_a1, &a11, &a12);
			
 
				 
			
 
				             ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
			
 
				             ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
			
@@ -167,7 +167,7 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
 
				             q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
			
 
				             q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
			
 
				 
			
 
				-            ip_a0 = (q7_t *)read_and_pad_reordered((void *)ip_a0, &a01, &a02);
			
 
				+            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
			
 
				 
			
 
				             ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
			
 
				             ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
			
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15.c
@@ -100,8 +100,8 @@ arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
 
				         while (colCnt)
			
 
				         {
			
 
				             q31_t     inV, inM11, inM12, inM21, inM22;
			
 
				-            pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12);
			
 
				-            pB2 = (q7_t *) read_and_pad((void *)pB2, &inM21, &inM22);
			
 
				+            pB = read_and_pad(pB, &inM11, &inM12);
			
 
				+            pB2 = read_and_pad(pB2, &inM21, &inM22);
			
 
				 
			
 
				             inV = *__SIMD32(pA)++;
			
 
				 
			
@@ -148,7 +148,7 @@ arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
 
				         {
			
 
				             q31_t     inV1, inV2, inM11, inM12;
			
 
				 
			
 
				-            pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12);
			
 
				+            pB = read_and_pad(pB, &inM11, &inM12);
			
 
				 
			
 
				             inV1 = *__SIMD32(pA)++;
			
 
				             sum = __SMLAD(inV1, inM11, sum);
			
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_mat_q7_vec_q15_opt.c
@@ -282,7 +282,7 @@ arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
 
				         {
			
 
				             q31_t     inV1, inV2, inM11, inM12;
			
 
				 
			
 
				-            pB = (q7_t *) read_and_pad((void *)pB, &inM11, &inM12);
			
 
				+            pB = read_and_pad(pB, &inM11, &inM12);
			
 
				 
			
 
				             inV1 = *__SIMD32(pA)++;
			
 
				             sum = __SMLAD(inV1, inM11, sum);
			
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7.c
@@ -98,8 +98,8 @@ arm_fully_connected_q7(const q7_t * pV,
 
				         while (colCnt)
			
 
				         {
			
 
				             q31_t     inV, inM11, inM12, inM21, inM22;
			
 
				-            pB = (q7_t *) read_and_pad_reordered((void *)pB, &inM11, &inM12);
			
 
				-            pB2 = (q7_t *) read_and_pad_reordered((void *)pB2, &inM21, &inM22);
			
 
				+            pB = read_and_pad_reordered(pB, &inM11, &inM12);
			
 
				+            pB2 = read_and_pad_reordered(pB2, &inM21, &inM22);
			
 
				 
			
 
				             inV = *__SIMD32(pA)++;
			
 
				 
			
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_q7_opt.c
@@ -104,7 +104,7 @@
 
				    *  | a17 | a27 | a37 | a47 |
			
 
				    *
			
 
				    *  For the left-over rows, we do 1x1 computation, so the data remains
			
 
				-   *  as its original order. 
			
 
				+   *  as its original order.
			
 
				    *
			
 
				    *  So the stored weight matrix looks like this:
			
 
				    *
			
@@ -131,9 +131,9 @@ arm_fully_connected_q7_opt(const q7_t * pV,
 
				                            const uint16_t dim_vec,
			
 
				                            const uint16_t num_of_rows,
			
 
				                            const uint16_t bias_shift,
			
 
				-                           const uint16_t out_shift, 
			
 
				-                           const q7_t * bias, 
			
 
				-                           q7_t * pOut, 
			
 
				+                           const uint16_t out_shift,
			
 
				+                           const q7_t * bias,
			
 
				+                           q7_t * pOut,
			
 
				                            q15_t * vec_buffer)
			
 
				 {
			
 
				 
			
@@ -343,7 +343,7 @@ arm_fully_connected_q7_opt(const q7_t * pV,
 
				         {
			
 
				             q31_t     inV1, inV2, inM11, inM12;
			
 
				 
			
 
				-            pB = (q7_t *) read_and_pad_reordered((void *)pB, &inM11, &inM12);
			
 
				+            pB = read_and_pad_reordered(pB, &inM11, &inM12);
			
 
				 
			
 
				             inV1 = *__SIMD32(pA)++;
			
 
				             sum = __SMLAD(inV1, inM11, sum);
			
--- a/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
+++ b/CMSIS/NN/Source/FullyConnectedFunctions/arm_fully_connected_s8.c
@@ -107,8 +107,8 @@ arm_fully_connected_s8(const int8_t *input,
 
				             while (colCnt)
			
 
				             {
			
 
				                 q31_t inV, inM11, inM12, inM21, inM22;
			
 
				-                pB = read_and_pad_reordered_with_offset((q7_t *)pB, &inM11, &inM12, foffset);
			
 
				-                pB2 = read_and_pad_reordered_with_offset((q7_t *)pB2, &inM21, &inM22, foffset);
			
 
				+                pB = read_and_pad_reordered_with_offset(pB, &inM11, &inM12, foffset);
			
 
				+                pB2 = read_and_pad_reordered_with_offset(pB2, &inM21, &inM22, foffset);
			
 
				 
			
 
				                 inV = read_q15x2_ia(&pA);
			
 
				                 inV = __QADD16(inV, ioffset);
			
@@ -172,7 +172,7 @@ arm_fully_connected_s8(const int8_t *input,
 
				             {
			
 
				                 q31_t inV, inM11, inM12;
			
 
				 
			
 
				-                pB = read_and_pad_reordered_with_offset((q7_t *)pB, &inM11, &inM12, foffset);
			
 
				+                pB = read_and_pad_reordered_with_offset(pB, &inM11, &inM12, foffset);
			
 
				 
			
 
				                 inV = read_q15x2_ia(&pA);
			
 
				                 inV = __QADD16(inV, ioffset);
			
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
@@ -39,17 +39,17 @@
 
				  * @{
			
 
				  */
			
 
				 
			
 
				-void arm_nn_accumulate_q7_to_q15(q15_t * pDst, const q7_t * pSrc, uint32_t length)
			
 
				+void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
			
 
				 {
			
 
				-    q15_t    *pCnt = pDst;
			
 
				-    const q7_t     *pV = pSrc;
			
 
				-    q31_t     v1, v2, vo1, vo2;
			
 
				-    uint16_t  cnt = length >> 2;
			
 
				-    q31_t     in;
			
 
				+    q15_t *pCnt = pDst;
			
 
				+    const q7_t *pV = pSrc;
			
 
				+    q31_t v1, v2, vo1, vo2;
			
 
				+    int32_t cnt = length >> 2;
			
 
				+    q31_t in;
			
 
				 
			
 
				-    while (cnt > 0u)
			
 
				+    while (cnt > 0l)
			
 
				     {
			
 
				-        q31_t     value = *__SIMD32(pV)++;
			
 
				+        q31_t value = arm_nn_read_q7x4_ia(&pV);
			
 
				         v1 = __SXTB16(__ROR(value, 8));
			
 
				         v2 = __SXTB16(value);
			
 
				 #ifndef ARM_MATH_BIG_ENDIAN
			
@@ -68,12 +68,12 @@ void arm_nn_accumulate_q7_to_q15(q15_t * pDst, const q7_t * pSrc, uint32_t lengt
 
				         write_q15x2_ia(&pCnt, __QADD16(vo1, in));
			
 
				 
			
 
				         in = arm_nn_read_q15x2(pCnt);
			
 
				-        write_q15x2_ia(&pCnt,__QADD16(vo2, in));
			
 
				+        write_q15x2_ia(&pCnt, __QADD16(vo2, in));
			
 
				 
			
 
				         cnt--;
			
 
				     }
			
 
				     cnt = length & 0x3;
			
 
				-    while (cnt > 0u)
			
 
				+    while (cnt > 0l)
			
 
				     {
			
 
				         *pCnt++ += *pV++;
			
 
				         cnt--;
			
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_no_shift.c
@@ -44,7 +44,7 @@
 
				  * @param[in]       *pSrc points to the Q7 input vector
			
 
				  * @param[out]      *pDst points to the Q15 output vector
			
 
				  * @param[in]       blockSize length of the input vector
			
 
				- * 
			
 
				+ *
			
 
				  * @details
			
 
				  *
			
 
				  * This function does the q7 to q15 expansion with re-ordering
			
@@ -94,7 +94,7 @@ void arm_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t
 
				     {
			
 
				         /* C = (q15_t) A << 8 */
			
 
				         /* convert from q7 to q15 and then store the results in the destination buffer */
			
 
				-        in = *__SIMD32(pIn)++;
			
 
				+        in = arm_nn_read_q7x4_ia(&pIn);
			
 
				 
			
 
				         /* rotatate in by 8 and extend two q7_t values to q15_t values */
			
 
				         in1 = __SXTB16(__ROR(in, 8));