5 years ago · d7b45647c8
--- a/CMSIS/NN/Include/arm_nn_tables.h
+++ b/CMSIS/NN/Include/arm_nn_tables.h
@@ -42,15 +42,15 @@ extern const q7_t sigmoidTable_q7[256];
 
				 extern const q7_t tanhTable_q7[256];
			
 
				 extern const q15_t tanhTable_q15[256];
			
 
				 
			
 
				-  /**
			
 
				-   * @brief 2-way tables for various activation functions
			
 
				-   *
			
 
				-   * 2-way table, H table for value larger than 1/4
			
 
				-   * L table for value smaller than 1/4, H table for remaining
			
 
				-   * We have this only for the q15_t version. It does not make
			
 
				-   * sense to have it for q7_t type
			
 
				-   */
			
 
				+/**
			
 
				+ * @brief 2-way tables for various activation functions
			
 
				+ *
			
 
				+ * 2-way table, H table for value larger than 1/4
			
 
				+ * L table for value smaller than 1/4, H table for remaining
			
 
				+ * We have this only for the q15_t version. It does not make
			
 
				+ * sense to have it for q7_t type
			
 
				+ */
			
 
				 extern const q15_t sigmoidHTable_q15[192];
			
 
				 extern const q15_t sigmoidLTable_q15[128];
			
 
				 
			
 
				-#endif                          /*  ARM_NN_TABLES_H */
			
 
				+#endif /*  ARM_NN_TABLES_H */
			
--- a/CMSIS/NN/Include/arm_nn_types.h
+++ b/CMSIS/NN/Include/arm_nn_types.h
@@ -28,7 +28,6 @@
 
				  * Target Processor:  Cortex-M cores
			
 
				  * -------------------------------------------------------------------- */
			
 
				 
			
 
				-
			
 
				 #ifndef _ARM_NN_TYPES_H
			
 
				 #define _ARM_NN_TYPES_H
			
 
				 
			
@@ -37,21 +36,22 @@
 
				 /** CMSIS-NN object to contain the width and height of a tile */
			
 
				 typedef struct
			
 
				 {
			
 
				-    int32_t w;  /**< Width */
			
 
				-    int32_t h;  /**< Height */
			
 
				+    int32_t w; /**< Width */
			
 
				+    int32_t h; /**< Height */
			
 
				 } cmsis_nn_tile;
			
 
				 
			
 
				 /** CMSIS-NN object used for the function context. */
			
 
				 typedef struct
			
 
				 {
			
 
				-    void *buf;      /**< Pointer to a buffer needed for the optimization */
			
 
				-    int32_t size;   /**< Buffer size */
			
 
				+    void *buf;    /**< Pointer to a buffer needed for the optimization */
			
 
				+    int32_t size; /**< Buffer size */
			
 
				 } cmsis_nn_context;
			
 
				 
			
 
				 /** CMSIS-NN object to contain the dimensions of the tensors */
			
 
				 typedef struct
			
 
				 {
			
 
				-    int32_t n; /**< Generic dimension to contain either the batch size or output channels. Please refer to the function documentation for more information */
			
 
				+    int32_t n; /**< Generic dimension to contain either the batch size or output channels.
			
 
				+                     Please refer to the function documentation for more information */
			
 
				     int32_t h; /**< Height */
			
 
				     int32_t w; /**< Width */
			
 
				     int32_t c; /**< Input channels */
			
@@ -81,39 +81,39 @@ typedef struct
 
				 /** CMSIS-NN object for the convolution layer parameters */
			
 
				 typedef struct
			
 
				 {
			
 
				-    int32_t             input_offset;   /**< Zero value for the input tensor */
			
 
				-    int32_t             output_offset;  /**< Zero value for the output tensor */
			
 
				-    cmsis_nn_tile       stride;
			
 
				-    cmsis_nn_tile       padding;
			
 
				-    cmsis_nn_tile       dilation;
			
 
				+    int32_t input_offset;  /**< Zero value for the input tensor */
			
 
				+    int32_t output_offset; /**< Zero value for the output tensor */
			
 
				+    cmsis_nn_tile stride;
			
 
				+    cmsis_nn_tile padding;
			
 
				+    cmsis_nn_tile dilation;
			
 
				     cmsis_nn_activation activation;
			
 
				 } cmsis_nn_conv_params;
			
 
				 
			
 
				 /** CMSIS-NN object for Depthwise convolution layer parameters */
			
 
				 typedef struct
			
 
				 {
			
 
				-    int32_t             input_offset;   /**< Zero value for the input tensor */
			
 
				-    int32_t             output_offset;  /**< Zero value for the output tensor */
			
 
				-    int32_t             ch_mult;        /**< Channel Multiplier. ch_mult * in_ch = out_ch */
			
 
				-    cmsis_nn_tile       stride;
			
 
				-    cmsis_nn_tile       padding;
			
 
				-    cmsis_nn_tile       dilation;
			
 
				+    int32_t input_offset;  /**< Zero value for the input tensor */
			
 
				+    int32_t output_offset; /**< Zero value for the output tensor */
			
 
				+    int32_t ch_mult;       /**< Channel Multiplier. ch_mult * in_ch = out_ch */
			
 
				+    cmsis_nn_tile stride;
			
 
				+    cmsis_nn_tile padding;
			
 
				+    cmsis_nn_tile dilation;
			
 
				     cmsis_nn_activation activation;
			
 
				 } cmsis_nn_dw_conv_params;
			
 
				 /** CMSIS-NN object for pooling layer parameters */
			
 
				 typedef struct
			
 
				 {
			
 
				-    cmsis_nn_tile       stride;
			
 
				-    cmsis_nn_tile       padding;
			
 
				+    cmsis_nn_tile stride;
			
 
				+    cmsis_nn_tile padding;
			
 
				     cmsis_nn_activation activation;
			
 
				 } cmsis_nn_pool_params;
			
 
				 
			
 
				 /** CMSIS-NN object for Fully Connected layer parameters */
			
 
				 typedef struct
			
 
				 {
			
 
				-    int32_t             input_offset;   /**< Zero value for the input tensor */
			
 
				-    int32_t             filter_offset;   /**< Zero value for the filter tensor */
			
 
				-    int32_t             output_offset;  /**< Zero value for the output tensor */
			
 
				+    int32_t input_offset;  /**< Zero value for the input tensor */
			
 
				+    int32_t filter_offset; /**< Zero value for the filter tensor */
			
 
				+    int32_t output_offset; /**< Zero value for the output tensor */
			
 
				     cmsis_nn_activation activation;
			
 
				 } cmsis_nn_fc_params;
			
 
				 
			
@@ -121,12 +121,10 @@ typedef struct
 
				 typedef struct
			
 
				 {
			
 
				     int32_t rank;
			
 
				-    int32_t input_offset; /**< Zero value for the input tensor */
			
 
				+    int32_t input_offset;  /**< Zero value for the input tensor */
			
 
				     int32_t output_offset; /**< Zero value for the output tensor */
			
 
				     cmsis_nn_activation input_activation;
			
 
				     cmsis_nn_activation output_activation;
			
 
				 } cmsis_nn_svdf_params;
			
 
				 
			
 
				 #endif // _ARM_NN_TYPES_H
			
 
				-
			
 
				-
			
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -30,35 +30,33 @@
 
				 #ifndef _ARM_NNSUPPORTFUNCTIONS_H_
			
 
				 #define _ARM_NNSUPPORTFUNCTIONS_H_
			
 
				 
			
 
				-#include "arm_math_types.h"
			
 
				 #include "arm_common_tables.h"
			
 
				+#include "arm_math_types.h"
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				-extern    "C"
			
 
				-{
			
 
				+extern "C" {
			
 
				 #endif
			
 
				 
			
 
				-#define LEFT_SHIFT(_shift)  (_shift > 0 ? _shift : 0)
			
 
				+#define LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0)
			
 
				 #define RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift)
			
 
				-#define MASK_IF_ZERO(x)     (x) == 0 ? ~0 : 0
			
 
				+#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0
			
 
				 #define MASK_IF_NON_ZERO(x) (x) != 0 ? ~0 : 0
			
 
				 #define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b))
			
 
				 
			
 
				-#define MAX(A,B) ((A) > (B) ? (A) : (B))
			
 
				-#define MIN(A,B) ((A) < (B) ? (A) : (B))
			
 
				+#define MAX(A, B) ((A) > (B) ? (A) : (B))
			
 
				+#define MIN(A, B) ((A) < (B) ? (A) : (B))
			
 
				 #define CLAMP(x, h, l) MAX(MIN((x), (h)), (l))
			
 
				 
			
 
				 /**
			
 
				  * @brief Union for SIMD access of q31/q15/q7 types
			
 
				  */
			
 
				-union arm_nnword
			
 
				-{
			
 
				-    q31_t     word;
			
 
				-               /**< q31 type */
			
 
				-    q15_t     half_words[2];
			
 
				-               /**< q15 type */
			
 
				-    q7_t      bytes[4];
			
 
				-               /**< q7 type */
			
 
				+union arm_nnword {
			
 
				+    q31_t word;
			
 
				+    /**< q31 type */
			
 
				+    q15_t half_words[2];
			
 
				+    /**< q15 type */
			
 
				+    q7_t bytes[4];
			
 
				+    /**< q7 type */
			
 
				 };
			
 
				 
			
 
				 /**
			
@@ -66,14 +64,13 @@ union arm_nnword
 
				  */
			
 
				 struct arm_nn_double
			
 
				 {
			
 
				-  uint32_t low;
			
 
				-  int32_t high;
			
 
				+    uint32_t low;
			
 
				+    int32_t high;
			
 
				 };
			
 
				 
			
 
				-union arm_nn_long_long
			
 
				-{
			
 
				-  int64_t long_long;
			
 
				-  struct arm_nn_double word;
			
 
				+union arm_nn_long_long {
			
 
				+    int64_t long_long;
			
 
				+    struct arm_nn_double word;
			
 
				 };
			
 
				 
			
 
				 /**
			
@@ -118,7 +115,7 @@ void arm_nn_add_q7(const q7_t *input, q31_t *output, uint32_t block_size);
 
				  * @return none.
			
 
				  *
			
 
				  */
			
 
				-void arm_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, uint32_t blockSize);
			
 
				+void arm_q7_to_q15_reordered_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize);
			
 
				 
			
 
				 /**
			
 
				  * @brief Converts the elements from a q7 vector to a q15 vector with an added offset
			
@@ -300,11 +297,13 @@ arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
 
				 *
			
 
				 * @param[in]  lhs                Pointer to the LHS input matrix
			
 
				 * @param[in]  rhs                Pointer to the RHS input matrix
			
 
				-* @param[in]  bias               Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows)
			
 
				+* @param[in]  bias               Pointer to the bias vector. The length of this vector is equal to the number of output
			
 
				+*                                columns (or RHS input rows)
			
 
				 * @param[out] dst                Pointer to the output matrix with "m" rows and "n" columns
			
 
				-* @param[in]  dst_multipliers    Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to
			
 
				-*                                the number of output columns (or RHS input rows)
			
 
				-* @param[in]  dst_shifts         Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to
			
 
				+* @param[in]  dst_multipliers    Pointer to the multipliers vector needed for the per-channel requantization.
			
 
				+*                                The length of this vector is equal to the number of output columns (or RHS input rows)
			
 
				+* @param[in]  dst_shifts         Pointer to the shifts vector needed for the per-channel requantization.
			
 
				+*                                The length of this vector is equal to
			
 
				 *                                the number of output columns (or RHS input rows)
			
 
				 * @param[in]  lhs_rows           Number of LHS input rows
			
 
				 * @param[in]  rhs_rows           Number of RHS input rows
			
@@ -338,8 +337,10 @@ arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
 
				  * @param[in]      rhs             Input right-hand side matrix (transposed)
			
 
				  * @param[in]      bias            Input bias
			
 
				  * @param[out]     dst             Output vector
			
 
				- * @param[in]      lhs_offset      Offset to be added to the input values of the left-hand side vector. Range: -127 to 128
			
 
				- * @param[in]      rhs_offset      Offset to be added to the input values of the right-hand side matrix. Range: -127 to 128
			
 
				+ * @param[in]      lhs_offset      Offset to be added to the input values of the left-hand side vector.
			
 
				+ *                                 Range: -127 to 128
			
 
				+ * @param[in]      rhs_offset      Offset to be added to the input values of the right-hand side matrix.
			
 
				+ *                                 Range: -127 to 128
			
 
				  * @param[in]      dst_offset      Offset to be added to the output values. Range: -127 to 128
			
 
				  * @param[in]      dst_multiplier  Output multiplier
			
 
				  * @param[in]      dst_shift       Output shift
			
@@ -454,12 +455,12 @@ q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs,
 
				  */
			
 
				 __STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15)
			
 
				 {
			
 
				-  q31_t val;
			
 
				+    q31_t val;
			
 
				 
			
 
				-  memcpy(&val, *in_q15, 4);
			
 
				-  *in_q15 += 2;
			
 
				+    memcpy(&val, *in_q15, 4);
			
 
				+    *in_q15 += 2;
			
 
				 
			
 
				-  return (val);
			
 
				+    return (val);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -469,11 +470,11 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia(const q15_t **in_q15)
 
				  */
			
 
				 __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
			
 
				 {
			
 
				-  q31_t val;
			
 
				-  memcpy(&val, *in_q7, 4);
			
 
				-  *in_q7 += 4;
			
 
				+    q31_t val;
			
 
				+    memcpy(&val, *in_q7, 4);
			
 
				+    *in_q7 += 4;
			
 
				 
			
 
				-  return (val);
			
 
				+    return (val);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -483,10 +484,10 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
 
				  */
			
 
				 __STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
			
 
				 {
			
 
				-  q31_t val;
			
 
				-  memcpy(&val, in_q15, 4);
			
 
				+    q31_t val;
			
 
				+    memcpy(&val, in_q15, 4);
			
 
				 
			
 
				-  return (val);
			
 
				+    return (val);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -496,10 +497,10 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
 
				  */
			
 
				 __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
			
 
				 {
			
 
				-  q31_t val;
			
 
				-  memcpy(&val, in_q7, 4);
			
 
				+    q31_t val;
			
 
				+    memcpy(&val, in_q7, 4);
			
 
				 
			
 
				-  return (val);
			
 
				+    return (val);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -509,91 +510,87 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4(const q7_t *in_q7)
 
				  * @param[in]       block_size  Number of bytes to copy.
			
 
				  *
			
 
				  */
			
 
				-__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst,
			
 
				-                                        const q7_t val,
			
 
				-                                        uint32_t block_size)
			
 
				+__STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t block_size)
			
 
				 {
			
 
				 #if defined(ARM_MATH_MVEI)
			
 
				-     __asm volatile (
			
 
				-        "   vdup.8                  q0, %[set_val]             \n"
			
 
				-        "   wlstp.8                 lr, %[cnt], 1f             \n"
			
 
				-        "2:                                                    \n"
			
 
				-        "   vstrb.8                 q0, [%[in]], 16            \n"
			
 
				-        "   letp                    lr, 2b                     \n"
			
 
				-        "1:                                                    \n"
			
 
				-        :[in] "+r"(dst)
			
 
				-        :[cnt] "r"(block_size), [set_val] "r"(val)
			
 
				-        :"q0", "memory", "r14");
			
 
				+    __asm volatile("   vdup.8                  q0, %[set_val]             \n"
			
 
				+                   "   wlstp.8                 lr, %[cnt], 1f             \n"
			
 
				+                   "2:                                                    \n"
			
 
				+                   "   vstrb.8                 q0, [%[in]], 16            \n"
			
 
				+                   "   letp                    lr, 2b                     \n"
			
 
				+                   "1:                                                    \n"
			
 
				+                   : [in] "+r"(dst)
			
 
				+                   : [cnt] "r"(block_size), [set_val] "r"(val)
			
 
				+                   : "q0", "memory", "r14");
			
 
				 #else
			
 
				     memset(dst, val, block_size);
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-#if defined (ARM_MATH_DSP)
			
 
				+#if defined(ARM_MATH_DSP)
			
 
				 
			
 
				 /**
			
 
				  * @brief read and expand one q7 word into two q15 words
			
 
				  */
			
 
				 
			
 
				-__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t * out1, q31_t * out2)
			
 
				+__STATIC_FORCEINLINE const q7_t *read_and_pad(const q7_t *source, q31_t *out1, q31_t *out2)
			
 
				 {
			
 
				-        q31_t     inA = arm_nn_read_q7x4_ia(&source);
			
 
				-        q31_t     inAbuf1 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				-        q31_t     inAbuf2 = __SXTB16(inA);
			
 
				+    q31_t inA = arm_nn_read_q7x4_ia(&source);
			
 
				+    q31_t inAbuf1 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				+    q31_t inAbuf2 = __SXTB16(inA);
			
 
				 
			
 
				 #ifndef ARM_MATH_BIG_ENDIAN
			
 
				-  *out2 = (int32_t) (__PKHTB (inAbuf1, inAbuf2, 16));
			
 
				-  *out1 = (int32_t) (__PKHBT (inAbuf2, inAbuf1, 16));
			
 
				+    *out2 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16));
			
 
				+    *out1 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16));
			
 
				 #else
			
 
				-  *out1 = (int32_t) (__PKHTB(inAbuf1, inAbuf2, 16));
			
 
				-  *out2 = (int32_t) (__PKHBT(inAbuf2, inAbuf1, 16));
			
 
				+    *out1 = (int32_t)(__PKHTB(inAbuf1, inAbuf2, 16));
			
 
				+    *out2 = (int32_t)(__PKHBT(inAbuf2, inAbuf1, 16));
			
 
				 #endif
			
 
				 
			
 
				-        return source;
			
 
				+    return source;
			
 
				 }
			
 
				 
			
 
				 /**
			
 
				  * @brief read and expand one q7 word into two q15 words with reordering
			
 
				  */
			
 
				 
			
 
				-__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t * out1, q31_t * out2)
			
 
				+__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered(const q7_t *source, q31_t *out1, q31_t *out2)
			
 
				 {
			
 
				-        q31_t     inA = arm_nn_read_q7x4_ia(&source);
			
 
				+    q31_t inA = arm_nn_read_q7x4_ia(&source);
			
 
				 #ifndef ARM_MATH_BIG_ENDIAN
			
 
				-        *out2 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				-        *out1 = __SXTB16(inA);
			
 
				+    *out2 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				+    *out1 = __SXTB16(inA);
			
 
				 #else
			
 
				-        *out1 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				-        *out2 = __SXTB16(inA);
			
 
				+    *out1 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				+    *out2 = __SXTB16(inA);
			
 
				 #endif
			
 
				 
			
 
				-        return source;
			
 
				+    return source;
			
 
				 }
			
 
				 
			
 
				 /**
			
 
				  * @brief read and expand one q7 word into two q15 words with reordering and add an offset
			
 
				  */
			
 
				-__STATIC_FORCEINLINE const q7_t *read_and_pad_reordered_with_offset(const q7_t *source, q31_t * out1, q31_t * out2, q31_t offset)
			
 
				+__STATIC_FORCEINLINE const q7_t *
			
 
				+read_and_pad_reordered_with_offset(const q7_t *source, q31_t *out1, q31_t *out2, q31_t offset)
			
 
				 {
			
 
				-        q31_t     inA = arm_nn_read_q7x4_ia(&source);
			
 
				+    q31_t inA = arm_nn_read_q7x4_ia(&source);
			
 
				 
			
 
				 #ifndef ARM_MATH_BIG_ENDIAN
			
 
				-        *out2 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				-        *out1 = __SXTB16(inA);
			
 
				+    *out2 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				+    *out1 = __SXTB16(inA);
			
 
				 #else
			
 
				-        *out1 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				-        *out2 = __SXTB16(inA);
			
 
				+    *out1 = __SXTB16(__ROR((uint32_t)inA, 8));
			
 
				+    *out2 = __SXTB16(inA);
			
 
				 #endif
			
 
				-        *out1 = __QADD16(*out1,offset);
			
 
				-        *out2 = __QADD16(*out2,offset);
			
 
				+    *out1 = __QADD16(*out1, offset);
			
 
				+    *out2 = __QADD16(*out2, offset);
			
 
				 
			
 
				-        return source;
			
 
				+    return source;
			
 
				 }
			
 
				 
			
 
				 #endif
			
 
				 
			
 
				-
			
 
				-
			
 
				 /**
			
 
				  * @defgroup NNBasicMath Basic Math Functions for Neural Network Computation
			
 
				  *
			
@@ -616,12 +613,7 @@ __STATIC_FORCEINLINE const q7_t *read_and_pad_reordered_with_offset(const q7_t *
 
				  * Results outside of the allowable q15 range [0x8000 0x7FFF] will be saturated.
			
 
				  */
			
 
				 
			
 
				-void arm_nn_mult_q15(
			
 
				-  q15_t * pSrcA,
			
 
				-  q15_t * pSrcB,
			
 
				-  q15_t * pDst,
			
 
				-  const uint16_t out_shift,
			
 
				-  uint32_t blockSize);
			
 
				+void arm_nn_mult_q15(q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize);
			
 
				 
			
 
				 /**
			
 
				  * @brief           q7 vector multiplication with variable output shifts
			
@@ -638,34 +630,27 @@ void arm_nn_mult_q15(
 
				  * Results outside of the allowable q7 range [0x80 0x7F] will be saturated.
			
 
				  */
			
 
				 
			
 
				-void arm_nn_mult_q7(
			
 
				-  q7_t * pSrcA,
			
 
				-  q7_t * pSrcB,
			
 
				-  q7_t * pDst,
			
 
				-  const uint16_t out_shift,
			
 
				-  uint32_t blockSize);
			
 
				+void arm_nn_mult_q7(q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize);
			
 
				 
			
 
				 /**
			
 
				  * @brief macro for adding rounding offset
			
 
				  */
			
 
				 #ifndef ARM_NN_TRUNCATE
			
 
				-    #define NN_ROUND(out_shift) ( (0x1u << out_shift) >> 1 )
			
 
				+#define NN_ROUND(out_shift) ((0x1u << out_shift) >> 1)
			
 
				 #else
			
 
				-    #define NN_ROUND(out_shift) 0
			
 
				+#define NN_ROUND(out_shift) 0
			
 
				 #endif
			
 
				 
			
 
				 // Macros for shortening quantization functions' names and avoid long lines
			
 
				-#define MUL_SAT(a, b)  arm_nn_doubling_high_mult((a), (b))
			
 
				+#define MUL_SAT(a, b) arm_nn_doubling_high_mult((a), (b))
			
 
				 #define MUL_SAT_MVE(a, b) arm_doubling_high_mult_mve_32x4((a), (b))
			
 
				 #define MUL_POW2(a, b) arm_nn_mult_by_power_of_two((a), (b))
			
 
				 
			
 
				-
			
 
				 #define DIV_POW2(a, b) arm_nn_divide_by_power_of_two((a), (b))
			
 
				 #define DIV_POW2_MVE(a, b) arm_divide_by_power_of_two_mve((a), (b))
			
 
				 
			
 
				-
			
 
				-#define EXP_ON_NEG(x)  arm_nn_exp_on_negative_values((x))
			
 
				-#define ONE_OVER1(x)   arm_nn_one_over_one_plus_x_for_x_in_0_1((x))
			
 
				+#define EXP_ON_NEG(x) arm_nn_exp_on_negative_values((x))
			
 
				+#define ONE_OVER1(x) arm_nn_one_over_one_plus_x_for_x_in_0_1((x))
			
 
				 
			
 
				 /**
			
 
				  * @brief           Saturating doubling high multiply. Result matches
			
@@ -690,7 +675,7 @@ __STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult(const q31_t m1, const q31_t
 
				 
			
 
				     // Utilize all of the upper 32 bits. This is the doubling step
			
 
				     // as well.
			
 
				-    result = (int32_t) (mult / (1ll << 31));
			
 
				+    result = (int32_t)(mult / (1ll << 31));
			
 
				 
			
 
				     if ((m1 == m2) && (m1 == (int32_t)Q31_MIN))
			
 
				     {
			
@@ -774,9 +759,8 @@ __STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two(const q31_t dividend, c
 
				  */
			
 
				 __STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multiplier, const q31_t shift)
			
 
				 {
			
 
				-  return arm_nn_divide_by_power_of_two(
			
 
				-      arm_nn_doubling_high_mult_no_sat(val * (1 << LEFT_SHIFT(shift)), multiplier),
			
 
				-      RIGHT_SHIFT(shift));
			
 
				+    return arm_nn_divide_by_power_of_two(arm_nn_doubling_high_mult_no_sat(val * (1 << LEFT_SHIFT(shift)), multiplier),
			
 
				+                                         RIGHT_SHIFT(shift));
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -786,22 +770,18 @@ __STATIC_FORCEINLINE q31_t arm_nn_requantize(const q31_t val, const q31_t multip
 
				  * @param[in]       block_size  Number of bytes to copy.
			
 
				  *
			
 
				  */
			
 
				-__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst,
			
 
				-                                        const q7_t *__RESTRICT src,
			
 
				-                                        uint32_t block_size)
			
 
				+__STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size)
			
 
				 {
			
 
				 #if defined(ARM_MATH_MVEI)
			
 
				-     __asm volatile (
			
 
				-        "   wlstp.8                 lr, %[cnt], 1f             \n"
			
 
				-        "2:                                                    \n"
			
 
				-        "   vldrb.8                 q0, [%[in]], 16            \n"
			
 
				-        "   vstrb.8                 q0, [%[out]], 16           \n"
			
 
				-        "   letp                    lr, 2b                     \n"
			
 
				-        "1:                                                    \n"
			
 
				-        :[in] "+r"(src)
			
 
				-        ,[out] "+r"(dst)
			
 
				-        :[cnt] "r"(block_size)
			
 
				-        :"q0", "memory", "r14");
			
 
				+    __asm volatile("   wlstp.8                 lr, %[cnt], 1f             \n"
			
 
				+                   "2:                                                    \n"
			
 
				+                   "   vldrb.8                 q0, [%[in]], 16            \n"
			
 
				+                   "   vstrb.8                 q0, [%[out]], 16           \n"
			
 
				+                   "   letp                    lr, 2b                     \n"
			
 
				+                   "1:                                                    \n"
			
 
				+                   : [in] "+r"(src), [out] "+r"(dst)
			
 
				+                   : [cnt] "r"(block_size)
			
 
				+                   : "q0", "memory", "r14");
			
 
				 #else
			
 
				     memcpy(dst, src, block_size);
			
 
				 #endif
			
@@ -830,10 +810,10 @@ __STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve(const int32x4_t m1, co
 
				  */
			
 
				 __STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t dividend, const q31_t exponent)
			
 
				 {
			
 
				-  const int32x4_t shift = vdupq_n_s32(-exponent);
			
 
				-  const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
			
 
				-  const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
			
 
				-  return vrshlq_s32(fixed_up_dividend, shift);
			
 
				+    const int32x4_t shift = vdupq_n_s32(-exponent);
			
 
				+    const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
			
 
				+    const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
			
 
				+    return vrshlq_s32(fixed_up_dividend, shift);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -847,33 +827,35 @@ __STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve(const int32x4_t di
 
				  */
			
 
				 __STATIC_FORCEINLINE int32x4_t arm_requantize_mve(const int32x4_t val, const q31_t multiplier, const q31_t shift)
			
 
				 {
			
 
				-  return arm_divide_by_power_of_two_mve(
			
 
				-          arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier),
			
 
				-          RIGHT_SHIFT(shift));
			
 
				+    return arm_divide_by_power_of_two_mve(
			
 
				+        arm_doubling_high_mult_mve(vshlq_s32(val, vdupq_n_s32(LEFT_SHIFT(shift))), multiplier), RIGHT_SHIFT(shift));
			
 
				 }
			
 
				 
			
 
				 __STATIC_FORCEINLINE int32x4_t arm_doubling_high_mult_mve_32x4(const int32x4_t m1, const int32x4_t m2)
			
 
				 {
			
 
				-  return vqrdmulhq_s32(m1, m2);
			
 
				+    return vqrdmulhq_s32(m1, m2);
			
 
				 }
			
 
				 
			
 
				 __STATIC_FORCEINLINE int32x4_t arm_divide_by_power_of_two_mve_32x4(const int32x4_t dividend, const int32x4_t exponent)
			
 
				 {
			
 
				-  const int32x4_t shift = -exponent;
			
 
				-  const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
			
 
				-  const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
			
 
				-  return vrshlq_s32(fixed_up_dividend, shift);
			
 
				+    const int32x4_t shift = -exponent;
			
 
				+    const int32x4_t fixup = vshrq_n_s32(vandq_s32(dividend, shift), 31);
			
 
				+    const int32x4_t fixed_up_dividend = vqaddq_s32(dividend, fixup);
			
 
				+    return vrshlq_s32(fixed_up_dividend, shift);
			
 
				 }
			
 
				 
			
 
				-__STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, const int32x4_t multiplier, const int32x4_t shift)
			
 
				+__STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val,
			
 
				+                                                       const int32x4_t multiplier,
			
 
				+                                                       const int32x4_t shift)
			
 
				 {
			
 
				-  const int32x4_t zz = vdupq_n_s32(0);
			
 
				-  const mve_pred16_t p = vcmpgtq_n_s32(shift, 0);
			
 
				+    const int32x4_t zz = vdupq_n_s32(0);
			
 
				+    const mve_pred16_t p = vcmpgtq_n_s32(shift, 0);
			
 
				 
			
 
				-  const int32x4_t left_shift = vpselq_s32(shift, zz, p);
			
 
				-  const int32x4_t right_shift = -vpselq_s32(zz, shift, p);
			
 
				+    const int32x4_t left_shift = vpselq_s32(shift, zz, p);
			
 
				+    const int32x4_t right_shift = -vpselq_s32(zz, shift, p);
			
 
				 
			
 
				-  return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier), right_shift);
			
 
				+    return arm_divide_by_power_of_two_mve_32x4(arm_doubling_high_mult_mve_32x4(vshlq_s32(val, left_shift), multiplier),
			
 
				+                                               right_shift);
			
 
				 }
			
 
				 #endif
			
 
				 
			
@@ -881,22 +863,22 @@ __STATIC_FORCEINLINE int32x4_t arm_requantize_mve_32x4(const int32x4_t val, cons
 
				 
			
 
				 __STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values(int32_t val)
			
 
				 {
			
 
				-    int32_t mask  = 0;
			
 
				+    int32_t mask = 0;
			
 
				     int32_t shift = 24;
			
 
				 
			
 
				     const int32_t val_mod_minus_quarter = (val & ((1 << shift) - 1)) - (1 << shift);
			
 
				-    const int32_t remainder             = val_mod_minus_quarter - val;
			
 
				-    const int32_t x                     = (val_mod_minus_quarter << 5) + (1 << 28);
			
 
				-    const int32_t x2                    = MUL_SAT(x, x);
			
 
				+    const int32_t remainder = val_mod_minus_quarter - val;
			
 
				+    const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28);
			
 
				+    const int32_t x2 = MUL_SAT(x, x);
			
 
				 
			
 
				-    int32_t result = 1895147668 + MUL_SAT(1895147668, x +
			
 
				-        DIV_POW2(MUL_SAT(DIV_POW2(MUL_SAT(x2, x2), 2) + MUL_SAT(x2, x), 715827883) + x2, 1));
			
 
				+    int32_t result = 1895147668 +
			
 
				+        MUL_SAT(1895147668, x + DIV_POW2(MUL_SAT(DIV_POW2(MUL_SAT(x2, x2), 2) + MUL_SAT(x2, x), 715827883) + x2, 1));
			
 
				 
			
 
				-#define SELECT_IF_NON_ZERO(x)                                     \
			
 
				-{                                                                 \
			
 
				-    mask   = MASK_IF_NON_ZERO(remainder & (1 << shift++));        \
			
 
				-    result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result); \
			
 
				-}
			
 
				+#define SELECT_IF_NON_ZERO(x)                                                                                          \
			
 
				+    {                                                                                                                  \
			
 
				+        mask = MASK_IF_NON_ZERO(remainder & (1 << shift++));                                                           \
			
 
				+        result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result);                                                  \
			
 
				+    }
			
 
				 
			
 
				     SELECT_IF_NON_ZERO(1672461947)
			
 
				     SELECT_IF_NON_ZERO(1302514674)