5 年之前 · a71c2b4022
--- a/CMSIS/NN/.clang-format
+++ b/CMSIS/NN/.clang-format
@@ -32,13 +32,14 @@ BraceWrapping:
 
				   AfterClass:      false
			
 
				   AfterControlStatement: true
			
 
				   AfterEnum:       true
			
 
				+  AfterExternBlock: false
			
 
				   AfterFunction:   true
			
 
				   AfterStruct:     true
			
 
				   AfterUnion:      true
			
 
				   BeforeElse:      true
			
 
				-  IndentBraces:    true
			
 
				+  IndentBraces:    false
			
 
				 BreakBeforeBinaryOperators: None
			
 
				-BreakBeforeBraces: Allman
			
 
				+BreakBeforeBraces: Custom
			
 
				 ColumnLimit:     120
			
 
				 DerivePointerAlignment: false
			
 
				 IndentWidth:     4
			
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -50,7 +50,8 @@ extern "C" {
 
				 /**
			
 
				  * @brief Union for SIMD access of q31/q15/q7 types
			
 
				  */
			
 
				-union arm_nnword {
			
 
				+union arm_nnword
			
 
				+{
			
 
				     q31_t word;
			
 
				     /**< q31 type */
			
 
				     q15_t half_words[2];
			
@@ -68,7 +69,8 @@ struct arm_nn_double
 
				     int32_t high;
			
 
				 };
			
 
				 
			
 
				-union arm_nn_long_long {
			
 
				+union arm_nn_long_long
			
 
				+{
			
 
				     int64_t long_long;
			
 
				     struct arm_nn_double word;
			
 
				 };
			
@@ -144,8 +146,8 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size,
 
				  * @return none.
			
 
				  *
			
 
				  * @details  This function does the q7 to q15 expansion with re-ordering of bytes. Re-ordering is a consequence of
			
 
				- *           the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its original
			
 
				- *           order.
			
 
				+ *           the sign extension intrinsic(DSP extension). The tail (i.e., last (N % 4) elements) retains its
			
 
				+ *           original order.
			
 
				  *
			
 
				  */
			
 
				 void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset);
			
@@ -219,7 +221,7 @@ q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
 
				  *              2. NULL if implementation is not available.
			
 
				  *
			
 
				  * @details   Supported framework: TensorFlow Lite
			
 
				-*/
			
 
				+ */
			
 
				 q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
			
 
				                          const q7_t *input_col,
			
 
				                          const uint16_t output_ch,
			
@@ -251,7 +253,7 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
 
				  *          *output += row_base[i] * col_base[i]
			
 
				  *          sum_col += col_base[i]
			
 
				  *
			
 
				-*/
			
 
				+ */
			
 
				 arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
			
 
				                                      const int8_t *row_base,
			
 
				                                      const int8_t *col_base,
			
@@ -279,7 +281,7 @@ arm_status arm_nn_mat_mul_core_1x_s8(int32_t row_elements,
 
				  *                ..
			
 
				  *          output[3] += row_base[i + (row_elements * 3)] * col_base[i]
			
 
				  *          sum_col += col_base[i]
			
 
				-*/
			
 
				+ */
			
 
				 arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
			
 
				                                      const int32_t offset,
			
 
				                                      const int8_t *row_base,
			
@@ -288,34 +290,34 @@ arm_status arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
 
				                                      int32_t *const output);
			
 
				 
			
 
				 /**
			
 
				-* @brief General Matrix-multiplication function with per-channel requantization.
			
 
				-*        This function assumes:
			
 
				-*        - LHS input matrix NOT transposed (nt)
			
 
				-*        - RHS input matrix transposed (t)
			
 
				-*
			
 
				-*  @note This operation also performs the broadcast bias addition before the requantization
			
 
				-*
			
 
				-* @param[in]  lhs                Pointer to the LHS input matrix
			
 
				-* @param[in]  rhs                Pointer to the RHS input matrix
			
 
				-* @param[in]  bias               Pointer to the bias vector. The length of this vector is equal to the number of output
			
 
				-*                                columns (or RHS input rows)
			
 
				-* @param[out] dst                Pointer to the output matrix with "m" rows and "n" columns
			
 
				-* @param[in]  dst_multipliers    Pointer to the multipliers vector needed for the per-channel requantization.
			
 
				-*                                The length of this vector is equal to the number of output columns (or RHS input rows)
			
 
				-* @param[in]  dst_shifts         Pointer to the shifts vector needed for the per-channel requantization. The length of
			
 
				-*                                this vector is equal to
			
 
				-*                                the number of output columns (or RHS input rows)
			
 
				-* @param[in]  lhs_rows           Number of LHS input rows
			
 
				-* @param[in]  rhs_rows           Number of RHS input rows
			
 
				-* @param[in]  rhs_cols           Number of LHS/RHS input columns
			
 
				-* @param[in]  lhs_offset         Offset to be applied to the LHS input value
			
 
				-* @param[in]  dst_offset         Offset to be applied the output result
			
 
				-* @param[in]  activation_min     Minimum value to clamp down the output. Range : int8
			
 
				-* @param[in]  activation_max     Maximum value to clamp up the output. Range : int8
			
 
				-*
			
 
				-* @return     The function returns <code>ARM_MATH_SUCCESS</code>
			
 
				-*
			
 
				-*/
			
 
				+ * @brief General Matrix-multiplication function with per-channel requantization.
			
 
				+ *        This function assumes:
			
 
				+ *        - LHS input matrix NOT transposed (nt)
			
 
				+ *        - RHS input matrix transposed (t)
			
 
				+ *
			
 
				+ *  @note This operation also performs the broadcast bias addition before the requantization
			
 
				+ *
			
 
				+ * @param[in]  lhs                Pointer to the LHS input matrix
			
 
				+ * @param[in]  rhs                Pointer to the RHS input matrix
			
 
				+ * @param[in]  bias               Pointer to the bias vector. The length of this vector is equal to the number of
			
 
				+ *                                output columns (or RHS input rows)
			
 
				+ * @param[out] dst                Pointer to the output matrix with "m" rows and "n" columns
			
 
				+ * @param[in]  dst_multipliers    Pointer to the multipliers vector needed for the per-channel requantization.
			
 
				+ *                                The length of this vector is equal to the number of output columns (or RHS input
			
 
				+ *                                rows)
			
 
				+ * @param[in]  dst_shifts         Pointer to the shifts vector needed for the per-channel requantization. The length
			
 
				+ *                                of this vector is equal to the number of output columns (or RHS input rows)
			
 
				+ * @param[in]  lhs_rows           Number of LHS input rows
			
 
				+ * @param[in]  rhs_rows           Number of RHS input rows
			
 
				+ * @param[in]  rhs_cols           Number of LHS/RHS input columns
			
 
				+ * @param[in]  lhs_offset         Offset to be applied to the LHS input value
			
 
				+ * @param[in]  dst_offset         Offset to be applied the output result
			
 
				+ * @param[in]  activation_min     Minimum value to clamp down the output. Range : int8
			
 
				+ * @param[in]  activation_max     Maximum value to clamp up the output. Range : int8
			
 
				+ *
			
 
				+ * @return     The function returns <code>ARM_MATH_SUCCESS</code>
			
 
				+ *
			
 
				+ */
			
 
				 arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
			
 
				                                    const q7_t *rhs,
			
 
				                                    const q31_t *bias,
			
@@ -387,8 +389,8 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
 
				  *                  - Updated output pointer if an implementaiton is available
			
 
				  *                  - NULL if no implementation is available.
			
 
				  *
			
 
				- * @note           If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
			
 
				- *                 for the following.
			
 
				+ * @note           If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read
			
 
				+ *                 out for the following.
			
 
				  *                  - Output shift
			
 
				  *                  - Output multiplier
			
 
				  *                  - Output bias
			
@@ -428,8 +430,8 @@ q7_t *arm_nn_depthwise_conv_nt_t_padded_s8(const q7_t *lhs,
 
				  *                  - Updated output pointer if an implementaiton is available
			
 
				  *                  - NULL if no implementation is available.
			
 
				  *
			
 
				- * @note           If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read out
			
 
				- *                 for the following.
			
 
				+ * @note           If number of channels is not a multiple of 4, upto 3 elements outside the boundary will be read
			
 
				+ *                 out for the following.
			
 
				  *                  - Output shift
			
 
				  *                  - Output multiplier
			
 
				  *                  - Output bias
			
@@ -519,8 +521,8 @@ __STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t bloc
 
				                    "   vstrb.8                 q0, [%[in]], 16            \n"
			
 
				                    "   letp                    lr, 2b                     \n"
			
 
				                    "1:                                                    \n"
			
 
				-                   : [in] "+r"(dst)
			
 
				-                   : [cnt] "r"(block_size), [set_val] "r"(val)
			
 
				+                   : [ in ] "+r"(dst)
			
 
				+                   : [ cnt ] "r"(block_size), [ set_val ] "r"(val)
			
 
				                    : "q0", "memory", "r14");
			
 
				 #else
			
 
				     memset(dst, val, block_size);
			
@@ -779,8 +781,8 @@ __STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__REST
 
				                    "   vstrb.8                 q0, [%[out]], 16           \n"
			
 
				                    "   letp                    lr, 2b                     \n"
			
 
				                    "1:                                                    \n"
			
 
				-                   : [in] "+r"(src), [out] "+r"(dst)
			
 
				-                   : [cnt] "r"(block_size)
			
 
				+                   : [ in ] "+r"(src), [ out ] "+r"(dst)
			
 
				+                   : [ cnt ] "r"(block_size)
			
 
				                    : "q0", "memory", "r14");
			
 
				 #else
			
 
				     memcpy(dst, src, block_size);
			
@@ -917,6 +919,20 @@ __STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val
 
				     return MUL_POW2(x, 1);
			
 
				 }
			
 
				 
			
 
				+/**
			
 
				+  @brief         Write 2 q15 elements and post increment pointer.
			
 
				+  @param[in,out] dest_q15  Pointer to the destination pointer; advanced by two q15 elements after the write.
			
 
				+  @param[in]     src_q31   Input value to be written.
			
 
				+  @return        none
			
 
				+ */
			
 
				+__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia(q15_t **dest_q15, q31_t src_q31)
			
 
				+{
			
 
				+    q31_t val = src_q31;
			
 
				+
			
 
				+    memcpy(*dest_q15, &val, 4);
			
 
				+    *dest_q15 += 2;
			
 
				+}
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
+++ b/CMSIS/NN/Source/ActivationFunctions/arm_relu_q15.c
@@ -74,7 +74,7 @@ void arm_relu_q15(q15_t *data, uint16_t size)
 
				         /* if MSB=1, mask will be 0xFF, 0x0 otherwise */
			
 
				         mask = __QSUB16(0x00000000, buf);
			
 
				 
			
 
				-        write_q15x2_ia(&output, in & (~mask));
			
 
				+        arm_nn_write_q15x2_ia(&output, in & (~mask));
			
 
				         i--;
			
 
				     }
			
 
				 
			
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
@@ -28,7 +28,6 @@
 
				  *
			
 
				  * -------------------------------------------------------------------- */
			
 
				 
			
 
				-#include "arm_math_memory.h"
			
 
				 #include "arm_nnfunctions.h"
			
 
				 #include "arm_nnsupportfunctions.h"
			
 
				 
			
@@ -63,10 +62,10 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
 
				 #endif
			
 
				 
			
 
				         in = arm_nn_read_q15x2(pCnt);
			
 
				-        write_q15x2_ia(&pCnt, __QADD16(vo1, in));
			
 
				+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo1, in));
			
 
				 
			
 
				         in = arm_nn_read_q15x2(pCnt);
			
 
				-        write_q15x2_ia(&pCnt, __QADD16(vo2, in));
			
 
				+        arm_nn_write_q15x2_ia(&pCnt, __QADD16(vo2, in));
			
 
				 
			
 
				         cnt--;
			
 
				     }
			
@@ -80,4 +79,4 @@ void arm_nn_accumulate_q7_to_q15(q15_t *pDst, const q7_t *pSrc, uint32_t length)
 
				 
			
 
				 /**
			
 
				  * @} end of NNBasicMath group
			
 
				- */
			
 
				+ */
			
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_no_shift.c
@@ -86,8 +86,8 @@ void arm_q7_to_q15_no_shift(const q7_t *pSrc, q15_t *pDst, uint32_t blockSize)
 
				         out1 = (int32_t)__PKHTB(in1, in2, 16);
			
 
				         out2 = (int32_t)__PKHBT(in2, in1, 16);
			
 
				 #endif
			
 
				-        write_q15x2_ia(&pDst, out1);
			
 
				-        write_q15x2_ia(&pDst, out2);
			
 
				+        arm_nn_write_q15x2_ia(&pDst, out1);
			
 
				+        arm_nn_write_q15x2_ia(&pDst, out2);
			
 
				 
			
 
				         /* Decrement the loop counter */
			
 
				         blkCnt--;
			
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_reordered_with_offset.c
@@ -71,8 +71,8 @@ void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t b
 
				         out_q15x2_1 = __SXTAB16(offset_q15x2, __ROR((uint32_t)in_q7x4, 8));
			
 
				         out_q15x2_2 = __SXTAB16(offset_q15x2, in_q7x4);
			
 
				 
			
 
				-        write_q15x2_ia(&dst, out_q15x2_2);
			
 
				-        write_q15x2_ia(&dst, out_q15x2_1);
			
 
				+        arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
			
 
				+        arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
			
 
				 
			
 
				         block_cnt--;
			
 
				     }
			
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c
@@ -86,8 +86,8 @@ void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size,
 
				         out_q15x2_2 = __PKHTB(in_q15x2_1, in_q15x2_2, 16);
			
 
				         out_q15x2_1 = __PKHBT(in_q15x2_2, in_q15x2_1, 16);
			
 
				 
			
 
				-        write_q15x2_ia(&dst, out_q15x2_1);
			
 
				-        write_q15x2_ia(&dst, out_q15x2_2);
			
 
				+        arm_nn_write_q15x2_ia(&dst, out_q15x2_1);
			
 
				+        arm_nn_write_q15x2_ia(&dst, out_q15x2_2);
			
 
				 
			
 
				         block_cnt--;
			
 
				     }