Просмотр исходного кода

CMSIS-NN : Update weight offset as zero in FC

Fully Connected implementation is updated to reflect
zero weight offset as per int8 spec.

MVE version and non-DSP extension versions are updated

Change-Id: Ia7d96d596169abd016d1e2d2a0eb5491b6dd28ed
Felix Thomasmathibalan 5 лет назад
Родитель
Сommit
e2c03d033f

+ 4 - 4
Include/arm_nn_types.h

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -22,8 +22,8 @@
  * Description:  Public header file to contain the CMSIS-NN structs for the
  *               TensorFlowLite micro compliant functions
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.0.0
+ * $Date:        19. March 2021
+ * $Revision:    V.2.0.0
  *
  * Target Processor:  Cortex-M cores
  * -------------------------------------------------------------------- */
@@ -112,7 +112,7 @@ typedef struct
 typedef struct
 {
     int32_t input_offset;  /**< Zero value for the input tensor */
-    int32_t filter_offset; /**< Zero value for the filter tensor */
+    int32_t filter_offset; /**< Zero value for the filter tensor. Not used */
     int32_t output_offset; /**< Zero value for the output tensor */
     cmsis_nn_activation activation;
 } cmsis_nn_fc_params;

Разница между файлами не показана из-за своего большого размера
+ 864 - 477
Include/arm_nnfunctions.h


+ 4 - 5
Include/arm_nnsupportfunctions.h

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        09. October 2020
- * $Revision:    V.4.5.5
+ * $Date:        19. March 2021
+ * $Revision:    V.5.0.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -341,8 +341,7 @@ arm_status arm_nn_mat_mult_nt_t_s8(const q7_t *lhs,
  * @param[out]     dst             Output vector
  * @param[in]      lhs_offset      Offset to be added to the input values of the left-hand side vector.
  *                                 Range: -127 to 128
- * @param[in]      rhs_offset      Offset to be added to the input values of the right-hand side matrix.
- *                                 Range: -127 to 128
+ * @param[in]      rhs_offset      Not used
  * @param[in]      dst_offset      Offset to be added to the output values. Range: -127 to 128
  * @param[in]      dst_multiplier  Output multiplier
  * @param[in]      dst_shift       Output shift

+ 6 - 4
Source/FullyConnectedFunctions/arm_fully_connected_s8.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_fully_connected_s8
  * Description:  Fully connected function compatible with TF Lite.
  *
- * $Date:        09. October 2020
- * $Revision:    V.2.0.1
+ * $Date:        19. March 2021
+ * $Revision:    V.3.0.0
  *
  * Target Processor:  Cortex-M and Cortex-A cores
  *
@@ -61,6 +61,8 @@ arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
 {
     (void)bias_dims;
     (void)ctx;
+    (void)fc_params->filter_offset;
+
     int32_t batch_cnt = input_dims->n;
 
     while (batch_cnt)
@@ -70,7 +72,7 @@ arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
                                  bias,
                                  output,
                                  fc_params->input_offset,
-                                 fc_params->filter_offset,
+                                 0,
                                  fc_params->output_offset,
                                  quant_params->multiplier,
                                  quant_params->shift,

+ 35 - 38
Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_vec_mat_mult_t_s8
  * Description:  s8 vector by matrix (transposed) multiplication
  *
- * $Date:        09. October 2020
- * $Revision:    V.1.5.1
+ * $Date:        19. March 2021
+ * $Revision:    V.2.0.0
  *
  * Target Processor:  Cortex-M
  *
@@ -59,25 +59,10 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
                                     const int32_t activation_min,
                                     const int32_t activation_max)
 {
+    (void)rhs_offset;
 #if defined(ARM_MATH_MVEI)
     int32_t row_loop_cnt = rhs_rows / 3;
 
-    int32_t lhs_sum = 0;
-    {
-        const int32_t col_loop_cnt = (rhs_cols + 15) / 16;
-        uint32_t col_cnt = (uint32_t)rhs_cols;
-        const int8_t *lhs_vec = lhs;
-        for (int i = 0; i < col_loop_cnt; i++)
-        {
-            mve_pred16_t p = vctp8q(col_cnt);
-            col_cnt -= 16;
-
-            const int8x16_t input = vldrbq_z_s8(lhs_vec, p);
-            lhs_sum = vaddvaq_p_s8(lhs_sum, input, p);
-            lhs_vec += 16;
-        }
-    }
-
     for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
     {
         int32_t acc_0 = 0;
@@ -132,10 +117,7 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
             bias += 3;
         }
         const int32x4_t rhs_sum = {rhs_sum_0, rhs_sum_1, rhs_sum_2, 0};
-
         acc += vdupq_n_s32(lhs_offset) * rhs_sum;
-        acc += vdupq_n_s32(rhs_offset * lhs_sum);
-        acc += vdupq_n_s32(lhs_offset * rhs_offset * rhs_cols);
 
         acc = arm_requantize_mve(acc, dst_multiplier, dst_shift);
         acc = vaddq_s32(acc, vdupq_n_s32(dst_offset));
@@ -175,8 +157,7 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
             acc_0 += *bias;
             bias++;
         }
-        const int32_t offsets =
-            (rhs_sum_0 * lhs_offset) + (lhs_sum * rhs_offset) + (lhs_offset * rhs_offset * rhs_cols);
+        const int32_t offsets = rhs_sum_0 * lhs_offset;
         acc_0 += offsets;
         acc_0 = arm_nn_requantize(acc_0, dst_multiplier, dst_shift);
         acc_0 += dst_offset;
@@ -410,53 +391,68 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
 
 #else
 
-    for (int32_t rhs_rows_idx = 0; rhs_rows_idx <= (rhs_rows - 2); rhs_rows_idx += 2)
+    int32_t row_loop_cnt = rhs_rows / 3;
+
+    for (int i_row_loop_cnt = 0; i_row_loop_cnt < row_loop_cnt; i_row_loop_cnt++)
     {
-        const q7_t *lhs_ptr = &lhs[0];
-        const q7_t *rhs_ptr = &rhs[0];
+        const q7_t *lhs_ptr = lhs;
+        const q7_t *rhs_ptr_0 = &rhs[0];
+        const q7_t *rhs_ptr_1 = &rhs[rhs_cols];
+        const q7_t *rhs_ptr_2 = &rhs[rhs_cols * 2];
 
         q31_t res00 = 0;
         q31_t res01 = 0;
+        q31_t res02 = 0;
         if (bias)
         {
             res00 = *bias++;
             res01 = *bias++;
+            res02 = *bias++;
         }
-
         for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
         {
-            q31_t rhs_value0 = rhs_ptr[0] + rhs_offset;
-            q31_t rhs_value1 = rhs_ptr[rhs_cols] + rhs_offset;
-            q31_t lhs_value = lhs_ptr[0] + lhs_offset;
+            const q31_t rhs_value0 = (int8_t)*rhs_ptr_0;
+            const q31_t rhs_value1 = (int8_t)*rhs_ptr_1;
+            const q31_t rhs_value2 = (int8_t)*rhs_ptr_2;
+            const q31_t lhs_value = (int8_t)*lhs_ptr + lhs_offset;
 
             res00 += lhs_value * rhs_value0;
             res01 += lhs_value * rhs_value1;
+            res02 += lhs_value * rhs_value2;
 
-            ++rhs_ptr;
+            ++rhs_ptr_0;
+            ++rhs_ptr_1;
+            ++rhs_ptr_2;
             ++lhs_ptr;
         }
-
         // Quantize down
         res00 = arm_nn_requantize(res00, dst_multiplier, dst_shift);
         res01 = arm_nn_requantize(res01, dst_multiplier, dst_shift);
+        res02 = arm_nn_requantize(res02, dst_multiplier, dst_shift);
 
         // Add offset
         res00 += dst_offset;
         res01 += dst_offset;
+        res02 += dst_offset;
 
         // Clamp the result
         res00 = MAX(res00, activation_min);
         res00 = MIN(res00, activation_max);
         res01 = MAX(res01, activation_min);
         res01 = MIN(res01, activation_max);
+        res02 = MAX(res02, activation_min);
+        res02 = MIN(res02, activation_max);
 
         *dst++ = (q7_t)res00;
         *dst++ = (q7_t)res01;
+        *dst++ = (q7_t)res02;
 
-        rhs += 2 * rhs_cols;
+        rhs += 3 * rhs_cols;
     }
 
-    if (rhs_rows % 2)
+    const int loop_cnt = rhs_rows % 3;
+
+    for (int i_loop_cnt = 0; i_loop_cnt < loop_cnt; i_loop_cnt++)
     {
         const q7_t *lhs_ptr = &lhs[0];
         const q7_t *rhs_ptr = &rhs[0];
@@ -469,8 +465,8 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
 
         for (int32_t rhs_cols_idx = 0; rhs_cols_idx < rhs_cols; ++rhs_cols_idx)
         {
-            q31_t rhs_value0 = rhs_ptr[0] + rhs_offset;
-            q31_t lhs_value = lhs_ptr[0] + lhs_offset;
+            q31_t rhs_value0 = (int8_t)rhs_ptr[0] + rhs_offset;
+            q31_t lhs_value = (int8_t)lhs_ptr[0] + lhs_offset;
 
             res00 += lhs_value * rhs_value0;
 
@@ -488,7 +484,8 @@ arm_status arm_nn_vec_mat_mult_t_s8(const q7_t *lhs,
         res00 = MAX(res00, activation_min);
         res00 = MIN(res00, activation_max);
 
-        *dst = (q7_t)res00;
+        *dst++ = (q7_t)res00;
+        rhs += rhs_cols;
     }
 #endif
 

Некоторые файлы не были показаны из-за большого количества измененных файлов