Browse Source

CMSIS-NN: remove output channel constraint on 1x1 conv

The constraint that the output channel count be a multiple of 2 is removed
from the arm_convolve_1x1_s8_fast() API.

Change-Id: I3b0d9c7c966ab3aadf165367e703338744ae037e
Felix Johnny 6 years ago
parent
commit
084ca21287

+ 3 - 4
CMSIS/NN/Include/arm_nnfunctions.h

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nnfunctions.h
  * Description:  Public header file for CMSIS NN Library
  *
- * $Date:        25 November 2019
- * $Revision:    V.1.0.0
+ * $Date:        January 20, 2020
+ * $Revision:    V.1.0.1
  *
  * Target Processor:  Cortex-M cores
  * -------------------------------------------------------------------- */
@@ -518,7 +518,6 @@ extern    "C"
    *   - Supported framework : TensorFlow Lite Micro
    *   - The following constraints on the arguments apply
    *      -# input_ch is a multiple of 4
-   *      -# output_ch is a multiple of 2
    *      -# padding equals 0
    *      -# Stride equals 1
    *      -# kernel dimension is 1x1 (Not provided in the argument list)

+ 3 - 3
CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_1x1_s8_fast.c

@@ -21,8 +21,8 @@
  * Title:        arm_convolve_1x1_s8_fast.c
  * Description:  Fast q7 version of 1x1 convolution (non-square shape)
  *
- * $Date:        January 15, 2020
- * $Revision:    V.1.0.1
+ * $Date:        January 20, 2020
+ * $Revision:    V.1.0.2
  *
  * Target Processor:  Cortex-M cores
  *
@@ -72,7 +72,7 @@ arm_status arm_convolve_1x1_s8_fast(const q7_t *input,
                                     const uint16_t output_y,
                                     q15_t *buffer_a)
 {
-    if (input_ch % 4 != 0 || output_ch % 2 != 0 ||
+    if (input_ch % 4 != 0 ||
         pad_x != 0 || pad_y != 0 ||
         stride_x != 1 || stride_y != 1)
     {

+ 48 - 3
CMSIS/NN/Source/ConvolutionFunctions/arm_nn_mat_mult_kernel_s8_s16_reordered.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nn_mat_mult_kernel_s8_s16_reordered.c
  * Description:  Matrix-multiplication function for convolution with reordered columns
  *
- * $Date:        August 2019
- * $Revision:    V.1.0.0
+ * $Date:        January 20, 2020
+ * $Revision:    V.1.0.1
  *
  * Target Processor:  Cortex-M cores
  * -------------------------------------------------------------------- */
@@ -134,6 +134,51 @@ q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
         ip_a0 += num_col_a;
         row_count--;
     }
+
+    if (output_ch & 1)
+    {
+        /* setup pointers for B */
+        const q15_t *ip_b0 = input_b;
+        const q15_t *ip_b1 = ip_b0 + num_col_a;
+
+        /* Init both accumulators with the same bias value (*bias) for the leftover odd output channel */
+        q31_t ch_0_out_0 = *bias;
+        q31_t ch_0_out_1 = ch_0_out_0;
+
+        int32_t col_count = num_col_a / 4;
+        while (col_count)
+        {
+            q31_t a01, a02;
+            q31_t b0 = arm_nn_read_q15x2_ia(&ip_b0);
+            q31_t b1 = arm_nn_read_q15x2_ia(&ip_b1);
+
+            ip_a0 = read_and_pad_reordered(ip_a0, &a01, &a02);
+
+            ch_0_out_0 = __SMLAD(a01, b0, ch_0_out_0);
+            ch_0_out_1 = __SMLAD(a01, b1, ch_0_out_1);
+
+            b0 = arm_nn_read_q15x2_ia(&ip_b0);
+            b1 = arm_nn_read_q15x2_ia(&ip_b1);
+
+            ch_0_out_0 = __SMLAD(a02, b0, ch_0_out_0);
+            ch_0_out_1 = __SMLAD(a02, b1, ch_0_out_1);
+
+            col_count--;
+        } /* while over col_count */
+
+        ch_0_out_0 = arm_nn_requantize(ch_0_out_0, *out_mult, *out_shift);
+        ch_0_out_0 += out_offset;
+        ch_0_out_0 = MAX(ch_0_out_0, activation_min);
+        ch_0_out_0 = MIN(ch_0_out_0, activation_max);
+        *out_0++ = (q7_t)ch_0_out_0;
+
+        ch_0_out_1 = arm_nn_requantize(ch_0_out_1, *out_mult, *out_shift);
+        ch_0_out_1 += out_offset;
+        ch_0_out_1 = MAX(ch_0_out_1, activation_min);
+        ch_0_out_1 = MIN(ch_0_out_1, activation_max);
+        *out_1++ = (q7_t)ch_0_out_1;
+    }
+
     out_0 += output_ch;
 
     /* return the new output pointer with offset */