
CMSIS-NN : MVE support for s8 depthwise conv

Felix Johnny 6 years ago
Parent
Commit
11bd24871a

+ 31 - 0
CMSIS/NN/Include/arm_nnsupportfunctions.h

@@ -167,6 +167,37 @@ void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t b
  */
 void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);
 
+/**
+ * @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
+ * @param[in]    row             pointer to row operand (weights)
+ * @param[in]    col             pointer to im2col buffer; always consists of 2 columns
+ * @param[in]    num_ch          number of channels
+ * @param[in]    out_shift       pointer to per-output-channel requantization shift parameter
+ * @param[in]    out_mult        pointer to per-output-channel requantization multiplier parameter
+ * @param[in]    out_offset      output tensor offset
+ * @param[in]    activation_min  minimum value to clamp the output to. Range: int8
+ * @param[in]    activation_max  maximum value to clamp the output to. Range: int8
+ * @param[in]    kernel_size     number of elements in one column
+ * @param[in]    output_bias     per-output-channel bias. Range: int32
+ * @param[out]   out             pointer to output
+ * @return     The function returns one of the following:
+ *              1. The incremented output pointer, after a successful operation, or
+ *              2. NULL if the implementation is not available.
+ *
+ * @details     Supported framework: TensorFlow Lite micro.
+ */
+q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
+                                    const q15_t *col,
+                                    const uint16_t num_ch,
+                                    const int32_t *out_shift,
+                                    const int32_t *out_mult,
+                                    const int32_t out_offset,
+                                    const int32_t activation_min,
+                                    const int32_t activation_max,
+                                    const uint16_t kernel_size,
+                                    const int32_t *const output_bias,
+                                    q7_t *out);
+
 /**
   @brief         Read 2 q15 elements and post increment pointer.
   @param[in]     in_q15   Pointer to pointer that holds address of input.

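For context, a minimal caller sketch for the new support function (not part of this commit; the helper name and surrounding code are hypothetical). It exercises the contract documented above: the function either returns the incremented output pointer or NULL when no implementation is available.

#include "arm_nnsupportfunctions.h"

/* Hypothetical caller sketch (not part of this commit). Processes one pair of
 * im2col columns with the new core function. im2col_buf must hold
 * 2 * num_ch * kernel_size q15_t values, i.e. two complete columns. */
static q7_t *run_two_columns(const q7_t *weights, const q15_t *im2col_buf,
                             uint16_t num_ch, const int32_t *out_shift,
                             const int32_t *out_mult, int32_t out_offset,
                             int32_t act_min, int32_t act_max,
                             uint16_t kernel_size, const int32_t *bias,
                             q7_t *out)
{
    q7_t *next_out = arm_nn_depthwise_conv_s8_core(weights, im2col_buf, num_ch,
                                                   out_shift, out_mult, out_offset,
                                                   act_min, act_max, kernel_size,
                                                   bias, out);
    if (next_out == NULL)
    {
        /* No MVE implementation available: fall back to another kernel,
         * e.g. the generic arm_depthwise_conv_s8(). */
    }
    return next_out;
}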
+ 109 - 15
CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c

@@ -42,7 +42,7 @@
  * @{
  */
 
-  /*
+/*
    * Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel
    *
    *  Refer prototype header file for details.
@@ -75,20 +75,114 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
                                      q15_t *buffer_a)
 {
 
-
     /* Check input constraints input_ch == output_ch */
     if (input_ch != output_ch)
     {
         return ARM_MATH_SIZE_MISMATCH;
     }
+#ifdef ARM_MATH_MVEI
+    (void)dilation_x;
+    (void)dilation_y;
+
+    /* Generate two columns from the input tensor */
+    q15_t *two_column_buf = buffer_a;
+    q7_t *out = output;
+
+    /* This part implements the im2col function */
+    for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+    {
+        const int32_t base_idx_y = i_out_y * stride_y - pad_y;
+        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+        {
+            const int32_t base_idx_x = (i_out_x * stride_x) - pad_x;
+            for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
+            {
+                for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
+                {
+                    if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
+                    {
+                        /* Fill with zeros for out-of-bound padding */
+                        memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
+                    }
+                    else
+                    {
+                        /* Copying the pixel data to column */
+                        arm_q7_to_q15_with_offset(input + (i_ker_y * input_x + i_ker_x) * input_ch, two_column_buf, input_ch, input_offset);
+                    }
+                    two_column_buf += input_ch;
+                }
+            }
+
+            /* Computation is performed once every 2 columns are filled */
+            if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
+            {
+                two_column_buf = buffer_a;
+                out = arm_nn_depthwise_conv_s8_core(kernel,
+                                                    buffer_a,
+                                                    output_ch,
+                                                    output_shift,
+                                                    output_mult,
+                                                    output_offset,
+                                                    output_activation_min,
+                                                    output_activation_max,
+                                                    kernel_x * kernel_y,
+                                                    bias,
+                                                    out);
+            }
+        }
+    }
+
+    /* left-over pixels */
+    if (two_column_buf != buffer_a)
+    {
+        int32_t ch_count = (output_ch + 3) / 4;
+        const int32_t *out_bias = bias;
 
-#if defined(ARM_MATH_LOOPUNROLL) && defined(ARM_MATH_DSP)
+        int32_t idx = 0;
+        int32_t out_ch = output_ch;
+        while (ch_count > 0)
+        {
+            int32_t ker_count = kernel_x * kernel_y;
+
+            const int32_t offset = idx * 4;
+            const int8_t *row = kernel + offset;
+            int16_t *col = buffer_a + offset;
+            mve_pred16_t p = vctp32q(out_ch);
+
+            int32x4_t res = vldrwq_z_s32(out_bias, p);
+            out_bias += 4;
+
+            while (ker_count > 0)
+            {
+                const int32x4_t ip = vldrhq_z_s32(col, p);
+                const int32x4_t ker = vldrbq_z_s32(row, p);
+                col += output_ch;
+                row += output_ch;
+                res += vmulq_s32(ip, ker);
+                ker_count--;
+            }
+
+            int32x4_t mult = vldrwq_z_s32(output_mult, p);
+            int32x4_t shift = vldrwq_z_s32(output_shift, p);
+            output_mult += 4;
+            output_shift += 4;
+            res = arm_mve_requantize_32x4(res, mult, shift);
+
+            res = vaddq_n_s32(res, output_offset);
+            res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
+            res = vminq_s32(res, vdupq_n_s32(output_activation_max));
+            vstrbq_p_s32(out, res, p);
+            out += 4;
+            idx++;
+            out_ch -= 4;
+            ch_count--;
+        }
+    }
+
+#elif defined(ARM_MATH_LOOPUNROLL) && defined(ARM_MATH_DSP)
     /* Run the following code in cores using DSP extension */
     (void)dilation_x;
     (void)dilation_y;
-
-    int16_t i_out_y, i_out_x;
-    int16_t i_ker_y, i_ker_x;
     q15_t *const col_buffer_start = buffer_a;
     q15_t *col_buffer = col_buffer_start;
     const int32_t *const bias_start_pos = bias;
@@ -97,11 +191,10 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
     uint16_t row_count;
     uint16_t row_shift;
 
-
-    for (i_out_y = 0; i_out_y < output_y; i_out_y++)
+    for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
     {
         const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
-        for (i_out_x = 0; i_out_x < output_x; i_out_x++)
+        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
         {
             const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
 
@@ -118,11 +211,11 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
                 index += (kernel_x * input_ch) * ker_y_start;
             }
 
-            for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
+            for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
             {
                 const int32_t idx_y = base_idx_y + i_ker_y;
 
-                for (i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
+                for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
                 {
                     const int32_t idx_x = base_idx_x + i_ker_x;
                     if (idx_x < 0 || idx_x >= input_x)
@@ -151,7 +244,7 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
 
             while (row_count)
             {
-                q31_t sum =   *bias++;
+                q31_t sum = *bias++;
                 q31_t sum_2 = *bias++;
                 q31_t sum_3 = *bias++;
                 q31_t sum_4 = *bias++;
@@ -306,7 +399,7 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
                                  dilation_x,
                                  dilation_y,
                                  NULL);
-#endif /* ARM_MATH_DSP & ARM_MATH_LOOPUNROLL*/
+#endif /* ARM_MATH_MVEI | (ARM_MATH_DSP & ARM_MATH_LOOPUNROLL) */
 
     /* Return to application */
     return ARM_MATH_SUCCESS;
@@ -316,8 +409,10 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const uint16_t input_ch,
                                                   const uint16_t kernel_x,
                                                   const uint16_t kernel_y)
 {
-#if defined(ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_MVEI)
     return (2 * input_ch * kernel_x * kernel_y) * sizeof(int16_t);
+#elif defined(ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+    return (input_ch * kernel_x * kernel_y) * sizeof(int16_t);
 #else
     (void)input_ch;
     (void)kernel_x;
@@ -329,4 +424,3 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const uint16_t input_ch,
 /**
  * @} end of NNConv group
  */
-

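To make the vector bookkeeping above easier to follow, here is a scalar reference (illustrative only, not part of the commit) of the multiply-accumulate that arm_nn_depthwise_conv_s8_core performs on one pair of im2col columns. It also shows why the MVE variant of arm_depthwise_conv_s8_opt_get_buffer_size asks for 2 * input_ch * kernel_x * kernel_y int16 elements: the buffer holds two complete columns at once.

#include "arm_math.h"

/* Scalar reference for the vectorized two-column MAC (illustration only).
 * Layout is channel-major: element k of channel ch sits at index
 * k * num_ch + ch, and the second column starts kernel_size * num_ch
 * elements after the first. */
static void depthwise_two_columns_ref(const q7_t *row, const q15_t *col,
                                      uint16_t num_ch, uint16_t kernel_size,
                                      const int32_t *bias,
                                      int32_t *acc_0, int32_t *acc_1)
{
    const q15_t *col_1 = col + (int32_t)kernel_size * num_ch;

    for (int ch = 0; ch < num_ch; ch++)
    {
        int32_t sum_0 = bias[ch];
        int32_t sum_1 = bias[ch];

        for (int k = 0; k < kernel_size; k++)
        {
            /* The same per-channel weight is applied to both columns. */
            sum_0 += row[k * num_ch + ch] * col[k * num_ch + ch];
            sum_1 += row[k * num_ch + ch] * col_1[k * num_ch + ch];
        }
        acc_0[ch] = sum_0; /* still needs requantization, offset and clamp */
        acc_1[ch] = sum_1;
    }
}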
+ 219 - 0
CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c

@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_depthwise_conv_s8_core.c
+ * Description:  Depthwise convolution on im2col buffers.
+ *
+ * $Date:        November 2019
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ * -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+#include "arm_nnfunctions.h"
+
+/*
+ * Depthwise conv on an im2col buffer where the input channel equals
+ * the output channel.
+ *
+ * Refer header file for details.
+ *
+ */
+
+q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
+                                    const q15_t *col,
+                                    const uint16_t num_ch,
+                                    const int32_t *out_shift,
+                                    const int32_t *out_mult,
+                                    const int32_t out_offset,
+                                    const int32_t activation_min,
+                                    const int32_t activation_max,
+                                    const uint16_t kernel_size,
+                                    const int32_t *const output_bias,
+                                    q7_t *out)
+{
+#if defined(ARM_MATH_MVEI)
+    int32_t ch_per_loop = num_ch / 4;
+
+    const int32_t *bias = output_bias;
+    int8_t *out_tmp = out;
+
+    int32_t idx = 0;
+
+    while (ch_per_loop > 0)
+    {
+        int32x4_t ip_0;
+        int32x4_t ip_1;
+        int32_t ker_loop = kernel_size / 3;
+        int32x4_t out_0 = vldrwq_s32(bias);
+        int32x4_t out_1 = out_0;
+        bias += 4;
+
+        const int32_t offset = idx * 4;
+        const int8_t *row_0 = row + offset;
+        const int16_t *col_0 = col + offset;
+        const int16_t *col_1 = col_0 + kernel_size * num_ch;
+
+        int32x4_t ker_0 = vldrbq_s32(row_0);
+
+        while (ker_loop > 0)
+        {
+            const int8_t *row_1 = row_0 + num_ch;
+            const int8_t *row_2 = row_0 + 2 * num_ch;
+            const int32x4_t ker_1 = vldrbq_s32(row_1);
+            const int32x4_t ker_2 = vldrbq_s32(row_2);
+
+            ip_0 = vldrhq_s32(col_0);
+            ip_1 = vldrhq_s32(col_1);
+            col_0 += num_ch;
+            col_1 += num_ch;
+
+            out_0 += vmulq_s32(ip_0, ker_0);
+            out_1 += vmulq_s32(ip_1, ker_0);
+
+            ip_0 = vldrhq_s32(col_0);
+            ip_1 = vldrhq_s32(col_1);
+            col_0 += num_ch;
+            col_1 += num_ch;
+
+            out_0 += vmulq_s32(ip_0, ker_1);
+            out_1 += vmulq_s32(ip_1, ker_1);
+
+            ip_0 = vldrhq_s32(col_0);
+            ip_1 = vldrhq_s32(col_1);
+            col_0 += num_ch;
+            col_1 += num_ch;
+
+            out_0 += vmulq_s32(ip_0, ker_2);
+            out_1 += vmulq_s32(ip_1, ker_2);
+            row_0 += 3 * num_ch;
+
+            ker_0 = vldrbq_s32(row_0);
+            ker_loop--;
+        }
+
+        idx++;
+        /* Handle tail kernel elements */
+        ker_loop = kernel_size - ((kernel_size / 3) * 3);
+        while (ker_loop > 0)
+        {
+            ip_0 = vldrhq_s32(col_0);
+            ip_1 = vldrhq_s32(col_1);
+
+            out_0 += vmulq_s32(ip_0, ker_0);
+            out_1 += vmulq_s32(ip_1, ker_0);
+
+            col_0 += num_ch;
+            col_1 += num_ch;
+
+            /* ip_0/ip_1 for the next iteration are loaded at the top of
+               the loop; only the kernel pointer needs advancing here. */
+
+            row_0 += num_ch;
+            ker_0 = vldrbq_s32(row_0);
+            ker_loop--;
+        }
+        const int32x4_t mult = vldrwq_s32(out_mult);
+        const int32x4_t shift = vldrwq_s32(out_shift);
+        out_mult += 4;
+        out_shift += 4;
+
+        out_0 = arm_mve_requantize_32x4(out_0, mult, shift);
+        out_1 = arm_mve_requantize_32x4(out_1, mult, shift);
+
+        out_0 = vaddq_n_s32(out_0, out_offset);
+        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
+        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
+        vstrbq_s32(out_tmp, out_0);
+
+        out_1 = vaddq_n_s32(out_1, out_offset);
+        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
+        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
+        vstrbq_s32(out_tmp + num_ch, out_1);
+
+        out_tmp += 4;
+        ch_per_loop--;
+    }
+
+    int32_t tail_ch = num_ch & 3;
+    if (tail_ch != 0)
+    {
+        int32_t ch_idx = (num_ch & ~3);
+        int32x4_t col_0_sum;
+        int32x4_t col_1_sum;
+
+        const int32_t single_buffer_size = kernel_size * num_ch;
+        for (int i = 0; i < tail_ch; i++)
+        {
+            const int16_t *col_pos_0 = col + ch_idx;
+            const int16_t *col_pos_1 = col_pos_0 + single_buffer_size;
+
+            const int8_t *row_pos_0 = row + ch_idx;
+            const int8_t *row_pos_1 = row_pos_0;
+            int32_t sum_0 = bias[i];
+            int32_t sum_1 = bias[i];
+
+            for (int k = 0; k < kernel_size; k++)
+            {
+                sum_0 += row_pos_0[k * num_ch] * col_pos_0[k * num_ch];
+                sum_1 += row_pos_1[k * num_ch] * col_pos_1[k * num_ch];
+            }
+            col_0_sum[i] = sum_0;
+            col_1_sum[i] = sum_1;
+
+            ch_idx++;
+        }
+        const mve_pred16_t p = vctp32q(tail_ch);
+        const int32x4_t mult = vldrwq_z_s32(out_mult, p);
+        const int32x4_t shift = vldrwq_z_s32(out_shift, p);
+
+        col_0_sum = arm_mve_requantize_32x4(col_0_sum, mult, shift);
+        col_1_sum = arm_mve_requantize_32x4(col_1_sum, mult, shift);
+
+        col_0_sum = vaddq_n_s32(col_0_sum, out_offset);
+        col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min));
+        col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max));
+        vstrbq_p_s32(out_tmp, col_0_sum, p);
+
+        col_1_sum = vaddq_n_s32(col_1_sum, out_offset);
+        col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min));
+        col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max));
+        vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p);
+
+        out_tmp += tail_ch;
+    }
+
+    return out_tmp + num_ch;
+#else
+    (void)row;
+    (void)col;
+    (void)num_ch;
+    (void)out_shift;
+    (void)out_mult;
+    (void)out_offset;
+    (void)activation_min;
+    (void)activation_max;
+    (void)kernel_size;
+    (void)output_bias;
+    (void)out;
+    return NULL;
+#endif
+}
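The post-processing above leans on arm_mve_requantize_32x4, which is not part of this diff. Assuming it follows the usual TFLite fixed-point scheme (a rounding doubling high multiply followed by a rounding right shift; both that and the sign convention of out_shift are assumptions here, and saturation plus the negative-side rounding nudge are omitted for brevity), a scalar equivalent of the whole chain (requantize, add the output offset, clamp, narrow to int8) looks roughly like this:

#include <stdint.h>

/* Assumption-based scalar sketch, not the library implementation: mirrors
 * arm_mve_requantize_32x4 + vaddq_n_s32 + vmaxq_s32/vminq_s32 + vstrbq_s32
 * for a single lane. */
static int8_t requantize_clamp_ref(int32_t acc, int32_t mult, int32_t shift,
                                   int32_t out_offset,
                                   int32_t act_min, int32_t act_max)
{
    /* Rounding doubling high multiply: high 32 bits of 2 * acc * mult. */
    int64_t prod = (int64_t)acc * (int64_t)mult;
    int32_t val = (int32_t)((prod + (1LL << 30)) >> 31);

    /* Rounding right shift; 'shift' taken here as a right-shift amount. */
    if (shift > 0)
    {
        val = (val + (1 << (shift - 1))) >> shift;
    }

    val += out_offset;            /* output tensor offset */
    if (val < act_min) { val = act_min; }
    if (val > act_max) { val = act_max; }
    return (int8_t)val;
}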