
CMSIS-NN : MVE support for s8 depthwise conv

Felix Johnny 6 years ago
Parent
Commit
11bd24871a

+ 31 - 0
CMSIS/NN/Include/arm_nnsupportfunctions.h

@@ -167,6 +167,37 @@ void arm_q7_to_q15_reordered_with_offset(const q7_t *src, q15_t *dst, uint32_t b
  */
 void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);
 
+/**
+ * @brief Depthwise conv on an im2col buffer where the input channel equals output channel.
+ * @param[in]    row             pointer to row operand (weights)
+ * @param[in]    col             pointer to im2col buffer; always consists of 2 columns
+ * @param[in]    num_ch          number of channels
+ * @param[in]    out_shift       pointer to per-output-channel requantization shift parameter
+ * @param[in]    out_mult        pointer to per-output-channel requantization multiplier parameter
+ * @param[in]    out_offset      output tensor offset
+ * @param[in]    activation_min  minimum value to clamp the output to. Range: int8
+ * @param[in]    activation_max  maximum value to clamp the output to. Range: int8
+ * @param[in]    kernel_size     number of elements in one column
+ * @param[in]    output_bias     per-output-channel bias. Range: int32
+ * @param[out]   out             pointer to output
+ * @return     The function returns one of the following:
+ *              1. The incremented output pointer, after a successful operation, or
+ *              2. NULL if the implementation is not available.
+ *
+ * @details     Supported framework: TensorFlow Lite micro.
+ */
+q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
+                                    const q15_t *col,
+                                    const uint16_t num_ch,
+                                    const int32_t *out_shift,
+                                    const int32_t *out_mult,
+                                    const int32_t out_offset,
+                                    const int32_t activation_min,
+                                    const int32_t activation_max,
+                                    const uint16_t kernel_size,
+                                    const int32_t *const output_bias,
+                                    q7_t *out);
+
 /**
   @brief         Read 2 q15 elements and post increment pointer.
   @param[in]     in_q15   Pointer to pointer that holds address of input.

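For context, a minimal caller sketch for the new support function (not part of this commit; the helper name and surrounding code are hypothetical). It exercises the contract documented above: the function either returns the incremented output pointer or NULL when no implementation is available.

#include "arm_nnsupportfunctions.h"

/* Hypothetical caller sketch (not part of this commit). Processes one pair of
 * im2col columns with the new core function. im2col_buf must hold
 * 2 * num_ch * kernel_size q15_t values, i.e. two complete columns. */
static q7_t *run_two_columns(const q7_t *weights, const q15_t *im2col_buf,
                             uint16_t num_ch, const int32_t *out_shift,
                             const int32_t *out_mult, int32_t out_offset,
                             int32_t act_min, int32_t act_max,
                             uint16_t kernel_size, const int32_t *bias,
                             q7_t *out)
{
    q7_t *next_out = arm_nn_depthwise_conv_s8_core(weights, im2col_buf, num_ch,
                                                   out_shift, out_mult, out_offset,
                                                   act_min, act_max, kernel_size,
                                                   bias, out);
    if (next_out == NULL)
    {
        /* No MVE implementation available: fall back to another kernel,
         * e.g. the generic arm_depthwise_conv_s8(). */
    }
    return next_out;
}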
+ 109 - 15
CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c

@@ -42,7 +42,7 @@
  * @{
  */
 
-  /*
+/*
    * Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel
    *
    *  Refer prototype header file for details.
@@ -75,20 +75,114 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
                                      q15_t *buffer_a)
 {
 
-
     /* Check input constraints input_ch == output_ch */
     if (input_ch != output_ch)
     {
         return ARM_MATH_SIZE_MISMATCH;
     }
+#ifdef ARM_MATH_MVEI
+    (void)dilation_x;
+    (void)dilation_y;
+
+    /* Generate two columns from the input tensor */
+    q15_t *two_column_buf = buffer_a;
+    q7_t *out = output;
+
+    /* This part implements the im2col function */
+    for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
+    {
+        const int32_t base_idx_y = i_out_y * stride_y - pad_y;
+        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
+        {
+            const int32_t base_idx_x = (i_out_x * stride_x) - pad_x;
+            for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
+            {
+                for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
+                {
+                    if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
+                    {
+                        /* Fill with zeros for out-of-bound padding */
+                        memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
+                    }
+                    else
+                    {
+                        /* Copying the pixel data to column */
+                        arm_q7_to_q15_with_offset(input + (i_ker_y * input_x + i_ker_x) * input_ch, two_column_buf, input_ch, input_offset);
+                    }
+                    two_column_buf += input_ch;
+                }
+            }
+
+            /* Computation is performed once every 2 columns are filled */
+            if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
+            {
+                two_column_buf = buffer_a;
+                out = arm_nn_depthwise_conv_s8_core(kernel,
+                                                    buffer_a,
+                                                    output_ch,
+                                                    output_shift,
+                                                    output_mult,
+                                                    output_offset,
+                                                    output_activation_min,
+                                                    output_activation_max,
+                                                    kernel_x * kernel_y,
+                                                    bias,
+                                                    out);
+            }
+        }
+    }
+
+    /* left-over pixels */
+    if (two_column_buf != buffer_a)
+    {
+        int32_t ch_count = (output_ch + 3) / 4;
+        const int32_t *out_bias = bias;
 
-#if defined(ARM_MATH_LOOPUNROLL) && defined(ARM_MATH_DSP)
+        int32_t idx = 0;
+        int32_t out_ch = output_ch;
+        while (ch_count > 0)
+        {
+            int32_t ker_count = kernel_x * kernel_y;
+
+            const int32_t offset = idx * 4;
+            const int8_t *row = kernel + offset;
+            int16_t *col = buffer_a + offset;
+            mve_pred16_t p = vctp32q(out_ch);
+
+            int32x4_t res = vldrwq_z_s32(out_bias, p);
+            out_bias += 4;
+
+            while (ker_count > 0)
+            {
+                const int32x4_t ip = vldrhq_z_s32(col, p);
+                const int32x4_t ker = vldrbq_z_s32(row, p);
+                col += output_ch;
+                row += output_ch;
+                res += vmulq_s32(ip, ker);
+                ker_count--;
+            }
+
+            int32x4_t mult = vldrwq_z_s32(output_mult, p);
+            int32x4_t shift = vldrwq_z_s32(output_shift, p);
+            output_mult += 4;
+            output_shift += 4;
+            res = arm_mve_requantize_32x4(res, mult, shift);
+
+            res = vaddq_n_s32(res, output_offset);
+            res = vmaxq_s32(res, vdupq_n_s32(output_activation_min));
+            res = vminq_s32(res, vdupq_n_s32(output_activation_max));
+            vstrbq_p_s32(out, res, p);
+            out += 4;
+            idx++;
+            out_ch -= 4;
+            ch_count--;
+        }
+    }
+
+#elif defined(ARM_MATH_LOOPUNROLL) && defined(ARM_MATH_DSP)
     /* Run the following code in cores using DSP extension */
     (void)dilation_x;
     (void)dilation_y;
-
-    int16_t i_out_y, i_out_x;
-    int16_t i_ker_y, i_ker_x;
     q15_t *const col_buffer_start = buffer_a;
     q15_t *col_buffer = col_buffer_start;
     const int32_t *const bias_start_pos = bias;
@@ -97,11 +191,10 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
     uint16_t row_count;
     uint16_t row_shift;
 
-
-    for (i_out_y = 0; i_out_y < output_y; i_out_y++)
+    for (int i_out_y = 0; i_out_y < output_y; i_out_y++)
     {
         const int16_t base_idx_y = (i_out_y * stride_y) - pad_y;
-        for (i_out_x = 0; i_out_x < output_x; i_out_x++)
+        for (int i_out_x = 0; i_out_x < output_x; i_out_x++)
         {
             const int16_t base_idx_x = (i_out_x * stride_x) - pad_x;
 
@@ -118,11 +211,11 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
                 index += (kernel_x * input_ch) * ker_y_start;
             }
 
-            for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
+            for (int i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
             {
                 const int32_t idx_y = base_idx_y + i_ker_y;
 
-                for (i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
+                for (int i_ker_x = 0; i_ker_x < kernel_x; i_ker_x++)
                 {
                     const int32_t idx_x = base_idx_x + i_ker_x;
                     if (idx_x < 0 || idx_x >= input_x)
@@ -151,7 +244,7 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
 
             while (row_count)
             {
-                q31_t sum =   *bias++;
+                q31_t sum = *bias++;
                 q31_t sum_2 = *bias++;
                 q31_t sum_3 = *bias++;
                 q31_t sum_4 = *bias++;
@@ -306,7 +399,7 @@ arm_status arm_depthwise_conv_s8_opt(const q7_t *input,
                                  dilation_x,
                                  dilation_y,
                                  NULL);
-#endif /* ARM_MATH_DSP & ARM_MATH_LOOPUNROLL*/
+#endif /* ARM_MATH_MVEI | (ARM_MATH_DSP & ARM_MATH_LOOPUNROLL) */
 
     /* Return to application */
     return ARM_MATH_SUCCESS;
@@ -316,8 +409,10 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const uint16_t input_ch,
                                                   const uint16_t kernel_x,
                                                   const uint16_t kernel_y)
 {
-#if defined(ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+#if defined(ARM_MATH_MVEI)
     return (2 * input_ch * kernel_x * kernel_y) * sizeof(int16_t);
+#elif defined(ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+    return (input_ch * kernel_x * kernel_y) * sizeof(int16_t);
 #else
     (void)input_ch;
     (void)kernel_x;
@@ -329,4 +424,3 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const uint16_t input_ch,
 /**
  * @} end of NNConv group
  */
-

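To make the vector bookkeeping above easier to follow, here is a scalar reference (illustrative only, not part of the commit) of the multiply-accumulate that arm_nn_depthwise_conv_s8_core performs on one pair of im2col columns. It also shows why the MVE variant of arm_depthwise_conv_s8_opt_get_buffer_size asks for 2 * input_ch * kernel_x * kernel_y int16 elements: the buffer holds two complete columns at once.

#include "arm_math.h"

/* Scalar reference for the vectorized two-column MAC (illustration only).
 * Layout is channel-major: element k of channel ch sits at index
 * k * num_ch + ch, and the second column starts kernel_size * num_ch
 * elements after the first. */
static void depthwise_two_columns_ref(const q7_t *row, const q15_t *col,
                                      uint16_t num_ch, uint16_t kernel_size,
                                      const int32_t *bias,
                                      int32_t *acc_0, int32_t *acc_1)
{
    const q15_t *col_1 = col + (int32_t)kernel_size * num_ch;

    for (int ch = 0; ch < num_ch; ch++)
    {
        int32_t sum_0 = bias[ch];
        int32_t sum_1 = bias[ch];

        for (int k = 0; k < kernel_size; k++)
        {
            /* The same per-channel weight is applied to both columns. */
            sum_0 += row[k * num_ch + ch] * col[k * num_ch + ch];
            sum_1 += row[k * num_ch + ch] * col_1[k * num_ch + ch];
        }
        acc_0[ch] = sum_0; /* still needs requantization, offset and clamp */
        acc_1[ch] = sum_1;
    }
}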
+ 219 - 0
CMSIS/NN/Source/ConvolutionFunctions/arm_nn_depthwise_conv_s8_core.c

@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_depthwise_conv_s8_core.c
+ * Description:  Depthwise convolution on im2col buffers.
+ *
+ * $Date:        November 2019
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M cores
+ * -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+#include "arm_nnfunctions.h"
+
+/*
+ * Depthwise conv on an im2col buffer where the input channel equals
+ * the output channel.
+ *
+ * Refer header file for details.
+ *
+ */
+
+q7_t *arm_nn_depthwise_conv_s8_core(const q7_t *row,
+                                    const q15_t *col,
+                                    const uint16_t num_ch,
+                                    const int32_t *out_shift,
+                                    const int32_t *out_mult,
+                                    const int32_t out_offset,
+                                    const int32_t activation_min,
+                                    const int32_t activation_max,
+                                    const uint16_t kernel_size,
+                                    const int32_t *const output_bias,
+                                    q7_t *out)
+{
+#if defined(ARM_MATH_MVEI)
+    int32_t ch_per_loop = num_ch / 4;
+
+    const int32_t *bias = output_bias;
+    int8_t *out_tmp = out;
+
+    int32_t idx = 0;
+
+    while (ch_per_loop > 0)
+    {
+        int32x4_t ip_0;
+        int32x4_t ip_1;
+        int32_t ker_loop = kernel_size / 3;
+        int32x4_t out_0 = vldrwq_s32(bias);
+        int32x4_t out_1 = out_0;
+        bias += 4;
+
+        const int32_t offset = idx * 4;
+        const int8_t *row_0 = row + offset;
+        const int16_t *col_0 = col + offset;
+        const int16_t *col_1 = col_0 + kernel_size * num_ch;
+
+        int32x4_t ker_0 = vldrbq_s32(row_0);
+
+        while (ker_loop > 0)
+        {
+            const int8_t *row_1 = row_0 + num_ch;
+            const int8_t *row_2 = row_0 + 2 * num_ch;
+            const int32x4_t ker_1 = vldrbq_s32(row_1);
+            const int32x4_t ker_2 = vldrbq_s32(row_2);
+
+            ip_0 = vldrhq_s32(col_0);
+            ip_1 = vldrhq_s32(col_1);
+            col_0 += num_ch;
+            col_1 += num_ch;
+
+            out_0 += vmulq_s32(ip_0, ker_0);
+            out_1 += vmulq_s32(ip_1, ker_0);
+
+            ip_0 = vldrhq_s32(col_0);
+            ip_1 = vldrhq_s32(col_1);
+            col_0 += num_ch;
+            col_1 += num_ch;
+
+            out_0 += vmulq_s32(ip_0, ker_1);
+            out_1 += vmulq_s32(ip_1, ker_1);
+
+            ip_0 = vldrhq_s32(col_0);
+            ip_1 = vldrhq_s32(col_1);
+            col_0 += num_ch;
+            col_1 += num_ch;
+
+            out_0 += vmulq_s32(ip_0, ker_2);
+            out_1 += vmulq_s32(ip_1, ker_2);
+            row_0 += 3 * num_ch;
+
+            ker_0 = vldrbq_s32(row_0);
+            ker_loop--;
+        }
+
+        idx++;
+        /* Handle tail kernel elements */
+        ker_loop = kernel_size - ((kernel_size / 3) * 3);
+        while (ker_loop > 0)
+        {
+            ip_0 = vldrhq_s32(col_0);
+            ip_1 = vldrhq_s32(col_1);
+
+            out_0 += vmulq_s32(ip_0, ker_0);
+            out_1 += vmulq_s32(ip_1, ker_0);
+
+            col_0 += num_ch;
+            col_1 += num_ch;
+
+            /* ip_0/ip_1 for the next iteration are loaded at the top of
+               the loop; only the kernel pointer needs advancing here. */
+
+            row_0 += num_ch;
+            ker_0 = vldrbq_s32(row_0);
+            ker_loop--;
+        }
+        const int32x4_t mult = vldrwq_s32(out_mult);
+        const int32x4_t shift = vldrwq_s32(out_shift);
+        out_mult += 4;
+        out_shift += 4;
+
+        out_0 = arm_mve_requantize_32x4(out_0, mult, shift);
+        out_1 = arm_mve_requantize_32x4(out_1, mult, shift);
+
+        out_0 = vaddq_n_s32(out_0, out_offset);
+        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
+        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
+        vstrbq_s32(out_tmp, out_0);
+
+        out_1 = vaddq_n_s32(out_1, out_offset);
+        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
+        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
+        vstrbq_s32(out_tmp + num_ch, out_1);
+
+        out_tmp += 4;
+        ch_per_loop--;
+    }
+
+    int32_t tail_ch = num_ch & 3;
+    if (tail_ch != 0)
+    {
+        int32_t ch_idx = (num_ch & ~3);
+        int32x4_t col_0_sum;
+        int32x4_t col_1_sum;
+
+        const int32_t single_buffer_size = kernel_size * num_ch;
+        for (int i = 0; i < tail_ch; i++)
+        {
+            const int16_t *col_pos_0 = col + ch_idx;
+            const int16_t *col_pos_1 = col_pos_0 + single_buffer_size;
+
+            const int8_t *row_pos_0 = row + ch_idx;
+            const int8_t *row_pos_1 = row_pos_0;
+            int32_t sum_0 = bias[i];
+            int32_t sum_1 = bias[i];
+
+            for (int k = 0; k < kernel_size; k++)
+            {
+                sum_0 += row_pos_0[k * num_ch] * col_pos_0[k * num_ch];
+                sum_1 += row_pos_1[k * num_ch] * col_pos_1[k * num_ch];
+            }
+            col_0_sum[i] = sum_0;
+            col_1_sum[i] = sum_1;
+
+            ch_idx++;
+        }
+        const mve_pred16_t p = vctp32q(tail_ch);
+        const int32x4_t mult = vldrwq_z_s32(out_mult, p);
+        const int32x4_t shift = vldrwq_z_s32(out_shift, p);
+
+        col_0_sum = arm_mve_requantize_32x4(col_0_sum, mult, shift);
+        col_1_sum = arm_mve_requantize_32x4(col_1_sum, mult, shift);
+
+        col_0_sum = vaddq_n_s32(col_0_sum, out_offset);
+        col_0_sum = vmaxq_s32(col_0_sum, vdupq_n_s32(activation_min));
+        col_0_sum = vminq_s32(col_0_sum, vdupq_n_s32(activation_max));
+        vstrbq_p_s32(out_tmp, col_0_sum, p);
+
+        col_1_sum = vaddq_n_s32(col_1_sum, out_offset);
+        col_1_sum = vmaxq_s32(col_1_sum, vdupq_n_s32(activation_min));
+        col_1_sum = vminq_s32(col_1_sum, vdupq_n_s32(activation_max));
+        vstrbq_p_s32(out_tmp + num_ch, col_1_sum, p);
+
+        out_tmp += tail_ch;
+    }
+
+    return out_tmp + num_ch;
+#else
+    (void)row;
+    (void)col;
+    (void)num_ch;
+    (void)out_shift;
+    (void)out_mult;
+    (void)out_offset;
+    (void)activation_min;
+    (void)activation_max;
+    (void)kernel_size;
+    (void)output_bias;
+    (void)out;
+    return NULL;
+#endif
+}
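The post-processing above leans on arm_mve_requantize_32x4, which is not part of this diff. Assuming it follows the usual TFLite fixed-point scheme (a rounding doubling high multiply followed by a rounding right shift; both that and the sign convention of out_shift are assumptions here, and saturation plus the negative-side rounding nudge are omitted for brevity), a scalar equivalent of the whole chain (requantize, add the output offset, clamp, narrow to int8) looks roughly like this:

#include <stdint.h>

/* Assumption-based scalar sketch, not the library implementation: mirrors
 * arm_mve_requantize_32x4 + vaddq_n_s32 + vmaxq_s32/vminq_s32 + vstrbq_s32
 * for a single lane. */
static int8_t requantize_clamp_ref(int32_t acc, int32_t mult, int32_t shift,
                                   int32_t out_offset,
                                   int32_t act_min, int32_t act_max)
{
    /* Rounding doubling high multiply: high 32 bits of 2 * acc * mult. */
    int64_t prod = (int64_t)acc * (int64_t)mult;
    int32_t val = (int32_t)((prod + (1LL << 30)) >> 31);

    /* Rounding right shift; 'shift' taken here as a right-shift amount. */
    if (shift > 0)
    {
        val = (val + (1 << (shift - 1))) >> shift;
    }

    val += out_offset;            /* output tensor offset */
    if (val < act_min) { val = act_min; }
    if (val > act_max) { val = act_max; }
    return (int8_t)val;
}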