| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185 |
- /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /* ----------------------------------------------------------------------
- * Project: CMSIS NN Library
- * Title: arm_convolve_1x1_s8_fast.c
- * Description: Fast q7 version of 1x1 convolution (non-square shape)
- *
- * $Date: 7 February 2020
- * $Revision: V.1.0.2
- *
- * Target Processor: Cortex-M cores
- *
- * -------------------------------------------------------------------- */
- #include "arm_nnfunctions.h"
- #define DIM_KER_X (1U) /* Presumably the kernel width (1 for a 1x1 kernel); not referenced in the visible part of this file. */
- #define DIM_KER_Y (1U) /* Presumably the kernel height (1 for a 1x1 kernel); not referenced in the visible part of this file. */
- /**
- * @ingroup groupNN
- */
- /**
- * @addtogroup NNConv
- * @{
- */
- /*
- * Fast s8 version for 1x1 convolution (non-square shape)
- *
- * Refer header file for details.
- *
- */
- arm_status arm_convolve_1x1_s8_fast(const q7_t *input,
- const uint16_t input_x,
- const uint16_t input_y,
- const uint16_t input_ch,
- const uint16_t input_batches,
- const q7_t *kernel,
- const uint16_t output_ch,
- const uint16_t pad_x,
- const uint16_t pad_y,
- const uint16_t stride_x,
- const uint16_t stride_y,
- const int32_t *bias,
- q7_t *output,
- const int32_t *output_shift,
- const int32_t *output_mult,
- const int32_t out_offset,
- const int32_t input_offset,
- const int32_t out_activation_min,
- const int32_t out_activation_max,
- const uint16_t output_x,
- const uint16_t output_y,
- q15_t *buffer_a)
- {
- if (input_ch % 4 != 0 ||
- pad_x != 0 || pad_y != 0 ||
- stride_x != 1 || stride_y != 1)
- {
- return ARM_MATH_SIZE_MISMATCH;
- }
- #if defined(ARM_MATH_MVEI)
- (void)buffer_a;
- int32_t col_len = input_x * input_y * input_batches;
- for (int i_items = 0; i_items <= (col_len - 4); i_items += 4)
- {
- for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
- {
- int32_t sum_row = 0;
- int32_t temp_out[4];
- (void)arm_nn_mat_mul_core_4x_s8(input_ch,
- input_ch,
- input + i_items * input_ch,
- kernel + i_out_ch * input_ch,
- &sum_row,
- temp_out);
- int32x4_t res = vldrwq_s32(temp_out);
- res = vaddq_n_s32(res, bias[i_out_ch]);
- sum_row = sum_row * input_offset;
- res = vaddq_n_s32(res, sum_row);
- res = arm_requantize_mve(res, output_mult[i_out_ch], output_shift[i_out_ch]);
- res = vaddq_n_s32(res, out_offset);
- res = vmaxq_s32(res, vdupq_n_s32(out_activation_min));
- res = vminq_s32(res, vdupq_n_s32(out_activation_max));
- const uint32x4_t scatter_offset = {0, output_ch, output_ch * 2, output_ch * 3};
- vstrbq_scatter_offset_s32(output, scatter_offset, res);
- output++;
- }
- output += (3 * output_ch);
- }
- /* Handle left over elements */
- for (int i_items = (col_len & ~0x3); i_items < col_len; i_items++)
- {
- for (int i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
- {
- int32_t sum_row = 0;
- int32_t acc;
- (void)arm_nn_mat_mul_core_1x_s8(input_ch,
- input + i_items * input_ch,
- kernel + i_out_ch * input_ch,
- &sum_row,
- &acc);
- acc += bias[i_out_ch];
- sum_row = (sum_row * input_offset);
- acc += sum_row;
- acc = arm_nn_requantize(acc, output_mult[i_out_ch], output_shift[i_out_ch]);
- acc += out_offset;
- acc = MAX(acc, out_activation_min);
- acc = MIN(acc, out_activation_max);
- *output++ = acc;
- }
- }
- #else
- /* Run the following code as reference implementation for Cortex-M processors with or without DSP extension */
- (void)input_x;
- (void)input_y;
- (void)output_x;
- (void)output_y;
- (void)buffer_a;
- const int32_t lhs_rows = input_x * input_y * input_batches;
- const int32_t rhs_rows = output_ch;
- const int32_t rhs_cols = input_ch;
- arm_nn_mat_mult_nt_t_s8(input,
- kernel,
- bias,
- output,
- output_mult,
- output_shift,
- lhs_rows,
- rhs_rows,
- rhs_cols,
- input_offset,
- out_offset,
- out_activation_min,
- out_activation_max);
- #endif
- /* Return to application */
- return ARM_MATH_SUCCESS;
- }
/*
 * Get the scratch-buffer size for arm_convolve_1x1_s8_fast().
 *
 * input_ch  Number of input channels; ignored.
 *
 * Always returns 0.
 */
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const uint16_t input_ch)
{
    (void)input_ch;
    return 0;
}
- /**
- * @} end of NNConv group
- */
|