|
|
@@ -52,6 +52,7 @@ arm_status arm_convolve_s8(const q7_t *input,
|
|
|
const uint16_t input_x,
|
|
|
const uint16_t input_y,
|
|
|
const uint16_t input_ch,
|
|
|
+ const uint16_t input_batches,
|
|
|
const q7_t *kernel,
|
|
|
const uint16_t output_ch,
|
|
|
const uint16_t kernel_x,
|
|
|
@@ -72,156 +73,160 @@ arm_status arm_convolve_s8(const q7_t *input,
|
|
|
const uint16_t output_y,
|
|
|
q15_t *buffer_a)
|
|
|
{
|
|
|
-
|
|
|
+ int i_batch;
|
|
|
+ for (i_batch = 0; i_batch < input_batches; i_batch++)
|
|
|
+ {
|
|
|
+ input += i_batch * (input_x * input_y * input_ch);
|
|
|
+ output += i_batch * (output_x * output_y * output_ch);
|
|
|
#if defined(ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
|
|
|
- int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
|
|
|
+ int16_t i_out_y, i_out_x, i_ker_y, i_ker_x;
|
|
|
|
|
|
- /* Generate two columns from the input tensor a GEMM computation */
|
|
|
- q15_t *two_column_buf = buffer_a;
|
|
|
- q7_t *out = output;
|
|
|
+ /* Generate two columns from the input tensor a GEMM computation */
|
|
|
+ q15_t *two_column_buf = buffer_a;
|
|
|
+ q7_t *out = output;
|
|
|
|
|
|
- /* This part implements the im2col function */
|
|
|
- for (i_out_y = 0; i_out_y < output_y; i_out_y++)
|
|
|
- {
|
|
|
- for (i_out_x = 0; i_out_x < output_x; i_out_x++)
|
|
|
+ /* This part implements the im2col function */
|
|
|
+ for (i_out_y = 0; i_out_y < output_y; i_out_y++)
|
|
|
{
|
|
|
- for (i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y; i_ker_y++)
|
|
|
+ for (i_out_x = 0; i_out_x < output_x; i_out_x++)
|
|
|
{
|
|
|
- for (i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x; i_ker_x++)
|
|
|
+ for (i_ker_y = i_out_y * stride_y - pad_y; i_ker_y < i_out_y * stride_y - pad_y + kernel_y; i_ker_y++)
|
|
|
{
|
|
|
- if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
|
|
|
+ for (i_ker_x = i_out_x * stride_x - pad_x; i_ker_x < i_out_x * stride_x - pad_x + kernel_x; i_ker_x++)
|
|
|
{
|
|
|
- /* Filling 0 for out-of-bound paddings */
|
|
|
- memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
- /* Copying the pixel data to column */
|
|
|
- arm_q7_to_q15_with_offset(input + (i_ker_y * input_x + i_ker_x) * input_ch, two_column_buf, input_ch, input_offset);
|
|
|
+ if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
|
|
|
+ {
|
|
|
+ /* Filling 0 for out-of-bound paddings */
|
|
|
+ memset(two_column_buf, 0, sizeof(q15_t) * input_ch);
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ /* Copying the pixel data to column */
|
|
|
+ arm_q7_to_q15_with_offset(input + (i_ker_y * input_x + i_ker_x) * input_ch, two_column_buf, input_ch, input_offset);
|
|
|
+ }
|
|
|
+ two_column_buf += input_ch;
|
|
|
}
|
|
|
- two_column_buf += input_ch;
|
|
|
}
|
|
|
- }
|
|
|
|
|
|
- /* Computation is filed for every 2 columns */
|
|
|
- if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
|
|
|
- {
|
|
|
- out =
|
|
|
- arm_nn_mat_mult_kernel_s8_s16(kernel,
|
|
|
- buffer_a,
|
|
|
- output_ch,
|
|
|
- output_shift,
|
|
|
- output_mult,
|
|
|
- out_offset,
|
|
|
- out_activation_min,
|
|
|
- out_activation_max,
|
|
|
- input_ch * kernel_y * kernel_x,
|
|
|
- bias,
|
|
|
- out);
|
|
|
-
|
|
|
- /* counter reset */
|
|
|
- two_column_buf = buffer_a;
|
|
|
+ /* Computation is filed for every 2 columns */
|
|
|
+ if (two_column_buf == buffer_a + 2 * input_ch * kernel_y * kernel_x)
|
|
|
+ {
|
|
|
+ out =
|
|
|
+ arm_nn_mat_mult_kernel_s8_s16(kernel,
|
|
|
+ buffer_a,
|
|
|
+ output_ch,
|
|
|
+ output_shift,
|
|
|
+ output_mult,
|
|
|
+ out_offset,
|
|
|
+ out_activation_min,
|
|
|
+ out_activation_max,
|
|
|
+ input_ch * kernel_y * kernel_x,
|
|
|
+ bias,
|
|
|
+ out);
|
|
|
+
|
|
|
+ /* counter reset */
|
|
|
+ two_column_buf = buffer_a;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
- }
|
|
|
-
|
|
|
- /* left-over because odd number of output pixels */
|
|
|
- if (two_column_buf != buffer_a)
|
|
|
- {
|
|
|
- const q7_t *ker_a = kernel;
|
|
|
- int i;
|
|
|
|
|
|
- for (i = 0; i < output_ch; i++)
|
|
|
+ /* left-over because odd number of output pixels */
|
|
|
+ if (two_column_buf != buffer_a)
|
|
|
{
|
|
|
- /* Load the accumulator with bias first */
|
|
|
- q31_t sum = bias[i];
|
|
|
+ const q7_t *ker_a = kernel;
|
|
|
+ int i;
|
|
|
+
|
|
|
+ for (i = 0; i < output_ch; i++)
|
|
|
+ {
|
|
|
+ /* Load the accumulator with bias first */
|
|
|
+ q31_t sum = bias[i];
|
|
|
|
|
|
- /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
|
|
|
- const q15_t *ip_as_col = buffer_a;
|
|
|
+ /* Point to the beginning of the im2col buffer where the input is available as a rearranged column */
|
|
|
+ const q15_t *ip_as_col = buffer_a;
|
|
|
|
|
|
- /* 4 multiply and accumulates are done in one loop. */
|
|
|
- uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
|
|
|
+ /* 4 multiply and accumulates are done in one loop. */
|
|
|
+ uint16_t col_count = (input_ch * kernel_y * kernel_x) >> 2;
|
|
|
|
|
|
- while (col_count)
|
|
|
- {
|
|
|
- q31_t ker_a1, ker_a2;
|
|
|
- q31_t ip_b1, ip_b2;
|
|
|
+ while (col_count)
|
|
|
+ {
|
|
|
+ q31_t ker_a1, ker_a2;
|
|
|
+ q31_t ip_b1, ip_b2;
|
|
|
|
|
|
- ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
|
|
|
+ ker_a = read_and_pad(ker_a, &ker_a1, &ker_a2);
|
|
|
|
|
|
- ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
|
|
|
- sum = __SMLAD(ker_a1, ip_b1, sum);
|
|
|
- ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
|
|
|
- sum = __SMLAD(ker_a2, ip_b2, sum);
|
|
|
+ ip_b1 = arm_nn_read_q15x2_ia(&ip_as_col);
|
|
|
+ sum = __SMLAD(ker_a1, ip_b1, sum);
|
|
|
+ ip_b2 = arm_nn_read_q15x2_ia(&ip_as_col);
|
|
|
+ sum = __SMLAD(ker_a2, ip_b2, sum);
|
|
|
|
|
|
- col_count--;
|
|
|
- }
|
|
|
- /* Handle left over mac */
|
|
|
- col_count = input_ch * kernel_y * kernel_x & 0x3;
|
|
|
- while (col_count)
|
|
|
- {
|
|
|
- q7_t ker_a1 = *ker_a++;
|
|
|
- q15_t ip_b1 = *ip_as_col++;
|
|
|
- sum += ker_a1 * ip_b1;
|
|
|
- col_count--;
|
|
|
- }
|
|
|
+ col_count--;
|
|
|
+ }
|
|
|
+ /* Handle left over mac */
|
|
|
+ col_count = input_ch * kernel_y * kernel_x & 0x3;
|
|
|
+ while (col_count)
|
|
|
+ {
|
|
|
+ q7_t ker_a1 = *ker_a++;
|
|
|
+ q15_t ip_b1 = *ip_as_col++;
|
|
|
+ sum += ker_a1 * ip_b1;
|
|
|
+ col_count--;
|
|
|
+ }
|
|
|
|
|
|
- sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
|
|
|
- sum += out_offset;
|
|
|
- sum = MAX(sum, out_activation_min);
|
|
|
- sum = MIN(sum, out_activation_max);
|
|
|
- *out++ = (q7_t)sum;
|
|
|
+ sum = arm_nn_requantize(sum, output_mult[i], output_shift[i]);
|
|
|
+ sum += out_offset;
|
|
|
+ sum = MAX(sum, out_activation_min);
|
|
|
+ sum = MIN(sum, out_activation_max);
|
|
|
+ *out++ = (q7_t)sum;
|
|
|
+ }
|
|
|
}
|
|
|
- }
|
|
|
#else
|
|
|
- /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
|
|
|
- (void)buffer_a;
|
|
|
- int32_t i_out_ch, i_out_y, i_out_x, i_input_ch, i_ker_y, i_ker_x;
|
|
|
- int32_t conv_out;
|
|
|
- int32_t in_row, in_col;
|
|
|
+ /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
|
|
|
+ (void)buffer_a;
|
|
|
+ int32_t i_out_ch, i_out_y, i_out_x, i_input_ch, i_ker_y, i_ker_x;
|
|
|
+ int32_t conv_out;
|
|
|
+ int32_t in_row, in_col;
|
|
|
|
|
|
- for (i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
|
|
|
- {
|
|
|
- for (i_out_y = 0; i_out_y < output_y; i_out_y++)
|
|
|
+ for (i_out_ch = 0; i_out_ch < output_ch; i_out_ch++)
|
|
|
{
|
|
|
- for (i_out_x = 0; i_out_x < output_x; i_out_x++)
|
|
|
+ for (i_out_y = 0; i_out_y < output_y; i_out_y++)
|
|
|
{
|
|
|
- conv_out = bias[i_out_ch];
|
|
|
+ for (i_out_x = 0; i_out_x < output_x; i_out_x++)
|
|
|
+ {
|
|
|
+ conv_out = bias[i_out_ch];
|
|
|
|
|
|
- const int32_t base_idx_y = stride_y * i_out_y - pad_y;
|
|
|
- const int32_t base_idx_x = stride_x * i_out_x - pad_x;
|
|
|
+ const int32_t base_idx_y = stride_y * i_out_y - pad_y;
|
|
|
+ const int32_t base_idx_x = stride_x * i_out_x - pad_x;
|
|
|
|
|
|
- const int32_t ker_y_start = MAX(0, -base_idx_y);
|
|
|
- const int32_t ker_x_start = MAX(0, -base_idx_x);
|
|
|
+ const int32_t ker_y_start = MAX(0, -base_idx_y);
|
|
|
+ const int32_t ker_x_start = MAX(0, -base_idx_x);
|
|
|
|
|
|
- const int32_t ker_y_end = MIN(kernel_y, input_y - base_idx_y);
|
|
|
- const int32_t ker_x_end = MIN(kernel_x, input_x - base_idx_x);
|
|
|
+ const int32_t ker_y_end = MIN(kernel_y, input_y - base_idx_y);
|
|
|
+ const int32_t ker_x_end = MIN(kernel_x, input_x - base_idx_x);
|
|
|
|
|
|
- for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
|
|
|
- {
|
|
|
- for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
|
|
|
+ for (i_ker_y = ker_y_start; i_ker_y < ker_y_end; i_ker_y++)
|
|
|
{
|
|
|
- const int32_t in_row = base_idx_y + i_ker_y;
|
|
|
- const int32_t in_col = base_idx_x + i_ker_x;
|
|
|
- for (i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
|
|
|
+ for (i_ker_x = ker_x_start; i_ker_x < ker_x_end; i_ker_x++)
|
|
|
{
|
|
|
- conv_out +=
|
|
|
- (input[(in_row * input_x + in_col) * input_ch + i_input_ch] + input_offset) *
|
|
|
- kernel[i_out_ch * input_ch * kernel_y * kernel_x +
|
|
|
- (i_ker_y * kernel_x + i_ker_x) * input_ch + i_input_ch];
|
|
|
+ const int32_t in_row = base_idx_y + i_ker_y;
|
|
|
+ const int32_t in_col = base_idx_x + i_ker_x;
|
|
|
+ for (i_input_ch = 0; i_input_ch < input_ch; i_input_ch++)
|
|
|
+ {
|
|
|
+ conv_out +=
|
|
|
+ (input[(in_row * input_x + in_col) * input_ch + i_input_ch] + input_offset) *
|
|
|
+ kernel[i_out_ch * input_ch * kernel_y * kernel_x +
|
|
|
+ (i_ker_y * kernel_x + i_ker_x) * input_ch + i_input_ch];
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
+ conv_out = arm_nn_requantize(conv_out, output_mult[i_out_ch], output_shift[i_out_ch]);
|
|
|
+ conv_out += out_offset;
|
|
|
+ conv_out = MAX(conv_out, out_activation_min);
|
|
|
+ conv_out = MIN(conv_out, out_activation_max);
|
|
|
+ output[i_out_ch + (i_out_y * output_x + i_out_x) * output_ch] = (int8_t)conv_out;
|
|
|
}
|
|
|
- conv_out = arm_nn_requantize(conv_out, output_mult[i_out_ch], output_shift[i_out_ch]);
|
|
|
- conv_out += out_offset;
|
|
|
- conv_out = MAX(conv_out, out_activation_min);
|
|
|
- conv_out = MIN(conv_out, out_activation_max);
|
|
|
- output[i_out_ch + (i_out_y * output_x + i_out_x) * output_ch] = (int8_t)conv_out;
|
|
|
}
|
|
|
}
|
|
|
- }
|
|
|
-
|
|
|
#endif
|
|
|
+ }
|
|
|
|
|
|
/* Return to application */
|
|
|
return ARM_MATH_SUCCESS;
|