6 năm trước cách đây · 8851a4e98e
--- a/CMSIS/NN/Include/arm_nnfunctions.h
+++ b/CMSIS/NN/Include/arm_nnfunctions.h
@@ -1186,9 +1186,9 @@ extern    "C"
 
				    * @param[in]       act_min            Min clamping
			
 
				    * @param[in]       act_max            Max clamping
			
 
				    * @param[in]       ch_im_in           number of input tensor channels
			
 
				-   * @param[in,out]   Im_in              pointer to input tensor
			
 
				+   * @param[in,out]   src                pointer to input tensor
			
 
				    * @param[in,out]   bufferA            temp buffer
			
 
				-   * @param[in,out]   Im_out             pointer to output tensor
			
 
				+   * @param[in,out]   dst                pointer to output tensor
			
 
				    * @return none.
			
 
				    *
			
 
				    * @details
			
@@ -1210,9 +1210,9 @@ arm_avgpool_s8( const int dim_im_in_height,
 
				   const int act_min,
			
 
				   const int act_max,
			
 
				   const int ch_im_in,
			
 
				-  int8_t *Im_in,
			
 
				+  int8_t *src,
			
 
				   int16_t *bufferA,
			
 
				-  int8_t *Im_out);
			
 
				+  int8_t *dst);
			
 
				 
			
 
				 /**
			
 
				  * @defgroup Softmax Softmax Functions
			
--- a/CMSIS/NN/Include/arm_nnsupportfunctions.h
+++ b/CMSIS/NN/Include/arm_nnsupportfunctions.h
@@ -136,6 +136,24 @@ void      arm_q7_to_q15_reordered_no_shift(const q7_t * pSrc, q15_t * pDst, uint
 
				  */
			
 
				 void arm_q7_to_q15_with_offset(const q7_t *src, q15_t *dst, uint32_t block_size, q7_t offset);
			
 
				 
			
 
				+/**
			
 
				+ * @brief Converts the elements of a q7 vector to q15 and accumulates them into a q15 vector
			
 
				+ * @param[in,out] *dst      points to the q15 accumulator (output) vector
			
 
				+ * @param[in]    *src       points to the q7 input vector
			
 
				+ * @param[in]    block_size length of the input vector
			
 
				+ * @return none.
			
 
				+ *
			
 
				+ * \par Description:
			
 
				+ *
			
 
				+ * The equation used for the conversion process is:
			
 
				+ *
			
 
				+ * <pre>
			
 
				+ *  dst[n] += (q15_t) src[n] ;   0 <= n < block_size.
			
 
				+ * </pre>
			
 
				+ *
			
 
				+ */
			
 
				+void arm_nn_accumulate_q7_to_q15(q15_t *dst, const q7_t *src, uint32_t block_size);
			
 
				+
			
 
				 #if defined (ARM_MATH_DSP)
			
 
				 
			
 
				 /**
			
@@ -369,6 +387,19 @@ __STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia(const q7_t **in_q7)
 
				   return (val);
			
 
				 }
			
 
				 
			
 
				+/**
			
 
				+  @brief         Read 2 q15 values from a q15 pointer (the pointer is not advanced).
			
 
				+  @param[in]     in_q15       Pointer to the first of the two q15 values to read.
			
 
				+  @return        q31 value holding the 4 bytes copied from in_q15
			
 
				+ */
			
 
				+__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2(const q15_t *in_q15)
			
 
				+{
			
 
				+  q31_t val;
			
 
				+  memcpy(&val, in_q15, 4); /* byte-wise copy of two q15 values instead of a pointer-cast load */
			
 
				+
			
 
				+  return (val);
			
 
				+}
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
+++ b/CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c
@@ -0,0 +1,76 @@
 
				+/*
			
 
				+ * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
			
 
				+ *
			
 
				+ * SPDX-License-Identifier: Apache-2.0
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the License); you may
			
 
				+ * not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ * www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
			
 
				+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
			
 
				+ * See the License for the specific language governing permissions and
			
 
				+ * limitations under the License.
			
 
				+ */
			
 
				+
			
 
				+/* ----------------------------------------------------------------------
			
 
				+ * Project:      CMSIS NN Library
			
 
				+ * Title:        arm_nn_accumulate_q7_to_q15.c
			
 
				+ * Description:  Accumulate q7 vector into q15 one.
			
 
				+ *
			
 
				+ * $Date:        July 2019
			
 
				+ * $Revision:    V.1.0.0
			
 
				+ *
			
 
				+ * Target Processor:  Cortex-M cores
			
 
				+ *
			
 
				+ * -------------------------------------------------------------------- */
			
 
				+#include "arm_math.h"
			
 
				+#include "arm_nnfunctions.h"
			
 
				+
			
 
				+/**
			
 
				+ * @ingroup groupSupport
			
 
				+ */
			
 
				+
			
 
				+void arm_nn_accumulate_q7_to_q15(q15_t * pDst, const q7_t * pSrc, uint32_t length) /* pDst[n] += pSrc[n] for 0 <= n < length */
			
 
				+{
			
 
				+    q15_t    *pCnt = pDst;              /* running pointer into the q15 accumulator */
			
 
				+    const q7_t     *pV = pSrc;          /* running pointer into the q7 input */
			
 
				+    q31_t     v1, v2, vo1, vo2;
			
 
				+    uint32_t  cnt = length >> 2;        /* number of 4-element groups; 32-bit so a large length is not truncated */
			
 
				+    q31_t     in;
			
 
				+
			
 
				+    while (cnt > 0u)
			
 
				+    {
			
 
				+        q31_t     value = *__SIMD32(pV)++;         /* load four q7 values as one word and advance pV */
			
 
				+        v1 = __SXTB16(__ROR(value, 8));            /* sign-extend bytes 1 and 3 to 16 bit */
			
 
				+        v2 = __SXTB16(value);                      /* sign-extend bytes 0 and 2 to 16 bit */
			
 
				+#ifndef ARM_MATH_BIG_ENDIAN
			
 
				+
			
 
				+        vo2 = __PKHTB(v1, v2, 16);                 /* repack the sign-extended halves into element order */
			
 
				+        vo1 = __PKHBT(v2, v1, 16);
			
 
				+
			
 
				+#else
			
 
				+
			
 
				+        vo1 = __PKHTB(v1, v2, 16);
			
 
				+        vo2 = __PKHBT(v2, v1, 16);
			
 
				+
			
 
				+#endif
			
 
				+ 
			
 
				+        in = arm_nn_read_q15x2(pCnt);              /* current accumulator pair */
			
 
				+        write_q15x2_ia(&pCnt, __QADD16(vo1, in));  /* saturating 16-bit add, stored via write_q15x2_ia */
			
 
				+
			
 
				+        in = arm_nn_read_q15x2(pCnt);
			
 
				+        write_q15x2_ia(&pCnt,__QADD16(vo2, in));
			
 
				+
			
 
				+        cnt--;
			
 
				+    }
			
 
				+    cnt = length & 0x3;                 /* 0..3 leftover elements */
			
 
				+    while (cnt > 0u)
			
 
				+    {
			
 
				+        *pCnt++ += *pV++;               /* plain (non-saturating) add for the tail, unlike the SIMD path */
			
 
				+        cnt--;
			
 
				+    }
			
 
				+}
			
--- a/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
+++ b/CMSIS/NN/Source/PoolingFunctions/arm_avgpool_s8.c
@@ -1,5 +1,5 @@
 
				 /*
			
 
				- * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
			
 
				+ * Copyright (C) 2010-2019 Arm Limited or its affiliates. All rights reserved.
			
 
				  *
			
 
				  * SPDX-License-Identifier: Apache-2.0
			
 
				  *
			
@@ -18,13 +18,13 @@
 
				 
			
 
				 /* ----------------------------------------------------------------------
			
 
				  * Project:      CMSIS NN Library
			
 
				- * Title:        arm_pool_q7_HWC.c
			
 
				+ * Title:        arm_avgpool_s8.c
			
 
				  * Description:  Pooling function implementations
			
 
				  *
			
 
				- * $Date:        17. January 2018
			
 
				+ * $Date:        29. July 2019
			
 
				  * $Revision:    V.1.0.0
			
 
				  *
			
 
				- * Target Processor:  Cortex-M cores
			
 
				+ * Target Processor:  Cortex-M and Cortex-A cores
			
 
				  *
			
 
				  * -------------------------------------------------------------------- */
			
 
				 
			
@@ -34,10 +34,10 @@
 
				 
			
 
				   /**
			
 
				    * @brief Q7 average pooling function
			
 
				-   * @param[in]       dim_im_in_height   input tensor dimention
			
 
				-   * @param[in]       dim_im_in_width    input tensor dimention
			
 
				-   * @param[in]       dim_im_out_height  output tensor dimension
			
 
				-   * @param[in]       dim_im_out_width   output tensor dimension
			
 
				+   * @param[in]       dim_src_height   input tensor dimension
			
 
				+   * @param[in]       dim_src_width    input tensor dimension
			
 
				+   * @param[in]       dim_dst_height  output tensor dimension
			
 
				+   * @param[in]       dim_dst_width   output tensor dimension
			
 
				    * @param[in]       stride_height      stride
			
 
				    * @param[in]       stride_width       stride
			
 
				    * @param[in]       dim_kernel_height  filter kernel size
			
@@ -46,57 +46,16 @@
 
				    * @param[in]       padding_width      padding sizes
			
 
				    * @param[in]       act_min            Min clamping
			
 
				    * @param[in]       act_max            Max clamping
			
 
				-   * @param[in]       ch_im_in           number of input tensor channels
			
 
				-   * @param[in,out]   Im_in              pointer to input tensor
			
 
				+   * @param[in]       ch_src           number of input tensor channels
			
 
				+   * @param[in,out]   src              pointer to input tensor
			
 
				    * @param[in,out]   bufferA            temp buffer
			
 
				-   * @param[in,out]   Im_out             pointer to output tensor
			
 
				+   * @param[in,out]   dst             pointer to output tensor
			
 
				    * @return none.
			
 
				    *
			
 
				    * @details
			
 
				    *
			
 
				    *
			
 
				    */
			
 
				-#if defined (ARM_MATH_DSP)
			
 
				-static void accumulate_q7_to_q15(q15_t * base, q7_t * target, const uint16_t length)
			
 
				-{
			
 
				-    q15_t    *pCnt = base;
			
 
				-    q7_t     *pV = target;
			
 
				-    q31_t     v1, v2, vo1, vo2;
			
 
				-    uint16_t  cnt = length >> 2;
			
 
				-    q31_t     in;
			
 
				-
			
 
				-    while (cnt > 0u)
			
 
				-    {
			
 
				-        q31_t     value = *__SIMD32(pV)++;
			
 
				-        v1 = __SXTB16(__ROR(value, 8));
			
 
				-        v2 = __SXTB16(value);
			
 
				-#ifndef ARM_MATH_BIG_ENDIAN
			
 
				-
			
 
				-        vo2 = __PKHTB(v1, v2, 16);
			
 
				-        vo1 = __PKHBT(v2, v1, 16);
			
 
				-
			
 
				-#else
			
 
				-
			
 
				-        vo1 = __PKHTB(v1, v2, 16);
			
 
				-        vo2 = __PKHBT(v2, v1, 16);
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				-        in = *__SIMD32(pCnt);
			
 
				-        *__SIMD32(pCnt)++ = __QADD16(vo1, in);
			
 
				-
			
 
				-        in = *__SIMD32(pCnt);
			
 
				-        *__SIMD32(pCnt)++ = __QADD16(vo2, in);
			
 
				-
			
 
				-        cnt--;
			
 
				-    }
			
 
				-    cnt = length & 0x3;
			
 
				-    while (cnt > 0u)
			
 
				-    {
			
 
				-        *pCnt++ += *pV++;
			
 
				-        cnt--;
			
 
				-    }
			
 
				-}
			
 
				 
			
 
				 static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t length, uint16_t scale)
			
 
				 {
			
@@ -109,6 +68,8 @@ static void buffer_scale_back_q15_to_q7(q15_t * buffer, q7_t * target, uint16_t
 
				     }
			
 
				 }
			
 
				 
			
 
				+#if defined (ARM_MATH_DSP)
			
 
				+
			
 
				 static void buffer_scale_back_q15_to_q7_and_clamp(q15_t * buffer, q7_t * target, uint16_t length, uint16_t count,const int act_min,
			
 
				   const int act_max)
			
 
				 {
			
@@ -128,10 +89,10 @@ static void buffer_scale_back_q15_to_q7_and_clamp(q15_t * buffer, q7_t * target,
 
				 #endif
			
 
				 
			
 
				 void
			
 
				-arm_avgpool_s8( const int dim_im_in_height,
			
 
				-  const int dim_im_in_width,
			
 
				-  const int dim_im_out_height,
			
 
				-  const int dim_im_out_width,
			
 
				+arm_avgpool_s8( const int dim_src_height,
			
 
				+  const int dim_src_width,
			
 
				+  const int dim_dst_height,
			
 
				+  const int dim_dst_width,
			
 
				   const int stride_height,
			
 
				   const int stride_width,
			
 
				   const int dim_kernel_height,
			
@@ -140,13 +101,13 @@ arm_avgpool_s8( const int dim_im_in_height,
 
				   const int padding_width,
			
 
				   const int act_min,
			
 
				   const int act_max,
			
 
				-  const int ch_im_in,
			
 
				-  int8_t *Im_in,
			
 
				+  const int ch_src,
			
 
				+  int8_t *src,
			
 
				   int16_t *bufferA,
			
 
				-  int8_t *Im_out)
			
 
				+  int8_t *dst)
			
 
				 {
			
 
				 
			
 
				-#if defined (ARM_MATH_DSP)
			
 
				+#if defined(ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
			
 
				 
			
 
				     /* Run the following code for Cortex-M4 and Cortex-M7 */
			
 
				 
			
@@ -155,13 +116,13 @@ arm_avgpool_s8( const int dim_im_in_height,
 
				     int16_t   count = 0;
			
 
				 
			
 
				     /* first does the pooling along x axis */
			
 
				-    for (i_y = 0; i_y < dim_im_in_height; i_y++)
			
 
				+    for (i_y = 0; i_y < dim_src_height; i_y++)
			
 
				     {
			
 
				 
			
 
				-        for (i_x = 0; i_x < dim_im_out_width; i_x++)
			
 
				+        for (i_x = 0; i_x < dim_dst_width; i_x++)
			
 
				         {
			
 
				-            /* for each output pixel */
			
 
				-            q7_t     *target = Im_in + (i_y * dim_im_in_width + i_x) * ch_im_in;
			
 
				+            /* for each output sample */
			
 
				+            q7_t     *target = src + (i_y * dim_src_width + i_x) * ch_src;
			
 
				             q7_t     *win_start;
			
 
				             q7_t     *win_stop;
			
 
				             if (i_x * stride_width - padding_width < 0)
			
@@ -169,71 +130,70 @@ arm_avgpool_s8( const int dim_im_in_height,
 
				                 win_start = target;
			
 
				             } else
			
 
				             {
			
 
				-                win_start = Im_in + (i_y * dim_im_in_width + i_x * stride_width - padding_width) * ch_im_in;
			
 
				+                win_start = src + (i_y * dim_src_width + i_x * stride_width - padding_width) * ch_src;
			
 
				             }
			
 
				 
			
 
				-            if (i_x * stride_width - padding_width + dim_kernel_width >= dim_im_in_width)
			
 
				+            if (i_x * stride_width - padding_width + dim_kernel_width >= dim_src_width)
			
 
				             {
			
 
				-                win_stop = Im_in + (i_y * dim_im_in_width + dim_im_in_width) * ch_im_in;
			
 
				+                win_stop = src + (i_y * dim_src_width + dim_src_width) * ch_src;
			
 
				             } else
			
 
				             {
			
 
				-                win_stop = Im_in + (i_y * dim_im_in_width + i_x * stride_width - padding_width + dim_kernel_width) * ch_im_in;
			
 
				+                win_stop = src + (i_y * dim_src_width + i_x * stride_width - padding_width + dim_kernel_width) * ch_src;
			
 
				             }
			
 
				             /* first step is to copy over initial data */
			
 
				-            arm_q7_to_q15_no_shift(win_start, buffer, ch_im_in);
			
 
				+            arm_q7_to_q15_no_shift(win_start, buffer, ch_src);
			
 
				             count = 1;
			
 
				 
			
 
				-            /* start the max operation from the second part */
			
 
				-            win_start += ch_im_in;
			
 
				-            for (; win_start < win_stop; win_start += ch_im_in)
			
 
				+            /* start the average operation from the second part */
			
 
				+            win_start += ch_src;
			
 
				+            for (; win_start < win_stop; win_start += ch_src)
			
 
				             { 
			
 
				-                accumulate_q7_to_q15(buffer, win_start, ch_im_in);
			
 
				+                arm_nn_accumulate_q7_to_q15(buffer, win_start, ch_src);
			
 
				                 count++;
			
 
				             }
			
 
				-            buffer_scale_back_q15_to_q7(buffer, target, ch_im_in, count);
			
 
				+            buffer_scale_back_q15_to_q7(buffer, target, ch_src, count);
			
 
				         }
			
 
				     }
			
 
				 
			
 
				 
			
 
				     /* then does the pooling along y axis */
			
 
				-    for (i_y = 0; i_y < dim_im_out_height; i_y++)
			
 
				+    for (i_y = 0; i_y < dim_dst_height; i_y++)
			
 
				     {
			
 
				         /* for each output row */
			
 
				-        q7_t     *target = Im_out + i_y * dim_im_out_width * ch_im_in;
			
 
				+        q7_t     *target = dst + i_y * dim_dst_width * ch_src;
			
 
				         q7_t     *row_start;
			
 
				         q7_t     *row_end;
			
 
				         /* setting the starting row */
			
 
				         if (i_y * stride_height - padding_height < 0)
			
 
				         {
			
 
				-            row_start = Im_in;
			
 
				+            row_start = src;
			
 
				         } else
			
 
				         {
			
 
				-            row_start = Im_in + (i_y * stride_height - padding_height) * dim_im_in_width * ch_im_in;
			
 
				+            row_start = src + (i_y * stride_height - padding_height) * dim_src_width * ch_src;
			
 
				         }
			
 
				         /* setting the stopping row */
			
 
				-        if (i_y * stride_height - padding_height + dim_kernel_height >= dim_im_in_height)
			
 
				+        if (i_y * stride_height - padding_height + dim_kernel_height >= dim_src_height)
			
 
				         {
			
 
				-            row_end = Im_in + dim_im_in_height * dim_im_in_width * ch_im_in;
			
 
				+            row_end = src + dim_src_height * dim_src_width * ch_src;
			
 
				         } else
			
 
				         {
			
 
				-            row_end = Im_in + (i_y * stride_height - padding_height + dim_kernel_height) * dim_im_in_width * ch_im_in;
			
 
				+            row_end = src + (i_y * stride_height - padding_height + dim_kernel_height) * dim_src_width * ch_src;
			
 
				         }
			
 
				 
			
 
				         /* copy over the first row */
			
 
				-        arm_q7_to_q15_no_shift(row_start, buffer, dim_im_out_width * ch_im_in);
			
 
				+        arm_q7_to_q15_no_shift(row_start, buffer, dim_dst_width * ch_src);
			
 
				         count = 1;
			
 
				-        //("sum %d\n",buffer[0]);
			
 
				 
			
 
				         /* move over to next row */
			
 
				-        row_start += ch_im_in * dim_im_in_width;
			
 
				+        row_start += ch_src * dim_src_width;
			
 
				 
			
 
				-        for (; row_start < row_end; row_start += dim_im_in_width * ch_im_in)
			
 
				+        for (; row_start < row_end; row_start += dim_src_width * ch_src)
			
 
				         {
			
 
				-            accumulate_q7_to_q15(buffer, row_start, dim_im_out_width * ch_im_in);
			
 
				+            arm_nn_accumulate_q7_to_q15(buffer, row_start, dim_dst_width * ch_src);
			
 
				 
			
 
				             count++;
			
 
				         }
			
 
				-        buffer_scale_back_q15_to_q7_and_clamp(buffer, target, dim_im_out_width * ch_im_in, count,act_min,act_max);
			
 
				+        buffer_scale_back_q15_to_q7_and_clamp(buffer, target, dim_dst_width * ch_src, count,act_min,act_max);
			
 
				     }
			
 
				 
			
 
				 #else
			
@@ -244,11 +204,11 @@ arm_avgpool_s8( const int dim_im_in_height,
 
				     int16_t   k_x, k_y;
			
 
				 
			
 
				     
			
 
				-    for (i_y = 0; i_y < dim_im_out_height; i_y++)
			
 
				+    for (i_y = 0; i_y < dim_dst_height; i_y++)
			
 
				     {
			
 
				-        for (i_x = 0; i_x < dim_im_out_width; i_x++)
			
 
				+        for (i_x = 0; i_x < dim_dst_width; i_x++)
			
 
				         {
			
 
				-            for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
			
 
				+            for (i_ch_in = 0; i_ch_in < ch_src; i_ch_in++)
			
 
				             {
			
 
				                 int       sum = 0;
			
 
				                 int       count = 0;
			
@@ -256,19 +216,18 @@ arm_avgpool_s8( const int dim_im_in_height,
 
				                 {
			
 
				                     for (k_x = i_x * stride_width - padding_width; k_x < i_x * stride_width - padding_width + dim_kernel_width; k_x++)
			
 
				                     {
			
 
				-                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in_height && k_x < dim_im_in_width)
			
 
				+                        if (k_y >= 0 && k_x >= 0 && k_y < dim_src_height && k_x < dim_src_width)
			
 
				                         {
			
 
				-                            sum += Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in_width)];
			
 
				+                            sum += src[i_ch_in + ch_src * (k_x + k_y * dim_src_width)];
			
 
				                             count++;
			
 
				                         }
			
 
				                     }
			
 
				                 }
			
 
				-                // Round to the closest integer value.
			
 
				                 sum = sum > 0 ? (sum + count / 2) / count : (sum - count / 2) / count;
			
 
				                 sum = MAX(sum, act_min);
			
 
				                 sum = MIN(sum, act_max);
			
 
				 
			
 
				-                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out_width)] = sum;
			
 
				+                dst[i_ch_in + ch_src * (i_x + i_y * dim_dst_width)] = sum;
			
 
				             }
			
 
				         }
			
 
				     }