Kaynağa Gözat

CMSIS-NN: Add MVEI support for int16 depth-wise convolution (#1521)

hmogensen-arm 3 yıl önce
ebeveyn
işleme
f87f7a7a22
75 değiştirilmiş dosya ile 2089 ekleme ve 54 silme
  1. 2 0
      ARM.CMSIS.pdsc
  2. 2 1
      CMSIS/DoxyGen/NN/src/history.txt
  3. 56 7
      CMSIS/NN/Include/arm_nnsupportfunctions.h
  4. 5 5
      CMSIS/NN/README.md
  5. 154 18
      CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c
  6. 3 4
      CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c
  7. 6 7
      CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c
  8. 9 7
      CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c
  9. 3 2
      CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt
  10. 171 0
      CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c
  11. 2 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers/bias.txt
  12. 76 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers/input.txt
  13. 73 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers/kernel.txt
  14. 49 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/input.txt
  15. 49 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/kernel.txt
  16. 17 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_null_bias/input.txt
  17. 33 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_null_bias/kernel.txt
  18. 49 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_spill_null_bias/input.txt
  19. 46 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_spill_null_bias/kernel.txt
  20. 33 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_stride_null_bias/input.txt
  21. 33 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_stride_null_bias/kernel.txt
  22. 2 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_test_bias/bias.txt
  23. 17 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_test_bias/input.txt
  24. 33 0
      CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_test_bias/kernel.txt
  25. 6 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/biases_data.h
  26. 24 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/config_data.h
  27. 48 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/input_data.h
  28. 6 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/output_mult_data.h
  29. 19 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/output_ref_data.h
  30. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/output_shift_data.h
  31. 8 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/test_data.h
  32. 9 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/weights_data.h
  33. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/biases_data.h
  34. 24 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/config_data.h
  35. 33 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/input_data.h
  36. 6 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/output_mult_data.h
  37. 15 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/output_ref_data.h
  38. 6 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/output_shift_data.h
  39. 8 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/test_data.h
  40. 8 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/weights_data.h
  41. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/biases_data.h
  42. 24 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/config_data.h
  43. 15 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/input_data.h
  44. 6 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/output_mult_data.h
  45. 10 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/output_ref_data.h
  46. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/output_shift_data.h
  47. 8 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/test_data.h
  48. 7 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/weights_data.h
  49. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/biases_data.h
  50. 24 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/config_data.h
  51. 23 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/input_data.h
  52. 9 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/output_mult_data.h
  53. 14 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/output_ref_data.h
  54. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/output_shift_data.h
  55. 8 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/test_data.h
  56. 7 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/weights_data.h
  57. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/biases_data.h
  58. 24 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/config_data.h
  59. 24 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/input_data.h
  60. 6 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/output_mult_data.h
  61. 10 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/output_ref_data.h
  62. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/output_shift_data.h
  63. 8 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/test_data.h
  64. 7 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/weights_data.h
  65. 6 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/biases_data.h
  66. 24 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/config_data.h
  67. 15 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/input_data.h
  68. 6 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/output_mult_data.h
  69. 10 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/output_ref_data.h
  70. 5 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/output_shift_data.h
  71. 8 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/test_data.h
  72. 7 0
      CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/weights_data.h
  73. 24 0
      CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_fast_s16/Unity/unity_test_arm_depthwise_conv_fast_s16.c
  74. 533 3
      CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_fast_s16/test_arm_depthwise_conv_fast_s16.c
  75. 34 0
      CMSIS/NN/Tests/UnitTest/generate_test_data.py

+ 2 - 0
ARM.CMSIS.pdsc

@@ -15,6 +15,7 @@
        - Changed return types of all API's
        - Support for int16 average pooling DSP implementation
        - Support for DSP extension optimization for int16 depthwise_conv
+       - Support for MVEI extension optimization for int16 depthwise_conv
        - Support for MVEI extension optimization for int16 max pooling
     </release>
     <release version="5.9.0" date="2022-05-02">
@@ -2297,6 +2298,7 @@ and 8-bit Java bytecodes in Jazelle state.
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_q7_to_q15_with_offset.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_accumulate_q7_to_q15.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mult_nt_t_s8.c"/>
+        <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_mat_mul_kernel_s16.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_padded_s8.c"/>
         <file category="source" name="CMSIS/NN/Source/NNSupportFunctions/arm_nn_add_q7.c"/>

+ 2 - 1
CMSIS/DoxyGen/NN/src/history.txt

@@ -13,6 +13,7 @@
       <li> Replaced arm_status with arm_cmsis_nn_status struct </li>
       <li> Added DSP support in arm_avgpool_s16.c </li>
       <li> Added support for DSP extension optimization for int16 depthwise_conv </li>
+      <li> Added support for MVEI extension optimization for int16 depthwise_conv </li>
       <li> Added support for MVEI extension optimization for int16 max pooling </li>
       </ul>
     </td>
@@ -51,7 +52,7 @@
     <td>V3.0.0</td>
     <td>
     <ul>
-      <li>Updated arm_fully_connected_s8 to use zero weight offset<br> 
+      <li>Updated arm_fully_connected_s8 to use zero weight offset<br>
       as per the TFLM int8 quantization spec. The API is the same but,<br>
       the weight offset parameter is expected to be zero<br> </li>
       <li> Added unit test for Softmax </li>

+ 56 - 7
CMSIS/NN/Include/arm_nnsupportfunctions.h

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_nnsupportfunctions.h
  * Description:  Public header file of support functions for CMSIS NN Library
  *
- * $Date:        10 May 2022
- * $Revision:    V.8.1.0
+ * $Date:        6 July 2022
+ * $Revision:    V.8.2.0
  *
  * Target Processor:  Cortex-M CPUs
  * -------------------------------------------------------------------- */
@@ -557,6 +557,43 @@ q7_t *arm_nn_depthwise_conv_nt_t_s8(const q7_t *lhs,
                                     const int32_t *const output_bias,
                                     q7_t *out);
 
+/**
+ * @brief Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases.
+ *        Dimensions are the same for lhs and rhs.
+ *
+ * @param[in]      lhs             Input left-hand side matrix
+ * @param[in]      rhs             Input right-hand side matrix (transposed)
+ * @param[in]      num_ch          Number of channels in LHS/RHS
+ * @param[in]      out_shift       Per channel output shift. Length of vector is equal to number of channels.
+ * @param[in]      out_mult        Per channel output multiplier. Length of vector is equal to number of channels.
+ * @param[in]      activation_min  Minimum value to clamp the output to. Range: int16
+ * @param[in]      activation_max  Maximum value to clamp the output to. Range: int16
+ * @param[in]      row_x_col       (row_dimension * col_dimension) of LHS/RHS matrix
+ * @param[in]      output_bias     Per channel output bias. Length of vector is equal to number of channels.
+ * @param[out]     out             Output pointer
+ *
+ * @return         The function returns one of the two
+ *                  - Updated output pointer if an implementation is available
+ *                  - NULL if no implementation is available.
+ *
+ * @note           If the number of channels is not a multiple of 4, up to 3 elements outside the boundary
+ *                 will be read for the following.
+ *                  - Output shift
+ *                  - Output multiplier
+ *                  - Output bias
+ *                  - rhs
+ */
+int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs,
+                                        const q7_t *rhs,
+                                        const uint16_t num_ch,
+                                        const int32_t *out_shift,
+                                        const int32_t *out_mult,
+                                        const int32_t activation_min,
+                                        const int32_t activation_max,
+                                        const uint16_t row_x_col,
+                                        const int64_t *const output_bias,
+                                        int16_t *out);
+
 /**
  *@brief Matrix-multiplication function for convolution with reordered columns
  *@param[in]       pA          pointer to operand A
@@ -662,8 +699,8 @@ __STATIC_FORCEINLINE void arm_memset_q7(q7_t *dst, const q7_t val, uint32_t bloc
                    "   vstrb.8                 q0, [%[in]], #16            \n"
                    "   letp                    lr, 2b                     \n"
                    "1:                                                    \n"
-                   : [ in ] "+r"(dst)
-                   : [ cnt ] "r"(block_size), [ set_val ] "r"(val)
+                   : [in] "+r"(dst)
+                   : [cnt] "r"(block_size), [set_val] "r"(val)
                    : "q0", "memory", "r14");
 #else
     memset(dst, val, block_size);
@@ -1010,14 +1047,26 @@ __STATIC_FORCEINLINE void arm_memcpy_q7(q7_t *__RESTRICT dst, const q7_t *__REST
                    "   vstrb.8                 q0, [%[out]], #16           \n"
                    "   letp                    lr, 2b                     \n"
                    "1:                                                    \n"
-                   : [ in ] "+r"(src), [ out ] "+r"(dst)
-                   : [ cnt ] "r"(block_size)
+                   : [in] "+r"(src), [out] "+r"(dst)
+                   : [cnt] "r"(block_size)
                    : "q0", "memory", "r14");
 #else
     memcpy(dst, src, block_size);
 #endif
 }
 
+/**
+ * @brief           memcpy wrapper for int16
+ * @param[in, out]  dst         Destination pointer
+ * @param[in]       src         Source pointer.
+ * @param[in]       block_size  Number of bytes to copy.
+ *
+ */
+__STATIC_FORCEINLINE void arm_memcpy_q15(q15_t *__RESTRICT dst, const q15_t *__RESTRICT src, uint32_t block_size)
+{
+    memcpy(dst, src, block_size);
+}
+
 #if defined(ARM_MATH_MVEI)
 /**
  * @brief           Vector saturating doubling high multiply returning high half.

+ 5 - 5
CMSIS/NN/README.md

@@ -28,17 +28,17 @@ Group | API | Base Operator | Input Constraints | Additional memory required for
 ||arm_convolve_wrapper_s8()|CONV| None |n.a.| Yes | Yes |The additional memory required depends on the optimal convolution function called.|
 ||arm_convolve_s8()|CONV| None |4 * (ker_x * ker_y * input_ch + delta)| Yes | Yes |delta - MVE only|
 ||arm_convolve_1x1_s8_fast() | CONV | dilation = 1 <br/> ker_x = 1, ker_y = 1 <br/> pad = 0<br/> stride = 1<br/> input_ch % 4 = 0| No | Yes |Yes ||
-||arm_convolve_1_x_n_s8() | CONV | dilation = 1 <br/> output_y % 4 = 0 | Yes. Refer API for details |Yes |Yes|Not all implementations require additional memory|
+||arm_convolve_1_x_n_s8() | CONV | dilation = 1 <br/> output_y % 4 = 0 | Yes. Refer to API for details |Yes |Yes|Not all implementations require additional memory|
 ||arm_depthwise_conv_wrapper_s8()| DEPTHWISE_CONV | None |n.a.| Yes| Yes| The additional memory required depends on the optimal convolution function called|
 ||arm_depthwise_conv_3x3_s8() | DEPTHWISE_CONV | dilation = 1 <br/> depth_multiplier = 1 <br/> pad_x <= 1 | No|No|No| Preferred function for 3x3 kernel size for DSP extension. </br> For MVE, use arm_depthwise_conv_s8_opt()||
 ||arm_depthwise_conv_s8() | DEPTHWISE_CONV | None | No|No|No||
 ||arm_depthwise_conv_s8_opt()| DEPTHWISE_CONV | dilation = 1 <br/> depth_multiplier = 1 | DSP: 2 * ker_x * ker_y * input_ch <br/> MVE: 2 * DSP + 4 | Yes| Yes| Best case is when channels are multiple of 4 or <br/>at the least >= 4 |
 ||arm_convolve_wrapper_s16()|CONV|None|n.a.| Yes | No | The additional memory required depends on the optimal convolution function called |
 ||arm_convolve_s16()|CONV|None|No| No | No ||
-||arm_convolve_fast_s16()|CONV|dilation = 1, <br/> ker_x * ker_y * input_ch < 512 <br/> |4 * ker_x * ker_y * input_ch| Yes | No ||
-||arm_depthwise_conv_wrapper_s16() | DEPTHWISE_CONV | None | n.a. | Yes | No | The additional memory required depends on the optimal convolution function called |
-||arm_depthwise_conv_s16() | DEPTHWISE_CONV | None | No|No|No||
-||arm_depthwise_conv_fast_s16() | DEPTHWISE_CONV | Yes | 4 * ker_x * ker_y * input_ch | Yes | No ||
+||arm_convolve_fast_s16()|CONV|dilation = 1, <br/> ker_x * ker_y * input_ch < 512 <br/> |4 * ker_x * ker_y * input_ch| Yes | Yes ||
+||arm_depthwise_conv_wrapper_s16() | DEPTHWISE_CONV | None | n.a. | Yes | Yes | The additional memory required depends on the optimal convolution function called |
+||arm_depthwise_conv_s16() | DEPTHWISE_CONV | None | No | Yes ||
+||arm_depthwise_conv_fast_s16() | DEPTHWISE_CONV | Yes | Yes. Refer to API for details | Yes | Yes ||
 |[Fully Connected](https://arm-software.github.io/CMSIS_5/NN/html/group__FC.html)||||| |  | |
 ||arm_fully_connected_s8() |FULLY CONNECTED & <br/> MAT MUL  | None | No | Yes | Yes | |
 ||arm_fully_connected_s16() |FULLY CONNECTED & <br/> MAT MUL  | None | No | Yes | No | |

+ 154 - 18
CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_fast_s16.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -22,8 +22,8 @@
  * Description:  Optimized s16 depthwise separable convolution function for
  *               channel multiplier of 1.
  *
- * $Date:        May 19, 2022
- * $Revision:    V.1.0.0
+ * $Date:        6 July 2022
+ * $Revision:    V.1.1.0
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -69,7 +69,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
         return ARM_CMSIS_NN_ARG_ERROR;
     }
 
-    if (filter_dims->w * filter_dims->h * input_ch >= 512)
+    if (filter_dims->w * filter_dims->h >= 512)
     {
         return ARM_CMSIS_NN_ARG_ERROR;
     }
@@ -78,10 +78,12 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
     {
         return ARM_CMSIS_NN_ARG_ERROR;
     }
-#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
-    const int32_t input_batches = input_dims->n;
+
+#if defined(ARM_MATH_DSP)
+    (void)bias_dims;
     const int32_t input_x = input_dims->w;
     const int32_t input_y = input_dims->h;
+    const int32_t input_batches = input_dims->n;
     const int32_t kernel_x = filter_dims->w;
     const int32_t kernel_y = filter_dims->h;
     const int32_t pad_x = dw_conv_params->padding.w;
@@ -96,7 +98,124 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
     const int32_t output_activation_max = dw_conv_params->activation.max;
     q15_t *buffer_a = (q15_t *)ctx->buf;
 
-    (void)bias_dims;
+#if defined(ARM_MATH_MVEI)
+    int16_t *lhs_buffer = buffer_a;
+    int16_t *out = output;
+    int buffer_count = 0;
+    const int32_t kernel_size = kernel_x * kernel_y;
+
+    for (int i_batch = 0; i_batch < input_batches; i_batch++)
+    {
+        /* This part implements the im2col function */
+        for (int i_out_y = 0, base_idx_y = -pad_y; i_out_y < output_y; base_idx_y += stride_y, i_out_y++)
+        {
+            for (int i_out_x = 0, base_idx_x = -pad_x; i_out_x < output_x; base_idx_x += stride_x, i_out_x++)
+            {
+                for (int i_ker_y = base_idx_y; i_ker_y < base_idx_y + kernel_y; i_ker_y++)
+                {
+                    for (int i_ker_x = base_idx_x; i_ker_x < base_idx_x + kernel_x; i_ker_x++)
+                    {
+                        if (i_ker_y < 0 || i_ker_y >= input_y || i_ker_x < 0 || i_ker_x >= input_x)
+                        {
+                            memset(lhs_buffer, (int16_t)0, (uint32_t)(input_ch * sizeof(int16_t)));
+                        }
+                        else
+                        {
+                            arm_memcpy_q15(lhs_buffer,
+                                           (int16_t *)(input + (i_ker_y * input_x + i_ker_x) * input_ch),
+                                           (uint32_t)(input_ch * sizeof(int16_t)));
+                        }
+                        lhs_buffer += input_ch;
+                    }
+                }
+                buffer_count++;
+                if (buffer_count == 4)
+                {
+                    lhs_buffer = buffer_a;
+
+                    out = arm_nn_depthwise_conv_nt_t_s16(lhs_buffer,
+                                                         kernel,
+                                                         input_ch,
+                                                         output_shift,
+                                                         output_mult,
+                                                         output_activation_min,
+                                                         output_activation_max,
+                                                         kernel_size,
+                                                         bias,
+                                                         out);
+                    buffer_count = 0;
+                }
+            }
+        }
+        input += input_x * input_y * input_ch;
+    }
+
+    /* Handle left over buffers */
+    lhs_buffer = buffer_a;
+    for (int i_buf = 0; i_buf < buffer_count; i_buf++)
+    {
+        int32_t loop_count = (input_ch + 3) / 4;
+        int32_t num_ch_to_process = input_ch;
+
+        for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++)
+        {
+            const int8_t *row_0 = kernel + offset;
+            const int16_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset;
+
+            int32x4_t out_0 = vdupq_n_s32(0);
+
+            for (int i_ker = 0; i_ker < kernel_size; i_ker++)
+            {
+                const int32x4_t ker_0 = vldrbq_s32(row_0);
+
+                int32x4_t ip_0 = vldrhq_s32(col_0);
+                out_0 += vmulq_s32(ip_0, ker_0);
+
+                col_0 += input_ch;
+                row_0 += input_ch;
+            }
+
+            int64_t in_requantize_0 = (int64_t)out_0[0];
+            int64_t in_requantize_1 = (int64_t)out_0[1];
+            int64_t in_requantize_2 = (int64_t)out_0[2];
+            int64_t in_requantize_3 = (int64_t)out_0[3];
+
+            if (bias)
+            {
+                in_requantize_0 += bias[offset];
+                in_requantize_1 += bias[offset + 1];
+                in_requantize_2 += bias[offset + 2];
+                in_requantize_3 += bias[offset + 3];
+            }
+
+            int32_t reduced_multiplier_0 = REDUCE_MULTIPLIER(output_mult[offset]);
+            int32_t reduced_multiplier_1 = REDUCE_MULTIPLIER(output_mult[offset + 1]);
+            int32_t reduced_multiplier_2 = REDUCE_MULTIPLIER(output_mult[offset + 2]);
+            int32_t reduced_multiplier_3 = REDUCE_MULTIPLIER(output_mult[offset + 3]);
+
+            out_0[0] = arm_nn_requantize_s64(in_requantize_0, reduced_multiplier_0, output_shift[offset]);
+            out_0[1] = arm_nn_requantize_s64(in_requantize_1, reduced_multiplier_1, output_shift[offset + 1]);
+            out_0[2] = arm_nn_requantize_s64(in_requantize_2, reduced_multiplier_2, output_shift[offset + 2]);
+            out_0[3] = arm_nn_requantize_s64(in_requantize_3, reduced_multiplier_3, output_shift[offset + 3]);
+
+            out_0 = vmaxq_s32(out_0, vdupq_n_s32(output_activation_min));
+            out_0 = vminq_s32(out_0, vdupq_n_s32(output_activation_max));
+
+            mve_pred16_t p = vctp32q((uint32_t)num_ch_to_process);
+            vstrhq_p_s32(out, out_0, p);
+
+            out += 4;
+        }
+
+        const int tail_ch = input_ch & 0x3;
+        if (tail_ch != 0)
+        {
+            out -= (4 - tail_ch);
+        }
+    }
+
+#else // ARM_MATH_DSP
+
     /* Run the following code in cores using DSP extension */
     q15_t *const col_buffer_start = buffer_a;
     q15_t *col_buffer = col_buffer_start;
@@ -143,9 +262,9 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
                         }
                         else
                         {
-                            memcpy(&col_buffer[index],
-                                   input + (idx_y * input_x + idx_x) * input_ch,
-                                   input_ch * sizeof(q15_t));
+                            arm_memcpy_q15(&col_buffer[index],
+                                           input + (idx_y * input_x + idx_x) * input_ch,
+                                           input_ch * sizeof(q15_t));
                         }
                         index += input_ch;
                     }
@@ -237,10 +356,18 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
                         col_count--;
                     }
 
-                    q63_t acc_1 = *bias++ + sum_1;
-                    q63_t acc_2 = *bias++ + sum_2;
-                    q63_t acc_3 = *bias++ + sum_3;
-                    q63_t acc_4 = *bias++ + sum_4;
+                    int64_t acc_1 = sum_1;
+                    int64_t acc_2 = sum_2;
+                    int64_t acc_3 = sum_3;
+                    int64_t acc_4 = sum_4;
+
+                    if (bias)
+                    {
+                        acc_1 += *bias++;
+                        acc_2 += *bias++;
+                        acc_3 += *bias++;
+                        acc_4 += *bias++;
+                    }
 
                     result = arm_nn_requantize_s64(acc_1, output_mult_1, *output_shift++);
                     result = MAX(result, output_activation_min);
@@ -278,7 +405,11 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
                     {
                         sum += row_pos[i * input_ch] * col_pos[i * input_ch];
                     }
-                    q63_t acc = *bias++ + sum;
+                    int64_t acc = sum;
+                    if (bias)
+                    {
+                        acc += *bias++;
+                    }
                     result = arm_nn_requantize_s64(acc, REDUCE_MULTIPLIER(*output_mult), *output_shift++);
                     output_mult++;
                     result = MAX(result, output_activation_min);
@@ -287,7 +418,6 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
 
                     row_count--;
                 }
-
                 // clear counter and pointers
                 col_buffer = col_buffer_start;
             }
@@ -296,6 +426,7 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
         /* Advance to the next batch */
         input += (input_x * input_y * input_ch);
     }
+#endif
 #else
     /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
     return arm_depthwise_conv_s16(ctx,
@@ -317,8 +448,13 @@ arm_cmsis_nn_status arm_depthwise_conv_fast_s16(const cmsis_nn_context *ctx,
 
 int32_t arm_depthwise_conv_fast_s16_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 {
-#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_MVEI)
-    return ((input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t));
+#if defined(ARM_MATH_DSP)
+#if defined(ARM_MATH_MVEI)
+    /* The + 8 accounts for a worst case out of bounds read of the lhs buffers in the *_nt_t_* function.  */
+    return 4 * input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t) + 8;
+#else // ARM_MATH_DSP
+    return input_dims->c * filter_dims->w * filter_dims->h * sizeof(int16_t);
+#endif
 #else
     (void)input_dims;
     (void)filter_dims;

+ 3 - 4
CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title:        arm_depthwise_conv_s8.c
  * Description:  s8 version of depthwise convolution.
  *
- * $Date:        9. May 2022
- * $Revision:    V.3.0.1
+ * $Date:        6 July 2022
+ * $Revision:    V.3.0.2
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -281,7 +281,6 @@ arm_cmsis_nn_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
     const uint16_t dilation_x = dw_conv_params->dilation.w;
     const uint16_t dilation_y = dw_conv_params->dilation.h;
 
-    (void)dw_conv_params->dilation;
     (void)bias_dims;
     (void)ctx;
 

+ 6 - 7
CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_s8_opt.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -22,8 +22,8 @@
  * Description:  Optimized s8 depthwise separable convolution function for
  *               channel multiplier of 1.
  *
- * $Date:        19 April 2022
- * $Revision:    V.3.0.0
+ * $Date:        6 July 2022
+ * $Revision:    V.3.0.1
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -75,6 +75,7 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
         return ARM_CMSIS_NN_ARG_ERROR;
     }
 #ifdef ARM_MATH_DSP
+    (void)bias_dims;
     const int32_t input_x = input_dims->w;
     const int32_t input_y = input_dims->h;
     const int32_t kernel_x = filter_dims->w;
@@ -94,7 +95,6 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
     q15_t *buffer_a = (q15_t *)ctx->buf;
 
 #ifdef ARM_MATH_MVEI
-    (void)bias_dims;
     /* Generate two columns from the input tensor */
     q7_t *lhs_buffer = (q7_t *)buffer_a;
     q7_t *out = output;
@@ -170,8 +170,8 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
     for (int i_buf = 0; i_buf < buffer_count; i_buf++)
     {
         int32_t loop_count = (input_ch + 3) / 4;
-
         int32_t num_ch_to_process = input_ch;
+
         for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count; num_ch_to_process -= 4, offset += 4, i_loop_cnt++)
         {
             const int8_t *col_0 = lhs_buffer + (kernel_size * input_ch * i_buf) + offset;
@@ -211,7 +211,6 @@ arm_cmsis_nn_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
     }
 
 #else // ARM_MATH_DSP
-    (void)bias_dims;
     /* Run the following code in cores using DSP extension */
     q15_t *const col_buffer_start = buffer_a;
     q15_t *col_buffer = col_buffer_start;
@@ -418,7 +417,7 @@ int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dim
 {
 #if defined(ARM_MATH_MVEI)
     /* The + 4 accounts for out of bounds read of the lhs buffers in the *_nt_t_* functions.  */
-    return (2 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int16_t) + 4;
+    return (4 * input_dims->c * filter_dims->w * filter_dims->h) * (int32_t)sizeof(int8_t) + 4;
 #elif defined(ARM_MATH_DSP)
     return (input_dims->c * filter_dims->w * filter_dims->h) * sizeof(int16_t);
 #else

+ 9 - 7
CMSIS/NN/Source/ConvolutionFunctions/arm_depthwise_conv_wrapper_s16.c

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2022 Arm Limited or its affiliates.
+ * SPDX-FileCopyrightText: Copyright 2010-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -22,8 +22,8 @@
  * Description:  Wrapper API to select appropriate depthwise conv API based
  *               on dimensions.
  *
- * $Date:        19. May 2022
- * $Revision:    V.1.0.0
+ * $Date:        6 July 2022
+ * $Revision:    V.1.0.1
  *
  * Target Processor:  Cortex-M CPUs
  *
@@ -40,6 +40,10 @@
  * @{
  */
 
+/*
+ * Evaluates to non-zero when the wrapper should dispatch to the fast s16 depthwise convolution:
+ * channel multiplier of 1, no dilation in either direction, and filter_w * filter_h * input_c below 512.
+ *
+ * All three arguments are pointers (they are dereferenced with '->') and are evaluated more than once,
+ * so they must be free of side effects. Each use is parenthesized so that any pointer-valued expression
+ * can be passed safely.
+ */
+#define USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims)                                             \
+    ((dw_conv_params)->ch_mult == 1 && (dw_conv_params)->dilation.w == 1 && (dw_conv_params)->dilation.h == 1 &&       \
+     (filter_dims)->w * (filter_dims)->h * (input_dims)->c < 512)
+
 /*
  *  s16 Depthwise conv wrapper function
  *
@@ -60,8 +64,7 @@ arm_cmsis_nn_status arm_depthwise_conv_wrapper_s16(const cmsis_nn_context *ctx,
 {
     arm_cmsis_nn_status status = ARM_CMSIS_NN_SUCCESS;
 
-    if (dw_conv_params->ch_mult == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1 &&
-        filter_dims->w * filter_dims->h * input_dims->c < 512)
+    if (USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims))
     {
         status = arm_depthwise_conv_fast_s16(ctx,
                                              dw_conv_params,
@@ -105,8 +108,7 @@ int32_t arm_depthwise_conv_wrapper_s16_get_buffer_size(const cmsis_nn_dw_conv_pa
     (void)output_dims;
     int32_t size = 0;
 
-    if (dw_conv_params->ch_mult == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1 &&
-        filter_dims->w * filter_dims->h * input_dims->c < 512)
+    if (USE_FAST_DW_CONV_FUNCTION(dw_conv_params, filter_dims, input_dims))
     {
         size = arm_depthwise_conv_fast_s16_get_buffer_size(input_dims, filter_dims);
     }

+ 3 - 2
CMSIS/NN/Source/NNSupportFunctions/CMakeLists.txt

@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2022 Arm Limited.
+# SPDX-FileCopyrightText: Copyright 2019-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -22,5 +22,6 @@ target_sources(cmsis-nn PRIVATE ${SRC} arm_q7_to_q15_with_offset.c
                                        arm_q7_to_q15_with_offset.c
                                        arm_nn_mat_mul_kernel_s16.c
                                        arm_nn_vec_mat_mult_t_s16.c
-                                       arm_q7_to_q15_no_shift.c)
+                                       arm_q7_to_q15_no_shift.c
+                                       arm_nn_depthwise_conv_nt_t_s16.c)
 

+ 171 - 0
CMSIS/NN/Source/NNSupportFunctions/arm_nn_depthwise_conv_nt_t_s16.c

@@ -0,0 +1,171 @@
+/*
+ * SPDX-FileCopyrightText: Copyright 2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS NN Library
+ * Title:        arm_nn_depthwise_conv_nt_t_s16.c
+ * Description:  Depthwise convolution on matrices with no padding.
+ *
+ * $Date:        6 July 2022
+ * $Revision:    V.1.0.0
+ *
+ * Target Processor:  Cortex-M processors with MVE extension
+ * -------------------------------------------------------------------- */
+
+#include "arm_nnsupportfunctions.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup NNBasicMath
+ * @{
+ */
+
+/*
+ * Depthwise convolution of rhs matrix with 4 lhs matrices with no padding. Dimensions are the same for lhs and rhs.
+ *
+ * Refer to the header file for details.
+ *
+ */
+/*
+ * Computes four output pixels of an s16 depthwise convolution in one call (MVE builds only).
+ *
+ * lhs            Four consecutive input patches, each holding row_x_col * num_ch int16 values.
+ * rhs            int8 kernel values, read with a stride of num_ch per kernel element.
+ * num_ch         Number of channels; processed in groups of four with a predicated tail.
+ * out_shift      Per-channel requantization shifts.
+ * out_mult       Per-channel requantization multipliers.
+ * activation_min Lower clamp applied to the requantized result.
+ * activation_max Upper clamp applied to the requantized result.
+ * row_x_col      Number of kernel elements accumulated per channel.
+ * output_bias    Per-channel 64-bit bias, or NULL to skip the bias addition.
+ * out            Destination; four rows of num_ch int16 values are written.
+ *
+ * Returns a pointer just past the four written output rows (out + 4 * num_ch), or NULL when the
+ * library is built without ARM_MATH_MVEI.
+ */
+int16_t *arm_nn_depthwise_conv_nt_t_s16(const int16_t *lhs,
+                                        const q7_t *rhs,
+                                        const uint16_t num_ch,
+                                        const int32_t *out_shift,
+                                        const int32_t *out_mult,
+                                        const int32_t activation_min,
+                                        const int32_t activation_max,
+                                        const uint16_t row_x_col,
+                                        const int64_t *const output_bias,
+                                        int16_t *out)
+{
+#if defined(ARM_MATH_MVEI)
+
+    const int64_t *bias = output_bias;
+    int32_t loop_count = (num_ch + 3) / 4;
+    uint32_t num_ch_to_process = num_ch;
+
+    for (int i_loop_cnt = 0, offset = 0; i_loop_cnt < loop_count;
+         num_ch_to_process -= 4, offset += 4, out += 4, i_loop_cnt++)
+    {
+        /* Each lhs_N points at the same channel group inside one of the four input patches. */
+        const int8_t *rhs_0 = rhs + offset;
+        const int16_t *lhs_0 = lhs + offset;
+        const int16_t *lhs_1 = lhs + row_x_col * num_ch + offset;
+        const int16_t *lhs_2 = lhs + (row_x_col * num_ch * 2) + offset;
+        const int16_t *lhs_3 = lhs + (row_x_col * num_ch * 3) + offset;
+
+        int32x4_t out_0 = vdupq_n_s32(0);
+        int32x4_t out_1 = vdupq_n_s32(0);
+        int32x4_t out_2 = vdupq_n_s32(0);
+        int32x4_t out_3 = vdupq_n_s32(0);
+
+        /* The widening loads always fetch four lanes, also for a partial tail group; the surplus lanes are
+         * never stored. NOTE(review): this assumes the caller's buffers tolerate the extra read - confirm. */
+        for (int i_row_x_col = 0; i_row_x_col < row_x_col; i_row_x_col++)
+        {
+            const int32x4_t ker_0 = vldrbq_s32(rhs_0);
+
+            int32x4_t ip_0 = vldrhq_s32(lhs_0);
+            out_0 += vmulq_s32(ip_0, ker_0);
+
+            int32x4_t ip_1 = vldrhq_s32(lhs_1);
+            out_1 += vmulq_s32(ip_1, ker_0);
+
+            int32x4_t ip_2 = vldrhq_s32(lhs_2);
+            out_2 += vmulq_s32(ip_2, ker_0);
+
+            int32x4_t ip_3 = vldrhq_s32(lhs_3);
+            out_3 += vmulq_s32(ip_3, ker_0);
+
+            lhs_0 += num_ch;
+            lhs_1 += num_ch;
+            lhs_2 += num_ch;
+            lhs_3 += num_ch;
+
+            rhs_0 += num_ch;
+        }
+
+        /* Only requantize lanes that map to real channels. In a partial tail group the remaining lanes have no
+         * out_mult/out_shift/bias entry, so touching them would read past the end of those arrays. The skipped
+         * lanes are masked out by the predicated stores below. */
+        const int32_t active_lanes = (num_ch_to_process < 4U) ? (int32_t)num_ch_to_process : 4;
+
+        for (int i_requantize = 0; i_requantize < active_lanes; i_requantize++)
+        {
+            int32_t reduced_multiplier = REDUCE_MULTIPLIER(out_mult[i_requantize]);
+            int32_t shift = out_shift[i_requantize];
+            int64_t in_requantize_0 = (int64_t)out_0[i_requantize];
+            int64_t in_requantize_1 = (int64_t)out_1[i_requantize];
+            int64_t in_requantize_2 = (int64_t)out_2[i_requantize];
+            int64_t in_requantize_3 = (int64_t)out_3[i_requantize];
+
+            if (bias)
+            {
+                /* The same per-channel bias applies to all four output pixels. */
+                in_requantize_0 += *bias;
+                in_requantize_1 += *bias;
+                in_requantize_2 += *bias;
+                in_requantize_3 += *bias;
+                bias++;
+            }
+
+            out_0[i_requantize] = arm_nn_requantize_s64(in_requantize_0, reduced_multiplier, shift);
+            out_1[i_requantize] = arm_nn_requantize_s64(in_requantize_1, reduced_multiplier, shift);
+            out_2[i_requantize] = arm_nn_requantize_s64(in_requantize_2, reduced_multiplier, shift);
+            out_3[i_requantize] = arm_nn_requantize_s64(in_requantize_3, reduced_multiplier, shift);
+        }
+
+        /* Predicate enables only the lanes that correspond to remaining channels. */
+        mve_pred16_t p = vctp32q(num_ch_to_process);
+
+        out_0 = vmaxq_s32(out_0, vdupq_n_s32(activation_min));
+        out_0 = vminq_s32(out_0, vdupq_n_s32(activation_max));
+        vstrhq_p_s32(out, out_0, p);
+
+        out_1 = vmaxq_s32(out_1, vdupq_n_s32(activation_min));
+        out_1 = vminq_s32(out_1, vdupq_n_s32(activation_max));
+        vstrhq_p_s32(out + num_ch, out_1, p);
+
+        out_2 = vmaxq_s32(out_2, vdupq_n_s32(activation_min));
+        out_2 = vminq_s32(out_2, vdupq_n_s32(activation_max));
+        vstrhq_p_s32(out + 2 * num_ch, out_2, p);
+
+        out_3 = vmaxq_s32(out_3, vdupq_n_s32(activation_min));
+        out_3 = vminq_s32(out_3, vdupq_n_s32(activation_max));
+        vstrhq_p_s32(out + 3 * num_ch, out_3, p);
+
+        out_mult += 4;
+        out_shift += 4;
+    }
+
+    /* The loop advances out by 4 per group; undo the overshoot caused by a partial tail group so that out
+     * ends up exactly num_ch elements past its starting position. */
+    const int tail_ch = num_ch & 0x3;
+    if (tail_ch != 0)
+    {
+        out -= (4 - tail_ch);
+    }
+
+    /* Skip the three remaining output rows written at out + num_ch, + 2 * num_ch and + 3 * num_ch. */
+    return out + (3 * num_ch);
+#else
+    (void)lhs;
+    (void)rhs;
+    (void)num_ch;
+    (void)out_shift;
+    (void)out_mult;
+    (void)activation_min;
+    (void)activation_max;
+    (void)row_x_col;
+    (void)output_bias;
+    (void)out;
+    return NULL;
+#endif
+}
+
+/**
+ * @} end of NNBasicMath group
+ */

+ 2 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers/bias.txt

@@ -0,0 +1,2 @@
+# 8
+-2.210000000000000000e+04,2.788100000000000000e+04,5.096000000000000000e+03,1.022600000000000000e+04,-5.822000000000000000e+03,2.018300000000000000e+04,1.650100000000000000e+04,3.081800000000000000e+04

+ 76 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers/input.txt

@@ -0,0 +1,76 @@
+# 3,5,5,8
+4.514000000000000000e+03,-6.264000000000000000e+03,2.243700000000000000e+04,-3.008100000000000000e+04,-3.273100000000000000e+04,2.333300000000000000e+04,-6.303000000000000000e+03,-2.871800000000000000e+04
+-2.343600000000000000e+04,-1.999900000000000000e+04,-1.245500000000000000e+04,-3.187500000000000000e+04,-9.662000000000000000e+03,2.847600000000000000e+04,2.018200000000000000e+04,1.410500000000000000e+04
+-1.251500000000000000e+04,2.549100000000000000e+04,1.778100000000000000e+04,-1.518500000000000000e+04,-1.918700000000000000e+04,-1.176000000000000000e+04,-1.262300000000000000e+04,-9.167000000000000000e+03
+4.279000000000000000e+03,3.262800000000000000e+04,-2.558000000000000000e+04,6.303000000000000000e+03,-2.620900000000000000e+04,-2.894600000000000000e+04,2.036000000000000000e+03,8.196000000000000000e+03
+1.744400000000000000e+04,4.500000000000000000e+03,3.847000000000000000e+03,-1.603900000000000000e+04,2.864200000000000000e+04,-8.731000000000000000e+03,2.214700000000000000e+04,-7.940000000000000000e+02
+1.888300000000000000e+04,1.088900000000000000e+04,1.200000000000000000e+01,-6.815000000000000000e+03,-8.290000000000000000e+02,-1.721700000000000000e+04,8.164000000000000000e+03,1.928300000000000000e+04
+-2.161300000000000000e+04,1.753800000000000000e+04,2.129300000000000000e+04,-5.966000000000000000e+03,-2.775300000000000000e+04,9.093000000000000000e+03,1.348700000000000000e+04,-4.789000000000000000e+03
+6.998000000000000000e+03,2.703700000000000000e+04,1.715200000000000000e+04,-2.432600000000000000e+04,1.038900000000000000e+04,-2.371600000000000000e+04,2.167200000000000000e+04,-1.288400000000000000e+04
+-2.688400000000000000e+04,1.299200000000000000e+04,-1.624500000000000000e+04,1.101400000000000000e+04,2.925000000000000000e+04,2.592200000000000000e+04,3.946000000000000000e+03,-2.434300000000000000e+04
+1.905200000000000000e+04,1.215300000000000000e+04,-2.502200000000000000e+04,4.532000000000000000e+03,1.952300000000000000e+04,1.816600000000000000e+04,-5.593000000000000000e+03,-6.730000000000000000e+03
+-2.578800000000000000e+04,1.277100000000000000e+04,7.080000000000000000e+02,-2.009900000000000000e+04,2.264500000000000000e+04,1.158800000000000000e+04,-1.649000000000000000e+04,-7.065000000000000000e+03
+-1.201500000000000000e+04,1.563400000000000000e+04,3.052600000000000000e+04,2.827700000000000000e+04,2.475200000000000000e+04,2.716800000000000000e+04,1.192100000000000000e+04,1.429800000000000000e+04
+3.233800000000000000e+04,3.118500000000000000e+04,-2.666700000000000000e+04,-2.105600000000000000e+04,-3.028200000000000000e+04,-6.235000000000000000e+03,-1.259000000000000000e+04,2.642000000000000000e+03
+-1.683300000000000000e+04,4.775000000000000000e+03,-1.608000000000000000e+04,1.870800000000000000e+04,3.449000000000000000e+03,1.212600000000000000e+04,2.694100000000000000e+04,2.913100000000000000e+04
+2.495000000000000000e+04,-1.299100000000000000e+04,1.392700000000000000e+04,-3.011200000000000000e+04,-2.297600000000000000e+04,1.987200000000000000e+04,1.563300000000000000e+04,1.548200000000000000e+04
+8.058000000000000000e+03,-1.158200000000000000e+04,1.227000000000000000e+03,2.826200000000000000e+04,2.193500000000000000e+04,9.733000000000000000e+03,-2.871300000000000000e+04,4.935000000000000000e+03
+-2.576200000000000000e+04,-2.397300000000000000e+04,1.320800000000000000e+04,-7.388000000000000000e+03,1.359800000000000000e+04,1.701300000000000000e+04,1.955000000000000000e+04,2.713100000000000000e+04
+-1.161900000000000000e+04,8.183000000000000000e+03,3.195900000000000000e+04,-1.848100000000000000e+04,-1.434300000000000000e+04,3.028800000000000000e+04,-2.797400000000000000e+04,-1.886000000000000000e+03
+1.253300000000000000e+04,1.497500000000000000e+04,6.799000000000000000e+03,3.206300000000000000e+04,2.312000000000000000e+04,1.855300000000000000e+04,6.933000000000000000e+03,-1.200000000000000000e+04
+1.642900000000000000e+04,-2.818100000000000000e+04,1.113100000000000000e+04,7.045000000000000000e+03,8.840000000000000000e+03,-1.051100000000000000e+04,-2.140400000000000000e+04,3.026200000000000000e+04
+1.024100000000000000e+04,-2.046500000000000000e+04,-1.597600000000000000e+04,-4.000000000000000000e+00,-2.297300000000000000e+04,-9.300000000000000000e+02,3.203800000000000000e+04,1.851000000000000000e+03
+-3.240600000000000000e+04,6.362000000000000000e+03,-1.218500000000000000e+04,-2.622700000000000000e+04,3.135300000000000000e+04,-1.769800000000000000e+04,2.027300000000000000e+04,1.817200000000000000e+04
+-1.461800000000000000e+04,-9.049000000000000000e+03,2.663800000000000000e+04,3.172200000000000000e+04,4.500000000000000000e+01,-8.207000000000000000e+03,-7.542000000000000000e+03,3.025600000000000000e+04
+3.977000000000000000e+03,-3.803000000000000000e+03,-6.135000000000000000e+03,-6.511000000000000000e+03,-7.829000000000000000e+03,4.396000000000000000e+03,-2.836100000000000000e+04,2.324900000000000000e+04
+2.127000000000000000e+03,-1.956900000000000000e+04,1.510700000000000000e+04,-2.167900000000000000e+04,3.223400000000000000e+04,8.862000000000000000e+03,1.517200000000000000e+04,2.059700000000000000e+04
+2.222000000000000000e+03,3.178100000000000000e+04,-1.672400000000000000e+04,-1.302300000000000000e+04,8.696000000000000000e+03,5.212000000000000000e+03,-3.036500000000000000e+04,7.750000000000000000e+03
+7.714000000000000000e+03,-4.491000000000000000e+03,1.646400000000000000e+04,2.101400000000000000e+04,-2.145800000000000000e+04,-3.191000000000000000e+04,7.383000000000000000e+03,-3.010600000000000000e+04
+2.700700000000000000e+04,3.540000000000000000e+02,-2.871900000000000000e+04,-2.181300000000000000e+04,2.960000000000000000e+03,2.215900000000000000e+04,1.061900000000000000e+04,3.099000000000000000e+04
+-2.177200000000000000e+04,-1.718300000000000000e+04,-2.527200000000000000e+04,3.563000000000000000e+03,4.941000000000000000e+03,1.212300000000000000e+04,2.685000000000000000e+03,-2.343200000000000000e+04
+-2.764700000000000000e+04,2.378000000000000000e+04,1.741900000000000000e+04,-2.472300000000000000e+04,-2.712100000000000000e+04,-4.803000000000000000e+03,-4.830000000000000000e+02,1.208000000000000000e+03
+-1.347500000000000000e+04,-1.725100000000000000e+04,1.942000000000000000e+04,-7.826000000000000000e+03,1.706600000000000000e+04,7.540000000000000000e+03,2.244800000000000000e+04,2.667000000000000000e+03
+-1.480700000000000000e+04,8.150000000000000000e+03,-2.688400000000000000e+04,-7.742000000000000000e+03,1.174700000000000000e+04,-3.039800000000000000e+04,-1.337000000000000000e+04,1.865300000000000000e+04
+2.316700000000000000e+04,3.057800000000000000e+04,1.084400000000000000e+04,2.910100000000000000e+04,9.598000000000000000e+03,3.146200000000000000e+04,-5.056000000000000000e+03,2.403800000000000000e+04
+-8.900000000000000000e+03,-3.215800000000000000e+04,-2.467900000000000000e+04,-2.366400000000000000e+04,7.600000000000000000e+03,-1.148400000000000000e+04,1.045400000000000000e+04,2.372400000000000000e+04
+1.777500000000000000e+04,2.612000000000000000e+03,-1.303100000000000000e+04,-5.835000000000000000e+03,2.091000000000000000e+04,-3.078900000000000000e+04,-3.334000000000000000e+03,5.527000000000000000e+03
+-1.682600000000000000e+04,-2.339400000000000000e+04,-2.831000000000000000e+03,-3.130400000000000000e+04,1.784100000000000000e+04,8.784000000000000000e+03,4.300000000000000000e+03,-2.415500000000000000e+04
+2.094000000000000000e+03,3.747000000000000000e+03,2.542000000000000000e+04,1.697700000000000000e+04,2.077900000000000000e+04,1.567000000000000000e+04,2.185500000000000000e+04,-2.615900000000000000e+04
+-2.394100000000000000e+04,-2.778400000000000000e+04,-2.540000000000000000e+02,-5.185000000000000000e+03,-9.459000000000000000e+03,-2.573000000000000000e+04,1.634400000000000000e+04,1.041800000000000000e+04
+-2.287500000000000000e+04,1.527300000000000000e+04,-3.263000000000000000e+04,-1.748300000000000000e+04,-2.253800000000000000e+04,-2.142200000000000000e+04,-8.329000000000000000e+03,1.048800000000000000e+04
+1.116500000000000000e+04,-2.564000000000000000e+03,5.247000000000000000e+03,2.881400000000000000e+04,3.213000000000000000e+03,-2.369200000000000000e+04,-3.040700000000000000e+04,1.849000000000000000e+03
+-1.858800000000000000e+04,-2.219800000000000000e+04,-7.200000000000000000e+03,-6.210000000000000000e+03,-1.432900000000000000e+04,1.373700000000000000e+04,-3.413000000000000000e+03,-2.495500000000000000e+04
+1.435200000000000000e+04,-1.234900000000000000e+04,-3.199000000000000000e+03,2.314600000000000000e+04,-2.587000000000000000e+04,2.794700000000000000e+04,1.449800000000000000e+04,1.671000000000000000e+03
+-1.506300000000000000e+04,1.331000000000000000e+03,-7.660000000000000000e+03,1.660000000000000000e+03,-1.490700000000000000e+04,-1.759400000000000000e+04,2.878100000000000000e+04,1.917000000000000000e+04
+-2.697200000000000000e+04,1.394100000000000000e+04,2.788800000000000000e+04,-2.080700000000000000e+04,8.100000000000000000e+03,2.066300000000000000e+04,4.881000000000000000e+03,-4.664000000000000000e+03
+2.464800000000000000e+04,-1.534800000000000000e+04,2.698800000000000000e+04,3.764000000000000000e+03,7.064000000000000000e+03,-1.969800000000000000e+04,2.866200000000000000e+04,4.967000000000000000e+03
+-3.144700000000000000e+04,-3.990000000000000000e+03,1.319000000000000000e+03,1.021600000000000000e+04,-2.404000000000000000e+03,5.452000000000000000e+03,-6.171000000000000000e+03,1.610500000000000000e+04
+2.878400000000000000e+04,-2.354000000000000000e+04,-6.038000000000000000e+03,-2.026700000000000000e+04,6.141000000000000000e+03,2.268800000000000000e+04,-1.863700000000000000e+04,1.649700000000000000e+04
+-2.558900000000000000e+04,-2.687700000000000000e+04,8.084000000000000000e+03,2.963000000000000000e+03,1.295600000000000000e+04,-3.018000000000000000e+03,-1.346300000000000000e+04,-2.595900000000000000e+04
+7.863000000000000000e+03,-3.155300000000000000e+04,7.655000000000000000e+03,-2.254400000000000000e+04,-2.389100000000000000e+04,-2.207600000000000000e+04,-5.237000000000000000e+03,3.099400000000000000e+04
+2.414200000000000000e+04,2.141400000000000000e+04,-2.080900000000000000e+04,-1.792300000000000000e+04,-9.170000000000000000e+02,-2.898700000000000000e+04,-6.960000000000000000e+03,-1.285000000000000000e+03
+1.408800000000000000e+04,1.873100000000000000e+04,6.776000000000000000e+03,-5.997000000000000000e+03,-3.200000000000000000e+02,1.353200000000000000e+04,7.767000000000000000e+03,3.178100000000000000e+04
+-8.852000000000000000e+03,5.386000000000000000e+03,2.109900000000000000e+04,-2.248000000000000000e+03,-2.158000000000000000e+04,-2.943000000000000000e+03,1.878200000000000000e+04,-2.715800000000000000e+04
+-1.761000000000000000e+03,-3.015600000000000000e+04,2.276900000000000000e+04,-2.124900000000000000e+04,2.452400000000000000e+04,3.117500000000000000e+04,-2.082200000000000000e+04,-9.223000000000000000e+03
+6.151000000000000000e+03,-1.555200000000000000e+04,2.475600000000000000e+04,2.379500000000000000e+04,4.022000000000000000e+03,1.985300000000000000e+04,-9.712000000000000000e+03,3.043000000000000000e+04
+-1.239100000000000000e+04,-4.296000000000000000e+03,-8.929000000000000000e+03,2.153700000000000000e+04,1.479600000000000000e+04,2.714200000000000000e+04,-1.042400000000000000e+04,-1.658100000000000000e+04
+-3.041000000000000000e+03,-1.354400000000000000e+04,-2.699200000000000000e+04,-1.889000000000000000e+04,-2.246200000000000000e+04,-6.774000000000000000e+03,1.162800000000000000e+04,1.018000000000000000e+03
+-2.562100000000000000e+04,1.835300000000000000e+04,-1.790800000000000000e+04,-1.992200000000000000e+04,2.628500000000000000e+04,1.079000000000000000e+04,2.465500000000000000e+04,2.280200000000000000e+04
+9.859000000000000000e+03,1.566000000000000000e+04,2.064300000000000000e+04,2.626800000000000000e+04,-2.352400000000000000e+04,-2.389500000000000000e+04,-1.138800000000000000e+04,-1.169800000000000000e+04
+2.240800000000000000e+04,-2.975400000000000000e+04,-1.048800000000000000e+04,-3.039800000000000000e+04,1.322500000000000000e+04,1.075500000000000000e+04,-2.631600000000000000e+04,2.310200000000000000e+04
+2.731100000000000000e+04,3.065900000000000000e+04,-3.204500000000000000e+04,-2.260000000000000000e+03,-1.119500000000000000e+04,2.319800000000000000e+04,2.089400000000000000e+04,9.002000000000000000e+03
+2.227000000000000000e+04,1.253800000000000000e+04,2.569000000000000000e+04,1.269700000000000000e+04,-1.738700000000000000e+04,3.174000000000000000e+04,-7.549000000000000000e+03,-2.576000000000000000e+03
+-1.084000000000000000e+04,1.429000000000000000e+04,1.577300000000000000e+04,1.222000000000000000e+04,-9.868000000000000000e+03,-3.053100000000000000e+04,1.334000000000000000e+03,8.932000000000000000e+03
+1.043100000000000000e+04,2.807400000000000000e+04,1.510400000000000000e+04,-4.137000000000000000e+03,1.830100000000000000e+04,3.255200000000000000e+04,-4.026000000000000000e+03,-1.938800000000000000e+04
+-1.137000000000000000e+04,3.080100000000000000e+04,-2.192800000000000000e+04,-3.470000000000000000e+03,2.481200000000000000e+04,9.177000000000000000e+03,-2.041200000000000000e+04,-3.033200000000000000e+04
+-2.986000000000000000e+03,-3.125800000000000000e+04,-1.117000000000000000e+04,2.069100000000000000e+04,1.648200000000000000e+04,3.247800000000000000e+04,-6.432000000000000000e+03,2.083300000000000000e+04
+5.603000000000000000e+03,-7.879000000000000000e+03,1.672100000000000000e+04,3.131100000000000000e+04,4.882000000000000000e+03,-1.684300000000000000e+04,2.834900000000000000e+04,2.640300000000000000e+04
+1.821000000000000000e+03,1.774500000000000000e+04,1.594100000000000000e+04,-3.224100000000000000e+04,-3.146400000000000000e+04,3.107200000000000000e+04,-8.820000000000000000e+02,-7.897000000000000000e+03
+1.638500000000000000e+04,-6.740000000000000000e+02,-1.685800000000000000e+04,-1.399200000000000000e+04,-1.724000000000000000e+04,-3.076500000000000000e+04,2.556800000000000000e+04,2.208500000000000000e+04
+-1.887700000000000000e+04,-1.502500000000000000e+04,-2.517500000000000000e+04,-4.546000000000000000e+03,1.094100000000000000e+04,-1.128500000000000000e+04,-2.770700000000000000e+04,1.275400000000000000e+04
+-1.500400000000000000e+04,-6.357000000000000000e+03,1.033600000000000000e+04,-2.184600000000000000e+04,2.503900000000000000e+04,-1.229400000000000000e+04,2.955300000000000000e+04,1.889200000000000000e+04
+3.053000000000000000e+03,1.469800000000000000e+04,3.240400000000000000e+04,-2.582500000000000000e+04,-2.887000000000000000e+03,1.956400000000000000e+04,-2.360100000000000000e+04,1.285500000000000000e+04
+-1.167400000000000000e+04,-4.663000000000000000e+03,1.953200000000000000e+04,-1.115700000000000000e+04,1.580000000000000000e+04,1.722500000000000000e+04,2.870100000000000000e+04,1.020000000000000000e+04
+2.647200000000000000e+04,-1.569100000000000000e+04,1.359100000000000000e+04,-2.531000000000000000e+03,1.400600000000000000e+04,-1.831200000000000000e+04,2.749700000000000000e+04,1.140200000000000000e+04
+2.788900000000000000e+04,-2.334800000000000000e+04,-1.268800000000000000e+04,-2.863900000000000000e+04,2.965300000000000000e+04,4.195000000000000000e+03,2.732700000000000000e+04,-1.248600000000000000e+04
+2.089000000000000000e+04,5.154000000000000000e+03,2.382400000000000000e+04,1.703200000000000000e+04,2.615400000000000000e+04,1.686300000000000000e+04,3.159600000000000000e+04,1.861700000000000000e+04

+ 73 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers/kernel.txt

@@ -0,0 +1,73 @@
+# 3,3,8,1
+-3.212100000000000000e+04
+8.822000000000000000e+03
+-5.729000000000000000e+03
+-7.454000000000000000e+03
+-1.075400000000000000e+04
+-1.434300000000000000e+04
+2.789100000000000000e+04
+7.376000000000000000e+03
+-2.298200000000000000e+04
+-2.320400000000000000e+04
+3.272800000000000000e+04
+-1.930600000000000000e+04
+7.724000000000000000e+03
+5.890000000000000000e+02
+-7.187000000000000000e+03
+1.671000000000000000e+04
+2.230700000000000000e+04
+2.753400000000000000e+04
+-2.766500000000000000e+04
+1.121400000000000000e+04
+-5.097000000000000000e+03
+9.728000000000000000e+03
+-2.516600000000000000e+04
+-1.773200000000000000e+04
+-1.469600000000000000e+04
+3.230000000000000000e+02
+1.567900000000000000e+04
+-1.057300000000000000e+04
+-6.110000000000000000e+02
+-2.566300000000000000e+04
+-1.308300000000000000e+04
+-1.943000000000000000e+04
+-2.294300000000000000e+04
+1.573000000000000000e+03
+-2.091600000000000000e+04
+3.089500000000000000e+04
+-4.434000000000000000e+03
+-6.590000000000000000e+03
+-4.870000000000000000e+02
+-3.074900000000000000e+04
+1.456000000000000000e+03
+-9.657000000000000000e+03
+-6.790000000000000000e+02
+-2.608300000000000000e+04
+-7.920000000000000000e+02
+-7.298000000000000000e+03
+-2.479700000000000000e+04
+-3.030300000000000000e+04
+-3.053400000000000000e+04
+-2.850100000000000000e+04
+-3.049200000000000000e+04
+-4.336000000000000000e+03
+-2.556000000000000000e+04
+-1.524100000000000000e+04
+1.065200000000000000e+04
+1.129800000000000000e+04
+-3.019300000000000000e+04
+-2.056700000000000000e+04
+7.330000000000000000e+03
+1.836800000000000000e+04
+-1.734600000000000000e+04
+5.206000000000000000e+03
+1.378300000000000000e+04
+3.211300000000000000e+04
+-2.714100000000000000e+04
+-3.112600000000000000e+04
+-3.243900000000000000e+04
+-1.042400000000000000e+04
+3.044100000000000000e+04
+-2.847000000000000000e+03
+1.623400000000000000e+04
+1.445200000000000000e+04

+ 49 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/input.txt

@@ -0,0 +1,49 @@
+# 3,4,4,8
+1.522800000000000000e+04,9.887000000000000000e+03,-1.437200000000000000e+04,-2.187100000000000000e+04,3.624000000000000000e+03,1.940400000000000000e+04,-2.475200000000000000e+04,6.792000000000000000e+03
+2.304000000000000000e+03,1.136400000000000000e+04,-1.790400000000000000e+04,2.731200000000000000e+04,3.347000000000000000e+03,2.787000000000000000e+03,1.239000000000000000e+03,-1.933400000000000000e+04
+-1.873600000000000000e+04,-2.987300000000000000e+04,4.513000000000000000e+03,-2.252000000000000000e+04,2.063500000000000000e+04,-2.506900000000000000e+04,-2.050600000000000000e+04,3.260400000000000000e+04
+8.989000000000000000e+03,-2.437000000000000000e+04,-1.238600000000000000e+04,2.294200000000000000e+04,-2.392800000000000000e+04,-1.731600000000000000e+04,-1.353900000000000000e+04,5.650000000000000000e+02
+-1.099400000000000000e+04,1.802000000000000000e+04,1.584800000000000000e+04,2.587000000000000000e+03,-2.565300000000000000e+04,-1.128900000000000000e+04,2.140100000000000000e+04,8.156000000000000000e+03
+1.060400000000000000e+04,-1.178400000000000000e+04,6.412000000000000000e+03,1.780400000000000000e+04,-2.457900000000000000e+04,-1.187000000000000000e+04,3.088400000000000000e+04,-1.570000000000000000e+04
+-2.448100000000000000e+04,9.393000000000000000e+03,-1.179000000000000000e+03,1.893500000000000000e+04,2.741600000000000000e+04,2.821200000000000000e+04,2.124800000000000000e+04,-2.597700000000000000e+04
+1.553200000000000000e+04,-2.227300000000000000e+04,1.722000000000000000e+03,2.467100000000000000e+04,-1.128700000000000000e+04,1.981800000000000000e+04,-1.681500000000000000e+04,7.134000000000000000e+03
+2.982100000000000000e+04,2.851700000000000000e+04,2.612600000000000000e+04,3.106900000000000000e+04,2.232000000000000000e+03,1.179800000000000000e+04,-3.138700000000000000e+04,-3.093000000000000000e+04
+-3.193400000000000000e+04,-1.316400000000000000e+04,4.106000000000000000e+03,3.177600000000000000e+04,1.665000000000000000e+03,3.496000000000000000e+03,5.557000000000000000e+03,-6.483000000000000000e+03
+2.857000000000000000e+03,-9.361000000000000000e+03,2.245500000000000000e+04,1.088000000000000000e+03,-1.185400000000000000e+04,-3.239500000000000000e+04,-3.193400000000000000e+04,-6.768000000000000000e+03
+2.718000000000000000e+03,1.822100000000000000e+04,-1.758800000000000000e+04,-4.744000000000000000e+03,1.878500000000000000e+04,1.768000000000000000e+04,2.941000000000000000e+03,-5.660000000000000000e+02
+3.212300000000000000e+04,-9.715000000000000000e+03,-2.802400000000000000e+04,2.229700000000000000e+04,-1.389400000000000000e+04,2.319500000000000000e+04,-2.969200000000000000e+04,-2.499900000000000000e+04
+3.389000000000000000e+03,-1.652300000000000000e+04,2.056000000000000000e+03,8.969000000000000000e+03,2.437200000000000000e+04,-2.940600000000000000e+04,1.940600000000000000e+04,5.839000000000000000e+03
+-1.068700000000000000e+04,1.184600000000000000e+04,-9.750000000000000000e+03,3.677000000000000000e+03,3.128900000000000000e+04,4.790000000000000000e+03,2.033400000000000000e+04,3.979000000000000000e+03
+5.898000000000000000e+03,2.464900000000000000e+04,1.681500000000000000e+04,-2.610000000000000000e+02,-1.015200000000000000e+04,-4.572000000000000000e+03,-3.103300000000000000e+04,3.050800000000000000e+04
+1.251000000000000000e+04,2.435600000000000000e+04,1.613100000000000000e+04,-7.928000000000000000e+03,-4.224000000000000000e+03,-1.417600000000000000e+04,1.498000000000000000e+03,-2.097600000000000000e+04
+-2.602200000000000000e+04,5.688000000000000000e+03,1.932900000000000000e+04,8.434000000000000000e+03,-3.564000000000000000e+03,1.820000000000000000e+02,-3.045600000000000000e+04,-3.243000000000000000e+03
+-1.343900000000000000e+04,2.203000000000000000e+03,-3.265900000000000000e+04,2.699600000000000000e+04,-2.750800000000000000e+04,-1.428600000000000000e+04,-1.928100000000000000e+04,-9.613000000000000000e+03
+5.932000000000000000e+03,-5.671000000000000000e+03,-1.911000000000000000e+03,-2.105700000000000000e+04,1.667100000000000000e+04,-2.410300000000000000e+04,-5.186000000000000000e+03,7.524000000000000000e+03
+-2.810000000000000000e+04,-1.089200000000000000e+04,-2.059800000000000000e+04,-1.559800000000000000e+04,-1.587500000000000000e+04,-3.088000000000000000e+04,-1.384100000000000000e+04,-2.882700000000000000e+04
+-1.433300000000000000e+04,2.718100000000000000e+04,3.003500000000000000e+04,-3.785000000000000000e+03,3.131200000000000000e+04,-2.263000000000000000e+03,4.563000000000000000e+03,2.323000000000000000e+04
+-1.201900000000000000e+04,-2.596200000000000000e+04,2.824500000000000000e+04,-1.616500000000000000e+04,4.083000000000000000e+03,-1.462900000000000000e+04,-8.474000000000000000e+03,-2.576100000000000000e+04
+6.290000000000000000e+02,-2.307500000000000000e+04,-1.649600000000000000e+04,7.769000000000000000e+03,-1.865200000000000000e+04,2.647000000000000000e+04,1.364500000000000000e+04,9.974000000000000000e+03
+-1.649500000000000000e+04,2.921700000000000000e+04,5.501000000000000000e+03,-9.689000000000000000e+03,-2.906600000000000000e+04,-2.250400000000000000e+04,-4.127000000000000000e+03,1.546200000000000000e+04
+7.312000000000000000e+03,2.683200000000000000e+04,-2.886000000000000000e+03,2.164100000000000000e+04,-3.005300000000000000e+04,-5.761000000000000000e+03,-1.546100000000000000e+04,-2.473900000000000000e+04
+1.517700000000000000e+04,-2.636200000000000000e+04,-2.965500000000000000e+04,-1.492500000000000000e+04,-1.917600000000000000e+04,-1.100900000000000000e+04,-1.078000000000000000e+04,-1.755000000000000000e+03
+2.599400000000000000e+04,8.118000000000000000e+03,2.588800000000000000e+04,-1.022300000000000000e+04,-3.459000000000000000e+03,4.649000000000000000e+03,3.127700000000000000e+04,-2.472300000000000000e+04
+-2.373300000000000000e+04,1.975900000000000000e+04,1.071300000000000000e+04,-1.487900000000000000e+04,-1.656300000000000000e+04,2.675000000000000000e+04,-3.169600000000000000e+04,1.746700000000000000e+04
+6.851000000000000000e+03,3.261400000000000000e+04,2.414000000000000000e+03,-1.464000000000000000e+04,1.449000000000000000e+03,3.156200000000000000e+04,2.416000000000000000e+03,-6.229000000000000000e+03
+-3.024400000000000000e+04,1.087200000000000000e+04,-2.374200000000000000e+04,1.354600000000000000e+04,-5.255000000000000000e+03,-9.880000000000000000e+02,-2.911100000000000000e+04,3.177700000000000000e+04
+-1.427000000000000000e+04,-2.342200000000000000e+04,-3.047900000000000000e+04,7.836000000000000000e+03,-2.478800000000000000e+04,2.743800000000000000e+04,1.671200000000000000e+04,5.935000000000000000e+03
+-8.200000000000000000e+01,2.890000000000000000e+03,3.254200000000000000e+04,-8.496000000000000000e+03,-3.049800000000000000e+04,-3.060800000000000000e+04,8.851000000000000000e+03,6.119000000000000000e+03
+3.074200000000000000e+04,1.811500000000000000e+04,-1.902800000000000000e+04,-3.732000000000000000e+03,-1.490200000000000000e+04,-1.774400000000000000e+04,-2.804000000000000000e+03,-6.760000000000000000e+03
+3.115800000000000000e+04,-8.253000000000000000e+03,7.319000000000000000e+03,-1.169000000000000000e+04,1.700800000000000000e+04,1.537100000000000000e+04,3.112000000000000000e+03,3.248100000000000000e+04
+6.340000000000000000e+02,3.029700000000000000e+04,3.072400000000000000e+04,6.967000000000000000e+03,-2.535700000000000000e+04,-2.806300000000000000e+04,2.002900000000000000e+04,-1.695400000000000000e+04
+7.477000000000000000e+03,1.602600000000000000e+04,1.492100000000000000e+04,2.960100000000000000e+04,1.152500000000000000e+04,-1.368300000000000000e+04,-4.390000000000000000e+02,-2.803400000000000000e+04
+-1.273600000000000000e+04,6.472000000000000000e+03,-6.050000000000000000e+03,2.662000000000000000e+03,-2.510500000000000000e+04,-1.032800000000000000e+04,-1.931400000000000000e+04,2.402000000000000000e+03
+2.514300000000000000e+04,-1.959600000000000000e+04,-8.183000000000000000e+03,2.180600000000000000e+04,8.676000000000000000e+03,-1.240800000000000000e+04,-2.369200000000000000e+04,-2.126400000000000000e+04
+-1.554000000000000000e+04,-1.654800000000000000e+04,-3.244400000000000000e+04,1.719200000000000000e+04,-2.125400000000000000e+04,3.030900000000000000e+04,1.308300000000000000e+04,1.742500000000000000e+04
+3.059400000000000000e+04,-2.528700000000000000e+04,-9.130000000000000000e+03,2.573700000000000000e+04,-1.486200000000000000e+04,-8.582000000000000000e+03,-1.521000000000000000e+03,2.762700000000000000e+04
+2.648600000000000000e+04,-1.967800000000000000e+04,2.829900000000000000e+04,-1.748900000000000000e+04,1.976900000000000000e+04,-7.630000000000000000e+02,1.531100000000000000e+04,1.440100000000000000e+04
+-3.107900000000000000e+04,1.774700000000000000e+04,5.782000000000000000e+03,-2.142100000000000000e+04,-9.977000000000000000e+03,-2.059000000000000000e+03,1.768600000000000000e+04,2.904600000000000000e+04
+5.543000000000000000e+03,-1.058300000000000000e+04,-7.955000000000000000e+03,-3.090000000000000000e+02,7.523000000000000000e+03,-1.984300000000000000e+04,-1.641800000000000000e+04,1.120000000000000000e+03
+3.568000000000000000e+03,1.452900000000000000e+04,1.045700000000000000e+04,-1.939100000000000000e+04,-2.413900000000000000e+04,-3.775000000000000000e+03,2.893000000000000000e+03,-1.216800000000000000e+04
+-3.228600000000000000e+04,-1.538600000000000000e+04,-2.795500000000000000e+04,6.214000000000000000e+03,2.296300000000000000e+04,7.639000000000000000e+03,8.214000000000000000e+03,-2.776500000000000000e+04
+7.540000000000000000e+02,-2.588500000000000000e+04,-2.242400000000000000e+04,2.554900000000000000e+04,-3.149800000000000000e+04,-1.718300000000000000e+04,-2.830000000000000000e+03,2.030900000000000000e+04
+6.080000000000000000e+03,-1.375000000000000000e+04,2.449000000000000000e+03,-2.875600000000000000e+04,-2.349300000000000000e+04,-2.282200000000000000e+04,2.341800000000000000e+04,2.132000000000000000e+04

+ 49 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/kernel.txt

@@ -0,0 +1,49 @@
+# 2,3,8,1
+1.687400000000000000e+04
+-3.052000000000000000e+03
+-2.187000000000000000e+04
+2.320500000000000000e+04
+-2.211400000000000000e+04
+-4.300000000000000000e+03
+2.142800000000000000e+04
+2.061200000000000000e+04
+1.543000000000000000e+03
+-1.133500000000000000e+04
+-1.322700000000000000e+04
+1.866800000000000000e+04
+2.069200000000000000e+04
+-1.480400000000000000e+04
+1.228800000000000000e+04
+-2.144400000000000000e+04
+-1.433200000000000000e+04
+1.148200000000000000e+04
+1.150700000000000000e+04
+3.079100000000000000e+04
+9.215000000000000000e+03
+-1.604800000000000000e+04
+-2.695000000000000000e+03
+3.236200000000000000e+04
+-1.708800000000000000e+04
+-9.248000000000000000e+03
+2.567700000000000000e+04
+-2.467300000000000000e+04
+-1.088800000000000000e+04
+4.489000000000000000e+03
+-2.311000000000000000e+03
+5.757000000000000000e+03
+-3.092400000000000000e+04
+-9.495000000000000000e+03
+7.470000000000000000e+02
+5.188000000000000000e+03
+-3.144800000000000000e+04
+-2.247800000000000000e+04
+1.640600000000000000e+04
+1.274700000000000000e+04
+-4.634000000000000000e+03
+-8.142000000000000000e+03
+-5.032000000000000000e+03
+-2.549000000000000000e+03
+1.860000000000000000e+02
+9.170000000000000000e+03
+3.372000000000000000e+03
+-1.727500000000000000e+04

+ 17 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_null_bias/input.txt

@@ -0,0 +1,17 @@
+# 1,4,4,8
+-1.857700000000000000e+04,2.208200000000000000e+04,1.901500000000000000e+04,9.939000000000000000e+03,-9.865000000000000000e+03,-2.051200000000000000e+04,-9.998000000000000000e+03,-2.266500000000000000e+04
+-2.129000000000000000e+04,-1.967200000000000000e+04,4.065000000000000000e+03,-1.061700000000000000e+04,1.641000000000000000e+04,3.161100000000000000e+04,-7.291000000000000000e+03,2.192700000000000000e+04
+-1.942000000000000000e+04,4.760000000000000000e+02,-2.439500000000000000e+04,9.624000000000000000e+03,2.733000000000000000e+04,7.170000000000000000e+02,7.899000000000000000e+03,5.117000000000000000e+03
+2.449000000000000000e+04,-3.250700000000000000e+04,1.771000000000000000e+03,-9.298000000000000000e+03,3.005900000000000000e+04,4.552000000000000000e+03,4.894000000000000000e+03,3.098300000000000000e+04
+2.786800000000000000e+04,-4.336000000000000000e+03,2.837700000000000000e+04,-3.259900000000000000e+04,-2.879500000000000000e+04,-1.817900000000000000e+04,1.406800000000000000e+04,1.004800000000000000e+04
+-1.337200000000000000e+04,-1.781300000000000000e+04,5.292000000000000000e+03,5.836000000000000000e+03,2.725700000000000000e+04,-1.773000000000000000e+04,2.258000000000000000e+04,3.089700000000000000e+04
+2.483700000000000000e+04,3.140700000000000000e+04,-3.356000000000000000e+03,-2.739400000000000000e+04,2.332400000000000000e+04,-1.121000000000000000e+03,3.040500000000000000e+04,1.250600000000000000e+04
+-5.824000000000000000e+03,1.733300000000000000e+04,2.131600000000000000e+04,-3.119000000000000000e+03,1.214200000000000000e+04,-1.581200000000000000e+04,-8.362000000000000000e+03,1.348100000000000000e+04
+8.010000000000000000e+03,-1.640900000000000000e+04,-1.384000000000000000e+03,-9.159000000000000000e+03,-1.792900000000000000e+04,-2.046900000000000000e+04,5.666000000000000000e+03,7.685000000000000000e+03
+2.171500000000000000e+04,8.290000000000000000e+03,-1.860500000000000000e+04,2.908700000000000000e+04,2.482700000000000000e+04,-7.327000000000000000e+03,-3.181800000000000000e+04,-2.473100000000000000e+04
+2.892000000000000000e+04,-6.734000000000000000e+03,3.144300000000000000e+04,-2.070000000000000000e+03,-8.030000000000000000e+03,2.869000000000000000e+03,-2.698200000000000000e+04,1.526900000000000000e+04
+1.048100000000000000e+04,1.358200000000000000e+04,-3.042600000000000000e+04,4.356000000000000000e+03,-1.475800000000000000e+04,-1.125100000000000000e+04,1.597300000000000000e+04,1.967500000000000000e+04
+-2.068500000000000000e+04,8.198000000000000000e+03,-2.278100000000000000e+04,1.346900000000000000e+04,-2.744000000000000000e+04,3.354000000000000000e+03,-2.895300000000000000e+04,2.339800000000000000e+04
+2.490000000000000000e+04,2.579200000000000000e+04,-2.324400000000000000e+04,-2.801000000000000000e+03,-9.152000000000000000e+03,1.770200000000000000e+04,2.128500000000000000e+04,-2.386400000000000000e+04
+-1.123100000000000000e+04,-1.282600000000000000e+04,-2.178000000000000000e+03,-2.346200000000000000e+04,-9.906000000000000000e+03,8.555000000000000000e+03,-6.807000000000000000e+03,-3.570000000000000000e+02
+-2.388200000000000000e+04,-3.225000000000000000e+04,2.946600000000000000e+04,1.825500000000000000e+04,1.583600000000000000e+04,-2.913500000000000000e+04,-1.872400000000000000e+04,-2.574300000000000000e+04

+ 33 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_null_bias/kernel.txt

@@ -0,0 +1,33 @@
+# 2,2,8,1
+-6.966000000000000000e+03
+-6.832000000000000000e+03
+1.216900000000000000e+04
+3.284000000000000000e+03
+-2.968100000000000000e+04
+2.983400000000000000e+04
+1.598500000000000000e+04
+2.407200000000000000e+04
+-2.409500000000000000e+04
+-9.319000000000000000e+03
+1.043500000000000000e+04
+1.787700000000000000e+04
+1.350500000000000000e+04
+3.029500000000000000e+04
+9.635000000000000000e+03
+2.507400000000000000e+04
+-2.418500000000000000e+04
+1.819200000000000000e+04
+2.434400000000000000e+04
+-3.168100000000000000e+04
+2.931800000000000000e+04
+-2.652300000000000000e+04
+-1.711800000000000000e+04
+-4.391000000000000000e+03
+1.497000000000000000e+03
+2.695800000000000000e+04
+9.153000000000000000e+03
+-6.281000000000000000e+03
+3.063400000000000000e+04
+-1.109700000000000000e+04
+2.385300000000000000e+04
+-2.211300000000000000e+04

+ 49 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_spill_null_bias/input.txt

@@ -0,0 +1,49 @@
+# 3,4,4,5
+-7.319000000000000000e+03,2.429900000000000000e+04,-2.701500000000000000e+04,1.816400000000000000e+04,-2.412800000000000000e+04
+1.886000000000000000e+03,-6.480000000000000000e+03,3.192200000000000000e+04,-3.276700000000000000e+04,3.153900000000000000e+04
+-1.918800000000000000e+04,-2.647800000000000000e+04,-2.564000000000000000e+03,2.787300000000000000e+04,-5.675000000000000000e+03
+7.433000000000000000e+03,3.115900000000000000e+04,-1.389800000000000000e+04,5.334000000000000000e+03,1.596200000000000000e+04
+-1.190000000000000000e+04,-1.670800000000000000e+04,5.430000000000000000e+03,-9.333000000000000000e+03,-2.557000000000000000e+04
+2.519100000000000000e+04,3.118300000000000000e+04,2.879800000000000000e+04,1.172400000000000000e+04,3.145200000000000000e+04
+-1.654600000000000000e+04,3.181200000000000000e+04,-3.131300000000000000e+04,-1.796800000000000000e+04,-3.231000000000000000e+03
+-2.941400000000000000e+04,8.278000000000000000e+03,3.366000000000000000e+03,-1.947800000000000000e+04,9.861000000000000000e+03
+2.360000000000000000e+04,3.141400000000000000e+04,1.224000000000000000e+03,-3.213900000000000000e+04,2.826200000000000000e+04
+-2.621500000000000000e+04,3.336000000000000000e+03,1.701100000000000000e+04,-2.727300000000000000e+04,-2.001300000000000000e+04
+-2.811100000000000000e+04,2.001200000000000000e+04,6.270000000000000000e+03,-1.694900000000000000e+04,-4.120000000000000000e+03
+1.989200000000000000e+04,1.224400000000000000e+04,-2.741400000000000000e+04,-2.561500000000000000e+04,1.869600000000000000e+04
+-2.366000000000000000e+04,-2.076800000000000000e+04,2.692200000000000000e+04,3.737000000000000000e+03,1.541900000000000000e+04
+5.561000000000000000e+03,-2.868000000000000000e+03,-1.664100000000000000e+04,8.907000000000000000e+03,1.355000000000000000e+04
+3.214100000000000000e+04,-1.790400000000000000e+04,-3.154000000000000000e+04,-2.679300000000000000e+04,2.555200000000000000e+04
+2.885600000000000000e+04,3.157600000000000000e+04,6.202000000000000000e+03,-1.488800000000000000e+04,2.376200000000000000e+04
+-8.198000000000000000e+03,9.875000000000000000e+03,-1.312600000000000000e+04,-1.693100000000000000e+04,-9.526000000000000000e+03
+-1.605500000000000000e+04,2.672700000000000000e+04,2.704100000000000000e+04,-1.252900000000000000e+04,8.293000000000000000e+03
+5.669000000000000000e+03,6.974000000000000000e+03,5.096000000000000000e+03,2.317300000000000000e+04,7.041000000000000000e+03
+-2.314000000000000000e+04,-1.832100000000000000e+04,3.989000000000000000e+03,2.079700000000000000e+04,-2.613900000000000000e+04
+-1.948000000000000000e+04,1.581800000000000000e+04,-3.253000000000000000e+03,-3.240100000000000000e+04,1.478900000000000000e+04
+-2.821500000000000000e+04,2.850300000000000000e+04,7.393000000000000000e+03,1.232500000000000000e+04,7.971000000000000000e+03
+-2.889600000000000000e+04,-2.121300000000000000e+04,-2.173900000000000000e+04,3.057600000000000000e+04,-1.013500000000000000e+04
+4.547000000000000000e+03,-1.594000000000000000e+03,3.162700000000000000e+04,-7.316000000000000000e+03,-1.917000000000000000e+03
+3.168500000000000000e+04,1.691700000000000000e+04,2.938700000000000000e+04,1.745300000000000000e+04,-2.403000000000000000e+04
+-2.852600000000000000e+04,-2.540300000000000000e+04,1.647300000000000000e+04,-3.129400000000000000e+04,-2.136600000000000000e+04
+2.998900000000000000e+04,2.488300000000000000e+04,-4.928000000000000000e+03,2.724100000000000000e+04,8.780000000000000000e+03
+-1.027000000000000000e+03,-1.847200000000000000e+04,-1.963400000000000000e+04,3.790000000000000000e+02,1.293800000000000000e+04
+-1.043600000000000000e+04,-1.728600000000000000e+04,2.427400000000000000e+04,2.109500000000000000e+04,-1.127200000000000000e+04
+1.912700000000000000e+04,-2.943700000000000000e+04,2.608100000000000000e+04,2.487000000000000000e+03,3.074000000000000000e+03
+-2.159200000000000000e+04,-3.022500000000000000e+04,2.899000000000000000e+03,-1.151200000000000000e+04,-2.633600000000000000e+04
+-1.193600000000000000e+04,-1.892500000000000000e+04,-2.196100000000000000e+04,-2.422500000000000000e+04,-3.209200000000000000e+04
+-7.743000000000000000e+03,2.801700000000000000e+04,7.708000000000000000e+03,3.289000000000000000e+03,1.363200000000000000e+04
+2.982500000000000000e+04,-2.397000000000000000e+03,1.392700000000000000e+04,1.681300000000000000e+04,1.546500000000000000e+04
+2.567000000000000000e+03,8.863000000000000000e+03,2.010600000000000000e+04,2.346700000000000000e+04,3.134400000000000000e+04
+-5.870000000000000000e+02,1.389200000000000000e+04,5.223000000000000000e+03,3.213800000000000000e+04,2.198200000000000000e+04
+-1.310000000000000000e+03,2.106000000000000000e+03,-1.471900000000000000e+04,1.361000000000000000e+03,-6.400000000000000000e+03
+-1.569700000000000000e+04,-2.556900000000000000e+04,4.561000000000000000e+03,1.776600000000000000e+04,1.316200000000000000e+04
+1.501100000000000000e+04,1.904000000000000000e+04,2.914200000000000000e+04,9.900000000000000000e+03,-5.609000000000000000e+03
+2.046700000000000000e+04,1.415500000000000000e+04,5.410000000000000000e+03,9.317000000000000000e+03,2.058900000000000000e+04
+-3.171600000000000000e+04,2.989100000000000000e+04,-1.812200000000000000e+04,1.178500000000000000e+04,3.168300000000000000e+04
+-6.612000000000000000e+03,-1.952200000000000000e+04,-3.132000000000000000e+04,8.707000000000000000e+03,-1.241200000000000000e+04
+3.052500000000000000e+04,-4.086000000000000000e+03,-1.875200000000000000e+04,3.171100000000000000e+04,-1.020800000000000000e+04
+2.101600000000000000e+04,-7.797000000000000000e+03,1.253900000000000000e+04,3.207500000000000000e+04,-2.303400000000000000e+04
+2.483000000000000000e+04,-1.619600000000000000e+04,2.850700000000000000e+04,3.044400000000000000e+04,-7.734000000000000000e+03
+-2.571900000000000000e+04,2.208200000000000000e+04,1.941100000000000000e+04,-2.953500000000000000e+04,-2.174600000000000000e+04
+-2.661200000000000000e+04,-2.151400000000000000e+04,-2.395400000000000000e+04,-2.084400000000000000e+04,-2.825400000000000000e+04
+1.724700000000000000e+04,-2.944400000000000000e+04,3.099600000000000000e+04,-6.366000000000000000e+03,1.848900000000000000e+04

+ 46 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_spill_null_bias/kernel.txt

@@ -0,0 +1,46 @@
+# 3,3,5,1
+-2.947300000000000000e+04
+2.344700000000000000e+04
+3.034100000000000000e+04
+8.450000000000000000e+03
+2.395000000000000000e+04
+2.188200000000000000e+04
+-2.413400000000000000e+04
+-1.370400000000000000e+04
+1.579600000000000000e+04
+-3.169700000000000000e+04
+3.029900000000000000e+04
+-1.318400000000000000e+04
+-2.559000000000000000e+03
+5.593000000000000000e+03
+1.061100000000000000e+04
+-2.498700000000000000e+04
+-2.823000000000000000e+04
+8.110000000000000000e+02
+9.135000000000000000e+03
+-7.581000000000000000e+03
+9.142000000000000000e+03
+5.179000000000000000e+03
+1.367200000000000000e+04
+1.369300000000000000e+04
+2.017400000000000000e+04
+-2.105600000000000000e+04
+8.654000000000000000e+03
+3.145000000000000000e+03
+3.022700000000000000e+04
+1.797400000000000000e+04
+2.311000000000000000e+04
+5.464000000000000000e+03
+2.594700000000000000e+04
+1.503900000000000000e+04
+1.574600000000000000e+04
+2.271700000000000000e+04
+-3.250200000000000000e+04
+-2.328400000000000000e+04
+-1.325300000000000000e+04
+1.076500000000000000e+04
+-1.993500000000000000e+04
+-1.922800000000000000e+04
+6.842000000000000000e+03
+-1.644500000000000000e+04
+-3.141600000000000000e+04

+ 33 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_stride_null_bias/input.txt

@@ -0,0 +1,33 @@
+# 2,4,4,8
+1.336300000000000000e+04,-2.706200000000000000e+04,1.368500000000000000e+04,-1.925600000000000000e+04,1.338600000000000000e+04,-2.481400000000000000e+04,-1.664100000000000000e+04,-8.110000000000000000e+03
+2.870700000000000000e+04,2.327000000000000000e+03,4.496000000000000000e+03,-2.801700000000000000e+04,-9.455000000000000000e+03,1.201400000000000000e+04,2.666000000000000000e+03,2.834500000000000000e+04
+-2.254400000000000000e+04,6.081000000000000000e+03,-2.147300000000000000e+04,-1.989700000000000000e+04,-3.191200000000000000e+04,2.458900000000000000e+04,1.586000000000000000e+03,3.373000000000000000e+03
+2.989500000000000000e+04,1.706100000000000000e+04,5.695000000000000000e+03,-2.813000000000000000e+04,2.018100000000000000e+04,1.643400000000000000e+04,2.913000000000000000e+03,1.169500000000000000e+04
+-3.875000000000000000e+03,-2.955000000000000000e+04,-3.478000000000000000e+03,-7.687000000000000000e+03,-2.597800000000000000e+04,-2.148400000000000000e+04,2.030500000000000000e+04,2.766800000000000000e+04
+-1.682100000000000000e+04,1.058800000000000000e+04,1.162400000000000000e+04,4.562000000000000000e+03,6.732000000000000000e+03,-2.422800000000000000e+04,-9.850000000000000000e+02,-1.547600000000000000e+04
+-1.177800000000000000e+04,2.239500000000000000e+04,1.981000000000000000e+03,-4.412000000000000000e+03,-2.537600000000000000e+04,2.777400000000000000e+04,5.225000000000000000e+03,-8.601000000000000000e+03
+-6.840000000000000000e+03,1.011000000000000000e+04,-8.133000000000000000e+03,-2.168000000000000000e+03,-2.262800000000000000e+04,-3.124900000000000000e+04,-1.999100000000000000e+04,3.760000000000000000e+03
+1.211600000000000000e+04,1.338800000000000000e+04,-1.854800000000000000e+04,1.526100000000000000e+04,-2.820900000000000000e+04,1.766700000000000000e+04,-8.684000000000000000e+03,-8.110000000000000000e+03
+8.420000000000000000e+02,1.476000000000000000e+04,-2.639200000000000000e+04,-2.242200000000000000e+04,1.630300000000000000e+04,-1.095400000000000000e+04,-2.298800000000000000e+04,6.944000000000000000e+03
+1.338400000000000000e+04,-8.538000000000000000e+03,2.478500000000000000e+04,-7.726000000000000000e+03,-3.206100000000000000e+04,-1.616700000000000000e+04,-9.307000000000000000e+03,2.114900000000000000e+04
+1.638800000000000000e+04,-8.987000000000000000e+03,-2.238900000000000000e+04,2.333600000000000000e+04,2.632400000000000000e+04,-7.298000000000000000e+03,-1.759500000000000000e+04,3.232400000000000000e+04
+2.517000000000000000e+04,2.251200000000000000e+04,-2.032000000000000000e+04,2.286100000000000000e+04,8.725000000000000000e+03,-5.506000000000000000e+03,1.077500000000000000e+04,-3.224800000000000000e+04
+8.500000000000000000e+03,2.927000000000000000e+03,1.117900000000000000e+04,-2.547200000000000000e+04,-1.960000000000000000e+04,-1.121700000000000000e+04,-1.463800000000000000e+04,1.395500000000000000e+04
+1.035600000000000000e+04,2.676200000000000000e+04,-2.915900000000000000e+04,2.290400000000000000e+04,-4.691000000000000000e+03,-4.378000000000000000e+03,2.274400000000000000e+04,1.680900000000000000e+04
+-1.386100000000000000e+04,-2.473000000000000000e+04,-1.753000000000000000e+03,-1.451100000000000000e+04,-1.769300000000000000e+04,5.360000000000000000e+02,1.392700000000000000e+04,1.868000000000000000e+03
+-9.317000000000000000e+03,-1.982800000000000000e+04,5.988000000000000000e+03,9.475000000000000000e+03,2.110100000000000000e+04,3.047500000000000000e+04,1.326000000000000000e+04,-2.810000000000000000e+02
+-1.622500000000000000e+04,-2.670800000000000000e+04,2.092200000000000000e+04,-1.711000000000000000e+04,2.963900000000000000e+04,1.584700000000000000e+04,2.953400000000000000e+04,2.710500000000000000e+04
+-2.444100000000000000e+04,-5.009000000000000000e+03,-1.065000000000000000e+03,1.174500000000000000e+04,-1.548700000000000000e+04,2.051500000000000000e+04,2.565100000000000000e+04,2.351500000000000000e+04
+4.366000000000000000e+03,9.555000000000000000e+03,-3.067200000000000000e+04,-9.989000000000000000e+03,2.319600000000000000e+04,-1.354900000000000000e+04,-5.370000000000000000e+03,2.538700000000000000e+04
+1.283000000000000000e+03,-9.193000000000000000e+03,1.219300000000000000e+04,-2.487500000000000000e+04,-1.239400000000000000e+04,-2.455500000000000000e+04,2.119300000000000000e+04,2.504000000000000000e+03
+-1.356200000000000000e+04,3.829000000000000000e+03,-9.883000000000000000e+03,3.917000000000000000e+03,-9.066000000000000000e+03,2.914700000000000000e+04,2.792200000000000000e+04,1.172700000000000000e+04
+-3.058800000000000000e+04,1.160400000000000000e+04,2.216300000000000000e+04,-3.817000000000000000e+03,-3.116800000000000000e+04,2.912900000000000000e+04,-1.960600000000000000e+04,1.821000000000000000e+04
+-6.665000000000000000e+03,-3.077400000000000000e+04,-1.956000000000000000e+03,2.248300000000000000e+04,-2.159100000000000000e+04,2.337000000000000000e+04,2.919000000000000000e+03,-1.680200000000000000e+04
+3.096300000000000000e+04,2.643100000000000000e+04,3.124500000000000000e+04,1.514800000000000000e+04,2.264500000000000000e+04,1.259300000000000000e+04,1.413000000000000000e+03,1.048700000000000000e+04
+1.287900000000000000e+04,-1.117700000000000000e+04,4.104000000000000000e+03,-5.148000000000000000e+03,-1.215800000000000000e+04,-8.878000000000000000e+03,1.300700000000000000e+04,1.613600000000000000e+04
+9.350000000000000000e+03,-6.840000000000000000e+03,3.851000000000000000e+03,-2.257700000000000000e+04,2.656000000000000000e+03,-2.970600000000000000e+04,-1.898700000000000000e+04,3.223100000000000000e+04
+2.906300000000000000e+04,-1.620700000000000000e+04,2.515900000000000000e+04,2.324200000000000000e+04,3.275300000000000000e+04,2.673800000000000000e+04,2.373200000000000000e+04,-1.251500000000000000e+04
+1.595300000000000000e+04,3.253000000000000000e+04,-2.427700000000000000e+04,-8.282000000000000000e+03,-4.189000000000000000e+03,-3.245900000000000000e+04,2.595300000000000000e+04,-1.848500000000000000e+04
+1.700300000000000000e+04,9.567000000000000000e+03,-4.648000000000000000e+03,-2.588300000000000000e+04,6.010000000000000000e+02,-7.937000000000000000e+03,-2.580800000000000000e+04,-1.179300000000000000e+04
+-1.749000000000000000e+03,-1.667000000000000000e+03,1.763900000000000000e+04,-9.424000000000000000e+03,-7.337000000000000000e+03,1.054900000000000000e+04,3.060700000000000000e+04,-1.617400000000000000e+04
+7.250000000000000000e+02,9.412000000000000000e+03,2.952200000000000000e+04,-1.288000000000000000e+03,-3.164700000000000000e+04,8.673000000000000000e+03,-3.100800000000000000e+04,4.926000000000000000e+03

+ 33 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_stride_null_bias/kernel.txt

@@ -0,0 +1,33 @@
+# 2,2,8,1
+-1.239900000000000000e+04
+2.623300000000000000e+04
+-1.829900000000000000e+04
+-2.663200000000000000e+04
+8.750000000000000000e+02
+-2.812100000000000000e+04
+2.181200000000000000e+04
+-1.189000000000000000e+03
+8.809000000000000000e+03
+-9.199000000000000000e+03
+-1.871100000000000000e+04
+3.102000000000000000e+04
+-1.085100000000000000e+04
+-1.214700000000000000e+04
+3.160900000000000000e+04
+2.392100000000000000e+04
+-1.666400000000000000e+04
+-8.508000000000000000e+03
+-3.058000000000000000e+04
+-2.020400000000000000e+04
+1.101600000000000000e+04
+-1.556800000000000000e+04
+1.198500000000000000e+04
+-1.217000000000000000e+03
+-2.224500000000000000e+04
+-1.610800000000000000e+04
+3.009200000000000000e+04
+-1.967900000000000000e+04
+1.926800000000000000e+04
+3.742000000000000000e+03
+1.106700000000000000e+04
+9.852000000000000000e+03

+ 2 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_test_bias/bias.txt

@@ -0,0 +1,2 @@
+# 8
+-5.999000000000000000e+03,-3.030800000000000000e+04,8.504000000000000000e+03,-1.410900000000000000e+04,8.216000000000000000e+03,-2.707500000000000000e+04,1.136300000000000000e+04,-9.895000000000000000e+03

+ 17 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_test_bias/input.txt

@@ -0,0 +1,17 @@
+# 1,4,4,8
+2.621100000000000000e+04,4.731000000000000000e+03,-2.285900000000000000e+04,1.169700000000000000e+04,1.058500000000000000e+04,1.194300000000000000e+04,-1.410000000000000000e+04,2.691400000000000000e+04
+1.231800000000000000e+04,2.586500000000000000e+04,2.130900000000000000e+04,-3.027800000000000000e+04,1.685900000000000000e+04,-2.241900000000000000e+04,-2.252000000000000000e+04,2.331600000000000000e+04
+-1.264400000000000000e+04,-2.134100000000000000e+04,1.293200000000000000e+04,2.727100000000000000e+04,-2.945900000000000000e+04,1.700200000000000000e+04,6.470000000000000000e+02,3.064400000000000000e+04
+1.513900000000000000e+04,2.465300000000000000e+04,-1.088700000000000000e+04,6.420000000000000000e+02,-2.301800000000000000e+04,3.393000000000000000e+03,2.235400000000000000e+04,3.202000000000000000e+04
+2.957900000000000000e+04,1.132500000000000000e+04,1.838100000000000000e+04,-2.000300000000000000e+04,3.111100000000000000e+04,1.260600000000000000e+04,1.929000000000000000e+04,2.790000000000000000e+03
+1.049400000000000000e+04,-1.174000000000000000e+03,-1.169200000000000000e+04,2.456200000000000000e+04,-2.573200000000000000e+04,2.158900000000000000e+04,-3.006300000000000000e+04,1.902000000000000000e+04
+1.608000000000000000e+03,-6.749000000000000000e+03,-1.782500000000000000e+04,1.012400000000000000e+04,2.352600000000000000e+04,7.341000000000000000e+03,-6.651000000000000000e+03,-1.250000000000000000e+03
+-4.309000000000000000e+03,1.624800000000000000e+04,9.321000000000000000e+03,2.341700000000000000e+04,1.869200000000000000e+04,-4.153000000000000000e+03,1.344700000000000000e+04,2.776000000000000000e+04
+-3.234300000000000000e+04,7.500000000000000000e+01,-1.304000000000000000e+03,-1.849200000000000000e+04,-2.443300000000000000e+04,-2.503200000000000000e+04,2.108200000000000000e+04,-6.482000000000000000e+03
+7.326000000000000000e+03,2.392400000000000000e+04,-1.593300000000000000e+04,-2.659000000000000000e+03,1.601000000000000000e+04,2.337900000000000000e+04,2.083900000000000000e+04,-2.054400000000000000e+04
+-1.845800000000000000e+04,-9.760000000000000000e+03,-1.228600000000000000e+04,-1.962300000000000000e+04,-2.227800000000000000e+04,8.300000000000000000e+02,2.010000000000000000e+02,-1.896000000000000000e+03
+-1.030300000000000000e+04,2.078000000000000000e+03,3.044000000000000000e+04,-3.203700000000000000e+04,3.078800000000000000e+04,-3.003000000000000000e+03,-2.792900000000000000e+04,9.405000000000000000e+03
+1.177800000000000000e+04,-1.835300000000000000e+04,-2.124200000000000000e+04,-2.996000000000000000e+04,-1.238600000000000000e+04,-2.978500000000000000e+04,-1.717900000000000000e+04,-4.808000000000000000e+03
+1.162400000000000000e+04,-1.508800000000000000e+04,4.166000000000000000e+03,2.856200000000000000e+04,-2.787000000000000000e+04,2.766000000000000000e+04,1.706400000000000000e+04,-1.029800000000000000e+04
+1.238500000000000000e+04,8.103000000000000000e+03,7.981000000000000000e+03,-2.044700000000000000e+04,-1.109400000000000000e+04,1.143700000000000000e+04,1.407000000000000000e+04,1.775500000000000000e+04
+-2.494000000000000000e+04,-3.160800000000000000e+04,3.102900000000000000e+04,1.169800000000000000e+04,-2.776600000000000000e+04,3.006800000000000000e+04,-1.122100000000000000e+04,-2.901700000000000000e+04

+ 33 - 0
CMSIS/NN/Tests/UnitTest/PregeneratedData/dw_int16xint8_fast_test_bias/kernel.txt

@@ -0,0 +1,33 @@
+# 2,2,8,1
+1.714000000000000000e+03
+-2.184900000000000000e+04
+-2.476100000000000000e+04
+-1.361000000000000000e+03
+-1.957400000000000000e+04
+-3.218400000000000000e+04
+-6.840000000000000000e+02
+-3.275000000000000000e+03
+2.633100000000000000e+04
+-2.470000000000000000e+02
+-2.135600000000000000e+04
+1.088100000000000000e+04
+1.304900000000000000e+04
+-6.089000000000000000e+03
+2.986800000000000000e+04
+-3.164000000000000000e+03
+-3.136700000000000000e+04
+-2.364400000000000000e+04
+-2.861400000000000000e+04
+1.060000000000000000e+03
+-5.750000000000000000e+02
+-1.243600000000000000e+04
+-5.382000000000000000e+03
+-2.179700000000000000e+04
+1.570000000000000000e+02
+2.756000000000000000e+03
+-2.026600000000000000e+04
+-8.348000000000000000e+03
+-2.431200000000000000e+04
+2.118700000000000000e+04
+3.119300000000000000e+04
+-6.459000000000000000e+03

+ 6 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/biases_data.h

@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_fast_multiple_batches_uneven_buffers_biases[8] =
+    {-2863147, 3727567, 647963, 1377393, -795891, 3272794, 2461992, 3993595};

+ 24 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/config_data.h

@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUT_CH 8
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_IN_CH 8
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_W 5
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_H 5
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_DST_SIZE 216
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_SIZE 200
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUT_ACTIVATION_MIN -17000
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUT_ACTIVATION_MAX 32767
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_BATCHES 3
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_FILTER_X 3
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_FILTER_Y 3
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_STRIDE_X 1
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_STRIDE_Y 1
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_PAD_X 0
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_PAD_Y 0
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUTPUT_W 3
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUTPUT_H 3
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_CH_MULT 1
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_DILATION_X 1
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_DILATION_Y 1

+ 48 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/input_data.h

@@ -0,0 +1,48 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_multiple_batches_uneven_buffers_input[600] = {
+    4514,   -6264,  22437,  -30081, -32731, 23333,  -6303,  -28718, -23436, -19999, -12455, -31875, -9662,  28476,
+    20182,  14105,  -12515, 25491,  17781,  -15185, -19187, -11760, -12623, -9167,  4279,   32628,  -25580, 6303,
+    -26209, -28946, 2036,   8196,   17444,  4500,   3847,   -16039, 28642,  -8731,  22147,  -794,   18883,  10889,
+    12,     -6815,  -829,   -17217, 8164,   19283,  -21613, 17538,  21293,  -5966,  -27753, 9093,   13487,  -4789,
+    6998,   27037,  17152,  -24326, 10389,  -23716, 21672,  -12884, -26884, 12992,  -16245, 11014,  29250,  25922,
+    3946,   -24343, 19052,  12153,  -25022, 4532,   19523,  18166,  -5593,  -6730,  -25788, 12771,  708,    -20099,
+    22645,  11588,  -16490, -7065,  -12015, 15634,  30526,  28277,  24752,  27168,  11921,  14298,  32338,  31185,
+    -26667, -21056, -30282, -6235,  -12590, 2642,   -16833, 4775,   -16080, 18708,  3449,   12126,  26941,  29131,
+    24950,  -12991, 13927,  -30112, -22976, 19872,  15633,  15482,  8058,   -11582, 1227,   28262,  21935,  9733,
+    -28713, 4935,   -25762, -23973, 13208,  -7388,  13598,  17013,  19550,  27131,  -11619, 8183,   31959,  -18481,
+    -14343, 30288,  -27974, -1886,  12533,  14975,  6799,   32063,  23120,  18553,  6933,   -12000, 16429,  -28181,
+    11131,  7045,   8840,   -10511, -21404, 30262,  10241,  -20465, -15976, -4,     -22973, -930,   32038,  1851,
+    -32406, 6362,   -12185, -26227, 31353,  -17698, 20273,  18172,  -14618, -9049,  26638,  31722,  45,     -8207,
+    -7542,  30256,  3977,   -3803,  -6135,  -6511,  -7829,  4396,   -28361, 23249,  2127,   -19569, 15107,  -21679,
+    32234,  8862,   15172,  20597,  2222,   31781,  -16724, -13023, 8696,   5212,   -30365, 7750,   7714,   -4491,
+    16464,  21014,  -21458, -31910, 7383,   -30106, 27007,  354,    -28719, -21813, 2960,   22159,  10619,  30990,
+    -21772, -17183, -25272, 3563,   4941,   12123,  2685,   -23432, -27647, 23780,  17419,  -24723, -27121, -4803,
+    -483,   1208,   -13475, -17251, 19420,  -7826,  17066,  7540,   22448,  2667,   -14807, 8150,   -26884, -7742,
+    11747,  -30398, -13370, 18653,  23167,  30578,  10844,  29101,  9598,   31462,  -5056,  24038,  -8900,  -32158,
+    -24679, -23664, 7600,   -11484, 10454,  23724,  17775,  2612,   -13031, -5835,  20910,  -30789, -3334,  5527,
+    -16826, -23394, -2831,  -31304, 17841,  8784,   4300,   -24155, 2094,   3747,   25420,  16977,  20779,  15670,
+    21855,  -26159, -23941, -27784, -254,   -5185,  -9459,  -25730, 16344,  10418,  -22875, 15273,  -32630, -17483,
+    -22538, -21422, -8329,  10488,  11165,  -2564,  5247,   28814,  3213,   -23692, -30407, 1849,   -18588, -22198,
+    -7200,  -6210,  -14329, 13737,  -3413,  -24955, 14352,  -12349, -3199,  23146,  -25870, 27947,  14498,  1671,
+    -15063, 1331,   -7660,  1660,   -14907, -17594, 28781,  19170,  -26972, 13941,  27888,  -20807, 8100,   20663,
+    4881,   -4664,  24648,  -15348, 26988,  3764,   7064,   -19698, 28662,  4967,   -31447, -3990,  1319,   10216,
+    -2404,  5452,   -6171,  16105,  28784,  -23540, -6038,  -20267, 6141,   22688,  -18637, 16497,  -25589, -26877,
+    8084,   2963,   12956,  -3018,  -13463, -25959, 7863,   -31553, 7655,   -22544, -23891, -22076, -5237,  30994,
+    24142,  21414,  -20809, -17923, -917,   -28987, -6960,  -1285,  14088,  18731,  6776,   -5997,  -320,   13532,
+    7767,   31781,  -8852,  5386,   21099,  -2248,  -21580, -2943,  18782,  -27158, -1761,  -30156, 22769,  -21249,
+    24524,  31175,  -20822, -9223,  6151,   -15552, 24756,  23795,  4022,   19853,  -9712,  30430,  -12391, -4296,
+    -8929,  21537,  14796,  27142,  -10424, -16581, -3041,  -13544, -26992, -18890, -22462, -6774,  11628,  1018,
+    -25621, 18353,  -17908, -19922, 26285,  10790,  24655,  22802,  9859,   15660,  20643,  26268,  -23524, -23895,
+    -11388, -11698, 22408,  -29754, -10488, -30398, 13225,  10755,  -26316, 23102,  27311,  30659,  -32045, -2260,
+    -11195, 23198,  20894,  9002,   22270,  12538,  25690,  12697,  -17387, 31740,  -7549,  -2576,  -10840, 14290,
+    15773,  12220,  -9868,  -30531, 1334,   8932,   10431,  28074,  15104,  -4137,  18301,  32552,  -4026,  -19388,
+    -11370, 30801,  -21928, -3470,  24812,  9177,   -20412, -30332, -2986,  -31258, -11170, 20691,  16482,  32478,
+    -6432,  20833,  5603,   -7879,  16721,  31311,  4882,   -16843, 28349,  26403,  1821,   17745,  15941,  -32241,
+    -31464, 31072,  -882,   -7897,  16385,  -674,   -16858, -13992, -17240, -30765, 25568,  22085,  -18877, -15025,
+    -25175, -4546,  10941,  -11285, -27707, 12754,  -15004, -6357,  10336,  -21846, 25039,  -12294, 29553,  18892,
+    3053,   14698,  32404,  -25825, -2887,  19564,  -23601, 12855,  -11674, -4663,  19532,  -11157, 15800,  17225,
+    28701,  10200,  26472,  -15691, 13591,  -2531,  14006,  -18312, 27497,  11402,  27889,  -23348, -12688, -28639,
+    29653,  4195,   27327,  -12486, 20890,  5154,   23824,  17032,  26154,  16863,  31596,  18617};

+ 6 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/output_mult_data.h

@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_multiple_batches_uneven_buffers_output_mult[8] =
+    {1554034767, 1505895962, 1583401714, 1494720058, 1472755273, 1241592497, 1349384561, 1553647651};

+ 19 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/output_ref_data.h

@@ -0,0 +1,19 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_multiple_batches_uneven_buffers_output_ref[216] = {
+    -568,   772,   -1403, 13165, -9121, 4293,  -1850, 9930,  2107,   -1542, 5116,  -4174, -866,  -4346, 6988,  15155,
+    -3608,  -942,  -36,   7705,  -2565, 6334,  -238,  19493, 4860,   9163,  -6674, 10025, -9959, 597,   864,   10148,
+    -3263,  4730,  9571,  -6187, 4213,  -710,  -1252, 123,   -2355,  9267,  -6429, 14192, 473,   4450,  2632,  -4260,
+    14614,  11928, 6422,  -6437, -600,  -1051, 10500, 7347,  2969,   2477,  -773,  989,   -9584, -981,  -2685, 11460,
+    -3594,  10947, -8858, 4277,  7250,  1390,  1504,  11834, 7273,   13839, 15247, -3489, -9153, 3266,  -995,  -12885,
+    -6326,  4127,  -2876, 12542, -5994, 8896,  4732,  472,   -3883,  13747, -3800, -7439, 3500,  792,   3022,  -341,
+    7108,   14083, -6998, 12815, -383,  3831,  8668,  10743, 1396,   -5059, 5149,  -929,  5182,  1989,  2713,  8745,
+    3535,   12276, -626,  -6494, 1218,  7325,  10026, 5821,  -317,   7016,  3550,  2645,  1713,  -394,  -6310, 2254,
+    -5464,  22817, 5564,  4701,  -7853, -4662, 2489,  697,   8897,   6466,  -7167, 180,   -1425, 7021,  2245,  8549,
+    -5591,  -8521, -5773, -3925, 3446,  1732,  6356,  3144,  1375,   -2206, -1414, 15335, 5451,  7415,  7940,  -9076,
+    -8942,  555,   7867,  -6363, -5179, 1817,  -2758, 509,   -3065,  2414,  -3456, 5141,  1712,  1068,  10512, 13037,
+    1309,   -2115, 7790,  -3325, 3501,  2769,  13190, 15204, -2078,  19354, 7206,  1672,  6171,  3043,  2681,  15449,
+    -8469,  9272,  -6826, -5968, 1125,  4458,  25,    6598,  -13947, 14310, 9582,  3401,  1542,  1838,  16452, 3780,
+    -16248, 2185,  -7568, 3032,  -2282, 9617,  4851,  -7862};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/output_shift_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_multiple_batches_uneven_buffers_output_shift[8] = {-9, -9, -9, -9, -9, -9, -9, -9};

+ 8 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/test_data.h

@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"

+ 9 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/weights_data.h

@@ -0,0 +1,9 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_fast_multiple_batches_uneven_buffers_weights[72] = {
+    -127, 36,  -22,  -31,  -45,  -71, 127, 29,   -91, -95, 127,  -79,  32,   3,    -33,  66,  88,   112,
+    -107, 46,  -21,  48,   -115, -70, -58, 1,    61,  -43, -3,   -127, -60,  -77,  -91,  6,   -81,  127,
+    -18,  -33, -2,   -122, 6,    -39, -3,  -107, -3,  -36, -113, -120, -121, -116, -118, -18, -107, -75,
+    49,   45,  -119, -84,  28,   76,  -72, 26,   63,  127, -107, -127, -126, -43,  127,  -14, 74,   57};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/biases_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_biases[8] = {0, 0, 0, 0, 0, 0, 0, 0};

+ 24 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/config_data.h

@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUT_CH 8
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_IN_CH 8
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_W 4
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_H 4
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_DST_SIZE 144
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_SIZE 128
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUT_ACTIVATION_MIN -17000
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUT_ACTIVATION_MAX 32767
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_BATCHES 3
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_FILTER_X 3
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_FILTER_Y 2
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_STRIDE_X 1
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_STRIDE_Y 1
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_PAD_X 0
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_PAD_Y 0
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUTPUT_W 2
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUTPUT_H 3
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_CH_MULT 1
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_DILATION_X 1
+#define DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_DILATION_Y 1

+ 33 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/input_data.h

@@ -0,0 +1,33 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_input[384] = {
+    15228,  9887,   -14372, -21871, 3624,   19404,  -24752, 6792,   2304,   11364,  -17904, 27312,  3347,   2787,
+    1239,   -19334, -18736, -29873, 4513,   -22520, 20635,  -25069, -20506, 32604,  8989,   -24370, -12386, 22942,
+    -23928, -17316, -13539, 565,    -10994, 18020,  15848,  2587,   -25653, -11289, 21401,  8156,   10604,  -11784,
+    6412,   17804,  -24579, -11870, 30884,  -15700, -24481, 9393,   -1179,  18935,  27416,  28212,  21248,  -25977,
+    15532,  -22273, 1722,   24671,  -11287, 19818,  -16815, 7134,   29821,  28517,  26126,  31069,  2232,   11798,
+    -31387, -30930, -31934, -13164, 4106,   31776,  1665,   3496,   5557,   -6483,  2857,   -9361,  22455,  1088,
+    -11854, -32395, -31934, -6768,  2718,   18221,  -17588, -4744,  18785,  17680,  2941,   -566,   32123,  -9715,
+    -28024, 22297,  -13894, 23195,  -29692, -24999, 3389,   -16523, 2056,   8969,   24372,  -29406, 19406,  5839,
+    -10687, 11846,  -9750,  3677,   31289,  4790,   20334,  3979,   5898,   24649,  16815,  -261,   -10152, -4572,
+    -31033, 30508,  12510,  24356,  16131,  -7928,  -4224,  -14176, 1498,   -20976, -26022, 5688,   19329,  8434,
+    -3564,  182,    -30456, -3243,  -13439, 2203,   -32659, 26996,  -27508, -14286, -19281, -9613,  5932,   -5671,
+    -1911,  -21057, 16671,  -24103, -5186,  7524,   -28100, -10892, -20598, -15598, -15875, -30880, -13841, -28827,
+    -14333, 27181,  30035,  -3785,  31312,  -2263,  4563,   23230,  -12019, -25962, 28245,  -16165, 4083,   -14629,
+    -8474,  -25761, 629,    -23075, -16496, 7769,   -18652, 26470,  13645,  9974,   -16495, 29217,  5501,   -9689,
+    -29066, -22504, -4127,  15462,  7312,   26832,  -2886,  21641,  -30053, -5761,  -15461, -24739, 15177,  -26362,
+    -29655, -14925, -19176, -11009, -10780, -1755,  25994,  8118,   25888,  -10223, -3459,  4649,   31277,  -24723,
+    -23733, 19759,  10713,  -14879, -16563, 26750,  -31696, 17467,  6851,   32614,  2414,   -14640, 1449,   31562,
+    2416,   -6229,  -30244, 10872,  -23742, 13546,  -5255,  -988,   -29111, 31777,  -14270, -23422, -30479, 7836,
+    -24788, 27438,  16712,  5935,   -82,    2890,   32542,  -8496,  -30498, -30608, 8851,   6119,   30742,  18115,
+    -19028, -3732,  -14902, -17744, -2804,  -6760,  31158,  -8253,  7319,   -11690, 17008,  15371,  3112,   32481,
+    634,    30297,  30724,  6967,   -25357, -28063, 20029,  -16954, 7477,   16026,  14921,  29601,  11525,  -13683,
+    -439,   -28034, -12736, 6472,   -6050,  2662,   -25105, -10328, -19314, 2402,   25143,  -19596, -8183,  21806,
+    8676,   -12408, -23692, -21264, -15540, -16548, -32444, 17192,  -21254, 30309,  13083,  17425,  30594,  -25287,
+    -9130,  25737,  -14862, -8582,  -1521,  27627,  26486,  -19678, 28299,  -17489, 19769,  -763,   15311,  14401,
+    -31079, 17747,  5782,   -21421, -9977,  -2059,  17686,  29046,  5543,   -10583, -7955,  -309,   7523,   -19843,
+    -16418, 1120,   3568,   14529,  10457,  -19391, -24139, -3775,  2893,   -12168, -32286, -15386, -27955, 6214,
+    22963,  7639,   8214,   -27765, 754,    -25885, -22424, 25549,  -31498, -17183, -2830,  20309,  6080,   -13750,
+    2449,   -28756, -23493, -22822, 23418,  21320};

+ 6 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/output_mult_data.h

@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_output_mult[8] =
+    {1321976810, 1963386025, 1097671686, 1316291179, 1344377358, 1921833836, 1832060491, 1383450161};

+ 15 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/output_ref_data.h

@@ -0,0 +1,15 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_output_ref[144] = {
+    9898,   -12494, 20130,  -14150, 24454,  14940,  1320,   32767,  7637,  4416,   6792,  10113,  -9105,  2676,  565,
+    -17000, 12809,  2445,   2306,   7232,   4457,   -10952, 16570,  -9537, 7163,   -5208, 2130,   15009,  27017, 4309,
+    8751,   6882,   -3744,  2048,   -17000, 16478,  -14428, 24242,  -1216, -16981, -6438, 3848,   -12394, 8126,  -17000,
+    -1933,  -1974,  -8907,  26527,  -1189,  -17000, 23973,  -17000, 1327,  -4191,  -1999, 1403,   1502,   17036, 839,
+    -16099, 22956,  -17000, 344,    -6628,  -17000, 13221,  -10702, 32767, 6499,   -9820, -17000, -17000, -2337, -17000,
+    -14508, 3034,   1294,   -1537,  31893,  -3000,  -17000, -531,   -470,  -526,   -4772, -4614,  5220,   12959, 1882,
+    18722,  6328,   7409,   10422,  -17000, -17000, -5017,  -7074,  858,   -17000, 23387, 4023,   -4904,  30751, 1271,
+    12768,  13340,  -1802,  8795,   16018,  -7383,  -17000, -17000, -1418, -11483, 14649, -17000, 7342,   2578,  -17000,
+    10468,  2186,   12587,  26124,  12626,  -7891,  -10727, 29700,  32767, 14351,  5004,  1106,   3563,   -5246, 4470,
+    8394,   16173,  4699,   -17000, -15077, 3166,   11129,  11921,  -11093};

+ 6 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/output_shift_data.h

@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_output_shift[8] =
+    {-7, -9, -7, -7, -7, -8, -8, -7};

+ 8 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/test_data.h

@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"

+ 8 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/weights_data.h

@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_weights[48] = {
+    69,   -34,  -108, 96,  -89,  -24,  127, 81,  6,   -125, -65, 77,   84,  -84, 73,  -84,
+    -59,  127,  57,   127, 37,   -91,  -16, 127, -70, -102, 127, -102, -44, 25,  -14, 23,
+    -127, -105, 4,    21,  -127, -127, 97,  50,  -19, -90,  -25, -11,  1,   52,  20,  -68};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/biases_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_fast_null_bias_biases[8] = {0, 0, 0, 0, 0, 0, 0, 0};

+ 24 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/config_data.h

@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#define DW_INT16XINT8_FAST_NULL_BIAS_OUT_CH 8
+#define DW_INT16XINT8_FAST_NULL_BIAS_IN_CH 8
+#define DW_INT16XINT8_FAST_NULL_BIAS_INPUT_W 4
+#define DW_INT16XINT8_FAST_NULL_BIAS_INPUT_H 4
+#define DW_INT16XINT8_FAST_NULL_BIAS_DST_SIZE 72
+#define DW_INT16XINT8_FAST_NULL_BIAS_INPUT_SIZE 128
+#define DW_INT16XINT8_FAST_NULL_BIAS_OUT_ACTIVATION_MIN -17000
+#define DW_INT16XINT8_FAST_NULL_BIAS_OUT_ACTIVATION_MAX 32767
+#define DW_INT16XINT8_FAST_NULL_BIAS_INPUT_BATCHES 1
+#define DW_INT16XINT8_FAST_NULL_BIAS_FILTER_X 2
+#define DW_INT16XINT8_FAST_NULL_BIAS_FILTER_Y 2
+#define DW_INT16XINT8_FAST_NULL_BIAS_STRIDE_X 1
+#define DW_INT16XINT8_FAST_NULL_BIAS_STRIDE_Y 1
+#define DW_INT16XINT8_FAST_NULL_BIAS_PAD_X 0
+#define DW_INT16XINT8_FAST_NULL_BIAS_PAD_Y 0
+#define DW_INT16XINT8_FAST_NULL_BIAS_OUTPUT_W 3
+#define DW_INT16XINT8_FAST_NULL_BIAS_OUTPUT_H 3
+#define DW_INT16XINT8_FAST_NULL_BIAS_CH_MULT 1
+#define DW_INT16XINT8_FAST_NULL_BIAS_INPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_NULL_BIAS_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_NULL_BIAS_DILATION_X 1
+#define DW_INT16XINT8_FAST_NULL_BIAS_DILATION_Y 1

+ 15 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/input_data.h

@@ -0,0 +1,15 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_null_bias_input[128] = {
+    -18577, 22082,  19015,  9939,   -9865,  -20512, -9998,  -22665, -21290, -19672, 4065,   -10617, 16410,
+    31611,  -7291,  21927,  -19420, 476,    -24395, 9624,   27330,  717,    7899,   5117,   24490,  -32507,
+    1771,   -9298,  30059,  4552,   4894,   30983,  27868,  -4336,  28377,  -32599, -28795, -18179, 14068,
+    10048,  -13372, -17813, 5292,   5836,   27257,  -17730, 22580,  30897,  24837,  31407,  -3356,  -27394,
+    23324,  -1121,  30405,  12506,  -5824,  17333,  21316,  -3119,  12142,  -15812, -8362,  13481,  8010,
+    -16409, -1384,  -9159,  -17929, -20469, 5666,   7685,   21715,  8290,   -18605, 29087,  24827,  -7327,
+    -31818, -24731, 28920,  -6734,  31443,  -2070,  -8030,  2869,   -26982, 15269,  10481,  13582,  -30426,
+    4356,   -14758, -11251, 15973,  19675,  -20685, 8198,   -22781, 13469,  -27440, 3354,   -28953, 23398,
+    24900,  25792,  -23244, -2801,  -9152,  17702,  21285,  -23864, -11231, -12826, -2178,  -23462, -9906,
+    8555,   -6807,  -357,   -23882, -32250, 29466,  18255,  15836,  -29135, -18724, -25743};

+ 6 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/output_mult_data.h

@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_null_bias_output_mult[8] =
+    {1866128807, 2080095061, 1878397339, 1222262272, 1181868704, 1168789960, 1840511412, 1934724568};

+ 10 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/output_ref_data.h

@@ -0,0 +1,10 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_null_bias_output_ref[72] = {
+    -866,   -9369,  18025,  14936, 8948,  18291,  1225, -12887, 17485,  11598, -1880,  2217,   24936, 25780, 5322,
+    4380,   -17000, 23903,  -2885, 13396, 11649,  6528, -9735,  9763,   -599,  2135,   3433,   1923,  25943, -8111,
+    -7396,  27277,  -17000, -3610, -2397, -17000, -161, -7144,  9838,   14752, -12791, -2353,  11865, -1897, -17000,
+    -8240,  22236,  2458,   -774,  15668, -17000, 1488, -3932,  -17000, 14050, -159,   -17000, 2347,  -8647, 5208,
+    -17000, -12409, -17000, -1805, -3957, -17000, 5042, 12495,  4144,   -2766, -10838, 25524};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/output_shift_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_null_bias_output_shift[8] = {-8, -8, -8, -7, -7, -7, -8, -8};

+ 8 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/test_data.h

@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"

+ 7 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_null_bias/weights_data.h

@@ -0,0 +1,7 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_fast_null_bias_weights[32] = {-37, -32, 63,  13,  -123, 125,  85,  122, -127, -44, 54,
+                                                       72,  56,  127, 51,  127,  -127, 86,  127, -127, 122, -111,
+                                                       -91, -22, 8,   127, 48,   -25,  127, -47, 127,  -112};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/biases_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_fast_spill_null_bias_biases[5] = {0, 0, 0, 0, 0};

+ 24 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/config_data.h

@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUT_CH 5
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_IN_CH 5
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_W 4
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_H 4
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_DST_SIZE 120
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_SIZE 80
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUT_ACTIVATION_MIN -30000
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUT_ACTIVATION_MAX 32767
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_BATCHES 3
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_FILTER_X 3
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_FILTER_Y 3
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_STRIDE_X 2
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_STRIDE_Y 1
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_PAD_X 0
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_PAD_Y 1
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUTPUT_W 2
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUTPUT_H 4
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_CH_MULT 1
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_DILATION_X 1
+#define DW_INT16XINT8_FAST_SPILL_NULL_BIAS_DILATION_Y 1

+ 23 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/input_data.h

@@ -0,0 +1,23 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_spill_null_bias_input[240] = {
+    -7319,  24299,  -27015, 18164,  -24128, 1886,   -6480,  31922,  -32767, 31539,  -19188, -26478, -2564,  27873,
+    -5675,  7433,   31159,  -13898, 5334,   15962,  -11900, -16708, 5430,   -9333,  -25570, 25191,  31183,  28798,
+    11724,  31452,  -16546, 31812,  -31313, -17968, -3231,  -29414, 8278,   3366,   -19478, 9861,   23600,  31414,
+    1224,   -32139, 28262,  -26215, 3336,   17011,  -27273, -20013, -28111, 20012,  6270,   -16949, -4120,  19892,
+    12244,  -27414, -25615, 18696,  -23660, -20768, 26922,  3737,   15419,  5561,   -2868,  -16641, 8907,   13550,
+    32141,  -17904, -31540, -26793, 25552,  28856,  31576,  6202,   -14888, 23762,  -8198,  9875,   -13126, -16931,
+    -9526,  -16055, 26727,  27041,  -12529, 8293,   5669,   6974,   5096,   23173,  7041,   -23140, -18321, 3989,
+    20797,  -26139, -19480, 15818,  -3253,  -32401, 14789,  -28215, 28503,  7393,   12325,  7971,   -28896, -21213,
+    -21739, 30576,  -10135, 4547,   -1594,  31627,  -7316,  -1917,  31685,  16917,  29387,  17453,  -24030, -28526,
+    -25403, 16473,  -31294, -21366, 29989,  24883,  -4928,  27241,  8780,   -1027,  -18472, -19634, 379,    12938,
+    -10436, -17286, 24274,  21095,  -11272, 19127,  -29437, 26081,  2487,   3074,   -21592, -30225, 2899,   -11512,
+    -26336, -11936, -18925, -21961, -24225, -32092, -7743,  28017,  7708,   3289,   13632,  29825,  -2397,  13927,
+    16813,  15465,  2567,   8863,   20106,  23467,  31344,  -587,   13892,  5223,   32138,  21982,  -1310,  2106,
+    -14719, 1361,   -6400,  -15697, -25569, 4561,   17766,  13162,  15011,  19040,  29142,  9900,   -5609,  20467,
+    14155,  5410,   9317,   20589,  -31716, 29891,  -18122, 11785,  31683,  -6612,  -19522, -31320, 8707,   -12412,
+    30525,  -4086,  -18752, 31711,  -10208, 21016,  -7797,  12539,  32075,  -23034, 24830,  -16196, 28507,  30444,
+    -7734,  -25719, 22082,  19411,  -29535, -21746, -26612, -21514, -23954, -20844, -28254, 17247,  -29444, 30996,
+    -6366,  18489};

+ 9 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/output_mult_data.h

@@ -0,0 +1,9 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_spill_null_bias_output_mult[5] = {1582236111,
+                                                                   1697278459,
+                                                                   1584429450,
+                                                                   1578476245,
+                                                                   1655240699};

+ 14 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/output_ref_data.h

@@ -0,0 +1,14 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_spill_null_bias_output_ref[120] = {
+    14843,  -30000, -4048,  6675,   9088,   -6050,  9733,   -13083, 3811,  5069,  12857,  20067,  -15293, -6180,
+    -6156,  8190,   -30000, 11260,  -260,   -3408,  -10865, -23340, 11763, -9607, -30000, 25650,  -12948, -28105,
+    -14116, 8150,   -26021, 9738,   -6228,  -17489, 22739,  8742,   10161, 7544,  -12009, -4853,  -6881,  -6210,
+    -386,   -9384,  12091,  -11068, -4250,  -14971, 12699,  -9181,  4662,  -6614, -4822,  9865,   -17257, 9171,
+    23259,  10158,  13321,  15819,  -23835, 12345,  1636,   13329,  7029,  -6950, -9693,  -9370,  6622,   -8985,
+    2815,   9062,   12809,  -3809,  -1580,  -5755,  21524,  -2161,  -2403, -7857, -3335,  -2979,  -355,   7144,
+    11953,  8947,   -6468,  8674,   8205,   4131,   -11566, 17473,  3105,  6242,  12932,  10898,  -4314,  -1754,
+    11727,  1264,   9262,   -11550, -11538, 32767,  -9274,  -9578,  12356, -4515, 8844,   -17155, 17149,  19417,
+    1644,   -4235,  1852,   4599,   6590,   -4039,  6022,   12918};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/output_shift_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_spill_null_bias_output_shift[5] = {-8, -8, -8, -8, -8};

+ 8 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/test_data.h

@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"

+ 7 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_spill_null_bias/weights_data.h

@@ -0,0 +1,7 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_fast_spill_null_bias_weights[45] = {
+    -124, 92, 127, 36, 96, 92,  -94, -57, 66, -127, 127, -52, -11, 23,   43,  -105, -110, 3,   38,  -30, 38,  20,  57,
+    58,   81, -88, 34, 13, 127, 72,  97,  21, 109,  63,  63,  95,  -127, -97, -56,  43,   -84, -75, 29,  -69, -126};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/biases_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_fast_stride_null_bias_biases[8] = {0, 0, 0, 0, 0, 0, 0, 0};

+ 24 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/config_data.h

@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUT_CH 8
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_IN_CH 8
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_W 4
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_H 4
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_DST_SIZE 64
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_SIZE 128
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUT_ACTIVATION_MIN -32768
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUT_ACTIVATION_MAX 16000
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_BATCHES 2
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_FILTER_X 2
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_FILTER_Y 2
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_STRIDE_X 2
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_STRIDE_Y 2
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_PAD_X 0
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_PAD_Y 0
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUTPUT_W 2
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUTPUT_H 2
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_CH_MULT 1
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_DILATION_X 1
+#define DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_DILATION_Y 1

+ 24 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/input_data.h

@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_stride_null_bias_input[256] = {
+    13363,  -27062, 13685,  -19256, 13386,  -24814, -16641, -8110,  28707,  2327,   4496,   -28017, -9455,  12014,
+    2666,   28345,  -22544, 6081,   -21473, -19897, -31912, 24589,  1586,   3373,   29895,  17061,  5695,   -28130,
+    20181,  16434,  2913,   11695,  -3875,  -29550, -3478,  -7687,  -25978, -21484, 20305,  27668,  -16821, 10588,
+    11624,  4562,   6732,   -24228, -985,   -15476, -11778, 22395,  1981,   -4412,  -25376, 27774,  5225,   -8601,
+    -6840,  10110,  -8133,  -2168,  -22628, -31249, -19991, 3760,   12116,  13388,  -18548, 15261,  -28209, 17667,
+    -8684,  -8110,  842,    14760,  -26392, -22422, 16303,  -10954, -22988, 6944,   13384,  -8538,  24785,  -7726,
+    -32061, -16167, -9307,  21149,  16388,  -8987,  -22389, 23336,  26324,  -7298,  -17595, 32324,  25170,  22512,
+    -20320, 22861,  8725,   -5506,  10775,  -32248, 8500,   2927,   11179,  -25472, -19600, -11217, -14638, 13955,
+    10356,  26762,  -29159, 22904,  -4691,  -4378,  22744,  16809,  -13861, -24730, -1753,  -14511, -17693, 536,
+    13927,  1868,   -9317,  -19828, 5988,   9475,   21101,  30475,  13260,  -281,   -16225, -26708, 20922,  -17110,
+    29639,  15847,  29534,  27105,  -24441, -5009,  -1065,  11745,  -15487, 20515,  25651,  23515,  4366,   9555,
+    -30672, -9989,  23196,  -13549, -5370,  25387,  1283,   -9193,  12193,  -24875, -12394, -24555, 21193,  2504,
+    -13562, 3829,   -9883,  3917,   -9066,  29147,  27922,  11727,  -30588, 11604,  22163,  -3817,  -31168, 29129,
+    -19606, 18210,  -6665,  -30774, -1956,  22483,  -21591, 23370,  2919,   -16802, 30963,  26431,  31245,  15148,
+    22645,  12593,  1413,   10487,  12879,  -11177, 4104,   -5148,  -12158, -8878,  13007,  16136,  9350,   -6840,
+    3851,   -22577, 2656,   -29706, -18987, 32231,  29063,  -16207, 25159,  23242,  32753,  26738,  23732,  -12515,
+    15953,  32530,  -24277, -8282,  -4189,  -32459, 25953,  -18485, 17003,  9567,   -4648,  -25883, 601,    -7937,
+    -25808, -11793, -1749,  -1667,  17639,  -9424,  -7337,  10549,  30607,  -16174, 725,    9412,   29522,  -1288,
+    -31647, 8673,   -31008, 4926};

+ 6 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/output_mult_data.h

@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_stride_null_bias_output_mult[8] =
+    {1259187472, 1484929796, 1730993539, 1755899937, 1090673129, 1591800876, 1789240575, 1354058059};

+ 10 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/output_ref_data.h

@@ -0,0 +1,10 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_stride_null_bias_output_ref[64] = {
+    6851,   -8524, 1587,  -3803, -556,   10373,  -633,   6595,  11642,  -4598,  -253,  -2753, -12639,
+    -18827, -388,  4217,  -9823, -311,   16000,  -13879, -6339, -4188,  -12402, 4556,  1479,  410,
+    10538,  9838,  -9279, 8021,  -4391,  9789,   3323,   -3344, -15337, -4658,  -8062, -7318, 16000,
+    9947,   13069, 2319,  -1845, -12946, -13451, -10140, 2482,  5172,   -11978, 4807,  -606,  1530,
+    1551,   2971,  6140,  3655,  1978,   -2175,  -2535,  16000, -13683, 4954,   4712,  -3521};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/output_shift_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_stride_null_bias_output_shift[8] = {-8, -8, -8, -8, -8, -8, -8, -8};

+ 8 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/test_data.h

@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"

+ 7 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_stride_null_bias/weights_data.h

@@ -0,0 +1,7 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_fast_stride_null_bias_weights[32] = {-71, 127, -76,  -109, 6,   -127, 88,  -6,   50,  -45, -78,
+                                                              127, -72, -55,  127,  127, -95,  -41, -127, -83, 73,  -70,
+                                                              48,  -6,  -127, -78,  125, -81,  127, 17,   44,  52};

+ 6 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/biases_data.h

@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int64_t dw_int16xint8_fast_test_bias_biases[8] =
+    {-795878, -5334291, 1236759, -5395949, 1406307, -3500813, 1515920, -1889120};

+ 24 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/config_data.h

@@ -0,0 +1,24 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#define DW_INT16XINT8_FAST_TEST_BIAS_OUT_CH 8
+#define DW_INT16XINT8_FAST_TEST_BIAS_IN_CH 8
+#define DW_INT16XINT8_FAST_TEST_BIAS_INPUT_W 4
+#define DW_INT16XINT8_FAST_TEST_BIAS_INPUT_H 4
+#define DW_INT16XINT8_FAST_TEST_BIAS_DST_SIZE 72
+#define DW_INT16XINT8_FAST_TEST_BIAS_INPUT_SIZE 128
+#define DW_INT16XINT8_FAST_TEST_BIAS_OUT_ACTIVATION_MIN -17000
+#define DW_INT16XINT8_FAST_TEST_BIAS_OUT_ACTIVATION_MAX 32767
+#define DW_INT16XINT8_FAST_TEST_BIAS_INPUT_BATCHES 1
+#define DW_INT16XINT8_FAST_TEST_BIAS_FILTER_X 2
+#define DW_INT16XINT8_FAST_TEST_BIAS_FILTER_Y 2
+#define DW_INT16XINT8_FAST_TEST_BIAS_STRIDE_X 1
+#define DW_INT16XINT8_FAST_TEST_BIAS_STRIDE_Y 1
+#define DW_INT16XINT8_FAST_TEST_BIAS_PAD_X 0
+#define DW_INT16XINT8_FAST_TEST_BIAS_PAD_Y 0
+#define DW_INT16XINT8_FAST_TEST_BIAS_OUTPUT_W 3
+#define DW_INT16XINT8_FAST_TEST_BIAS_OUTPUT_H 3
+#define DW_INT16XINT8_FAST_TEST_BIAS_CH_MULT 1
+#define DW_INT16XINT8_FAST_TEST_BIAS_INPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_TEST_BIAS_OUTPUT_OFFSET 0
+#define DW_INT16XINT8_FAST_TEST_BIAS_DILATION_X 1
+#define DW_INT16XINT8_FAST_TEST_BIAS_DILATION_Y 1

+ 15 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/input_data.h

@@ -0,0 +1,15 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_test_bias_input[128] = {
+    26211,  4731,   -22859, 11697,  10585,  11943,  -14100, 26914,  12318,  25865,  21309,  -30278, 16859,
+    -22419, -22520, 23316,  -12644, -21341, 12932,  27271,  -29459, 17002,  647,    30644,  15139,  24653,
+    -10887, 642,    -23018, 3393,   22354,  32020,  29579,  11325,  18381,  -20003, 31111,  12606,  19290,
+    2790,   10494,  -1174,  -11692, 24562,  -25732, 21589,  -30063, 19020,  1608,   -6749,  -17825, 10124,
+    23526,  7341,   -6651,  -1250,  -4309,  16248,  9321,   23417,  18692,  -4153,  13447,  27760,  -32343,
+    75,     -1304,  -18492, -24433, -25032, 21082,  -6482,  7326,   23924,  -15933, -2659,  16010,  23379,
+    20839,  -20544, -18458, -9760,  -12286, -19623, -22278, 830,    201,    -1896,  -10303, 2078,   30440,
+    -32037, 30788,  -3003,  -27929, 9405,   11778,  -18353, -21242, -29960, -12386, -29785, -17179, -4808,
+    11624,  -15088, 4166,   28562,  -27870, 27660,  17064,  -10298, 12385,  8103,   7981,   -20447, -11094,
+    11437,  14070,  17755,  -24940, -31608, 31029,  11698,  -27766, 30068,  -11221, -29017};

+ 6 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/output_mult_data.h

@@ -0,0 +1,6 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_test_bias_output_mult[8] =
+    {1569849899, 1183330628, 1432068229, 1089140565, 1216762566, 1610738992, 1561141528, 1090892326};

+ 10 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/output_ref_data.h

@@ -0,0 +1,10 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q15_t dw_int16xint8_fast_test_bias_output_ref[72] = {
+    -8682,  -15863, 1159,  -11937, 10286,  -9621,  -15424, -7734,  -9691, -17000, 1962,   -2115,  -11569, -4403,  4200,
+    -10421, 1505,   -3791, 5922,   -7826,  888,    -17000, 17309,  -7826, 13275,  -13579, 5015,   -1902,  -12121, -7118,
+    -76,    -1351,  -4281, -17000, 19135,  -2607,  18618,  -17000, 1027,  899,    3107,   -7082,  2951,   285,    -7891,
+    -13555, -1065,  -4924, -4919,  -7022,  13589,  -8487,  18954,  8497,  18572,  -782,   -11946, -13100, 7578,   -5467,
+    -551,   -17000, 8205,  -1650,  -10352, -12264, -10706, -10431, 20654, -4597,  -10298, -6300};

+ 5 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/output_shift_data.h

@@ -0,0 +1,5 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const int32_t dw_int16xint8_fast_test_bias_output_shift[8] = {-8, -8, -8, -9, -8, -8, -8, -8};

+ 8 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/test_data.h

@@ -0,0 +1,8 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#include "biases_data.h"
+#include "config_data.h"
+#include "input_data.h"
+#include "output_mult_data.h"
+#include "output_ref_data.h"
+#include "output_shift_data.h"
+#include "weights_data.h"

+ 7 - 0
CMSIS/NN/Tests/UnitTest/TestCases/TestData/dw_int16xint8_fast_test_bias/weights_data.h

@@ -0,0 +1,7 @@
+// Generated by generate_test_data.py using TFL version 2.9.1 as reference.
+#pragma once
+#include <stdint.h>
+
+const q7_t dw_int16xint8_fast_test_bias_weights[32] = {7,   -117, -110, -16, -102, -127, -3,   -19,  107, -1, -95,
+                                                       127, 68,   -24,  122, -18,  -127, -127, -127, 12,  -3, -49,
+                                                       -22, -127, 1,    15,  -90,  -97,  -127, 84,   127, -38};

+ 24 - 0
CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_fast_s16/Unity/unity_test_arm_depthwise_conv_fast_s16.c

@@ -53,3 +53,27 @@ void test_dw_int16xint8_fast_stride_arm_depthwise_conv_fast_s16(void)
 {
     dw_int16xint8_fast_stride_arm_depthwise_conv_fast_s16();
 }
+/* Test entry point: forwards to the null-bias depthwise test body of the same name without the test_ prefix. */
+void test_dw_int16xint8_fast_null_bias_arm_depthwise_conv_fast_s16(void)
+{
+    dw_int16xint8_fast_null_bias_arm_depthwise_conv_fast_s16();
+}
+/* Test entry point: forwards to the strided, null-bias test body. */
+void test_dw_int16xint8_fast_stride_null_bias_arm_depthwise_conv_fast_s16(void)
+{
+    dw_int16xint8_fast_stride_null_bias_arm_depthwise_conv_fast_s16();
+}
+/* Test entry point: forwards to the "spill", null-bias test body. */
+void test_dw_int16xint8_fast_spill_null_bias_arm_depthwise_conv_fast_s16(void)
+{
+    dw_int16xint8_fast_spill_null_bias_arm_depthwise_conv_fast_s16();
+}
+/* Test entry point: forwards to the test body that uses a non-zero bias vector. */
+void test_dw_int16xint8_fast_test_bias_arm_depthwise_conv_fast_s16(void)
+{
+    dw_int16xint8_fast_test_bias_arm_depthwise_conv_fast_s16();
+}
+/* Test entry point: forwards to the multi-batch, uneven-buffer test body. */
+void test_dw_int16xint8_fast_multiple_batches_uneven_buffers_arm_depthwise_conv_fast_s16(void)
+{
+    dw_int16xint8_fast_multiple_batches_uneven_buffers_arm_depthwise_conv_fast_s16();
+}
+/* Test entry point: forwards to the multi-batch, uneven-buffer, null-bias test body. */
+void test_dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_arm_depthwise_conv_fast_s16(void)
+{
+    dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_arm_depthwise_conv_fast_s16();
+}

+ 533 - 3
CMSIS/NN/Tests/UnitTest/TestCases/test_arm_depthwise_conv_fast_s16/test_arm_depthwise_conv_fast_s16.c

@@ -20,10 +20,30 @@
 #include <unity.h>
 
 #include "../TestData/dw_int16xint8_fast/test_data.h"
+#include "../TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers/test_data.h"
+#include "../TestData/dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias/test_data.h"
+#include "../TestData/dw_int16xint8_fast_null_bias/test_data.h"
 #include "../TestData/dw_int16xint8_fast_spill/test_data.h"
+#include "../TestData/dw_int16xint8_fast_spill_null_bias/test_data.h"
 #include "../TestData/dw_int16xint8_fast_stride/test_data.h"
+#include "../TestData/dw_int16xint8_fast_stride_null_bias/test_data.h"
+#include "../TestData/dw_int16xint8_fast_test_bias/test_data.h"
 #include "../Utils/validate.h"
 
+/*
+ * Select the bias pointer to hand to the kernel under test.
+ *
+ * bias  Candidate bias vector; may be NULL.
+ * size  Number of elements of bias to inspect.
+ *
+ * Returns bias unchanged when at least one of its first size elements is non-zero.
+ * Returns NULL when bias is NULL, when size is not positive, or when every inspected
+ * element is zero, so that an all-zero bias vector is presented to the kernel as
+ * "no bias".
+ */
+const int64_t *get_bias_s64_address(const int64_t *bias, int32_t size)
+{
+    /* Nothing to scan; avoid dereferencing a missing vector. */
+    if (bias == NULL)
+    {
+        return NULL;
+    }
+
+    for (int32_t i = 0; i < size; i++)
+    {
+        if (bias[i] != 0)
+        {
+            return bias;
+        }
+    }
+
+    return NULL;
+}
+
 void dw_int16xint8_fast_arm_depthwise_conv_fast_s16(void)
 {
     const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
@@ -37,7 +57,7 @@ void dw_int16xint8_fast_arm_depthwise_conv_fast_s16(void)
     cmsis_nn_dims bias_dims;
     cmsis_nn_dims output_dims;
 
-    const q63_t *bias_data = dw_int16xint8_fast_biases;
+    const q63_t *bias_data = get_bias_s64_address(dw_int16xint8_fast_biases, DW_INT16XINT8_FAST_OUT_CH);
     const q15_t *input_data = dw_int16xint8_fast_input;
     const q7_t *kernel_data = dw_int16xint8_fast_weights;
     const q15_t *output_ref = dw_int16xint8_fast_output_ref;
@@ -121,7 +141,7 @@ void dw_int16xint8_fast_spill_arm_depthwise_conv_fast_s16(void)
     cmsis_nn_dims bias_dims;
     cmsis_nn_dims output_dims;
 
-    const q63_t *bias_data = dw_int16xint8_fast_spill_biases;
+    const q63_t *bias_data = get_bias_s64_address(dw_int16xint8_fast_spill_biases, DW_INT16XINT8_FAST_SPILL_OUT_CH);
     const q15_t *input_data = dw_int16xint8_fast_spill_input;
     const q7_t *kernel_data = dw_int16xint8_fast_spill_weights;
     const q15_t *output_ref = dw_int16xint8_fast_spill_output_ref;
@@ -205,7 +225,7 @@ void dw_int16xint8_fast_stride_arm_depthwise_conv_fast_s16(void)
     cmsis_nn_dims bias_dims;
     cmsis_nn_dims output_dims;
 
-    const q63_t *bias_data = dw_int16xint8_fast_stride_biases;
+    const q63_t *bias_data = get_bias_s64_address(dw_int16xint8_fast_stride_biases, DW_INT16XINT8_FAST_STRIDE_OUT_CH);
     const q15_t *input_data = dw_int16xint8_fast_stride_input;
     const q7_t *kernel_data = dw_int16xint8_fast_stride_weights;
     const q15_t *output_ref = dw_int16xint8_fast_stride_output_ref;
@@ -275,3 +295,513 @@ void dw_int16xint8_fast_stride_arm_depthwise_conv_fast_s16(void)
     TEST_ASSERT_EQUAL(expected, result);
     TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
 }
+
+/*
+ * Runs the dw_int16xint8_fast_null_bias data set through arm_depthwise_conv_fast_s16 and then
+ * through arm_depthwise_conv_wrapper_s16, checking the status code and comparing the produced
+ * output against the reference data after each call.
+ */
+void dw_int16xint8_fast_null_bias_arm_depthwise_conv_fast_s16(void)
+{
+    const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
+    q15_t output[DW_INT16XINT8_FAST_NULL_BIAS_DST_SIZE] = {0};
+
+    cmsis_nn_context ctx;
+    cmsis_nn_dw_conv_params dw_conv_params;
+    cmsis_nn_per_channel_quant_params quant_params;
+    /* Zero-initialised: bias_dims is never assigned below and the others are only partly
+       assigned, yet all four are passed by address to the functions under test. */
+    cmsis_nn_dims input_dims = {0};
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    /* The helper yields NULL for an all-zero bias array, so the kernels run without a bias vector. */
+    const q63_t *bias_data =
+        get_bias_s64_address(dw_int16xint8_fast_null_bias_biases, DW_INT16XINT8_FAST_NULL_BIAS_OUT_CH);
+    const q15_t *input_data = dw_int16xint8_fast_null_bias_input;
+    const q7_t *kernel_data = dw_int16xint8_fast_null_bias_weights;
+    const q15_t *output_ref = dw_int16xint8_fast_null_bias_output_ref;
+    const int32_t output_ref_size = DW_INT16XINT8_FAST_NULL_BIAS_DST_SIZE;
+
+    input_dims.n = DW_INT16XINT8_FAST_NULL_BIAS_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_FAST_NULL_BIAS_INPUT_W;
+    input_dims.h = DW_INT16XINT8_FAST_NULL_BIAS_INPUT_H;
+    input_dims.c = DW_INT16XINT8_FAST_NULL_BIAS_IN_CH;
+    filter_dims.w = DW_INT16XINT8_FAST_NULL_BIAS_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_FAST_NULL_BIAS_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_FAST_NULL_BIAS_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_FAST_NULL_BIAS_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_FAST_NULL_BIAS_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_FAST_NULL_BIAS_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_FAST_NULL_BIAS_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_FAST_NULL_BIAS_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_FAST_NULL_BIAS_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_FAST_NULL_BIAS_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_FAST_NULL_BIAS_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_FAST_NULL_BIAS_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_FAST_NULL_BIAS_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_FAST_NULL_BIAS_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_FAST_NULL_BIAS_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_FAST_NULL_BIAS_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_fast_null_bias_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_fast_null_bias_output_shift;
+
+    /* Pass 1: call the fast kernel directly with a scratch buffer of the size it asks for. */
+    int buf_size = arm_depthwise_conv_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+    ctx.buf = malloc(buf_size);
+
+    arm_cmsis_nn_status result = arm_depthwise_conv_fast_s16(&ctx,
+                                                             &dw_conv_params,
+                                                             &quant_params,
+                                                             &input_dims,
+                                                             input_data,
+                                                             &filter_dims,
+                                                             kernel_data,
+                                                             &bias_dims,
+                                                             bias_data,
+                                                             &output_dims,
+                                                             output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+    /* Clear the buffer so the second comparison cannot succeed on values left behind by pass 1. */
+    for (int32_t i = 0; i < output_ref_size; i++)
+    {
+        output[i] = 0;
+    }
+
+    /* Pass 2: same inputs through the wrapper, which sizes its own scratch buffer. */
+    buf_size = arm_depthwise_conv_wrapper_s16_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims, &output_dims);
+    ctx.buf = malloc(buf_size);
+
+    result = arm_depthwise_conv_wrapper_s16(&ctx,
+                                            &dw_conv_params,
+                                            &quant_params,
+                                            &input_dims,
+                                            input_data,
+                                            &filter_dims,
+                                            kernel_data,
+                                            &bias_dims,
+                                            bias_data,
+                                            &output_dims,
+                                            output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+}
+
+/*
+ * Runs the dw_int16xint8_fast_stride_null_bias data set through arm_depthwise_conv_fast_s16 and
+ * then through arm_depthwise_conv_wrapper_s16, checking the status code and comparing the
+ * produced output against the reference data after each call.
+ */
+void dw_int16xint8_fast_stride_null_bias_arm_depthwise_conv_fast_s16(void)
+{
+    const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
+    q15_t output[DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_DST_SIZE] = {0};
+
+    cmsis_nn_context ctx;
+    cmsis_nn_dw_conv_params dw_conv_params;
+    cmsis_nn_per_channel_quant_params quant_params;
+    /* Zero-initialised: bias_dims is never assigned below and the others are only partly
+       assigned, yet all four are passed by address to the functions under test. */
+    cmsis_nn_dims input_dims = {0};
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    /* The helper yields NULL for an all-zero bias array, so the kernels run without a bias vector. */
+    const q63_t *bias_data =
+        get_bias_s64_address(dw_int16xint8_fast_stride_null_bias_biases, DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUT_CH);
+    const q15_t *input_data = dw_int16xint8_fast_stride_null_bias_input;
+    const q7_t *kernel_data = dw_int16xint8_fast_stride_null_bias_weights;
+    const q15_t *output_ref = dw_int16xint8_fast_stride_null_bias_output_ref;
+    const int32_t output_ref_size = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_DST_SIZE;
+
+    input_dims.n = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_W;
+    input_dims.h = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_H;
+    input_dims.c = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_IN_CH;
+    filter_dims.w = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_FAST_STRIDE_NULL_BIAS_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_fast_stride_null_bias_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_fast_stride_null_bias_output_shift;
+
+    /* Pass 1: call the fast kernel directly with a scratch buffer of the size it asks for. */
+    int buf_size = arm_depthwise_conv_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+    ctx.buf = malloc(buf_size);
+
+    arm_cmsis_nn_status result = arm_depthwise_conv_fast_s16(&ctx,
+                                                             &dw_conv_params,
+                                                             &quant_params,
+                                                             &input_dims,
+                                                             input_data,
+                                                             &filter_dims,
+                                                             kernel_data,
+                                                             &bias_dims,
+                                                             bias_data,
+                                                             &output_dims,
+                                                             output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+    /* Clear the buffer so the second comparison cannot succeed on values left behind by pass 1. */
+    for (int32_t i = 0; i < output_ref_size; i++)
+    {
+        output[i] = 0;
+    }
+
+    /* Pass 2: same inputs through the wrapper, which sizes its own scratch buffer. */
+    buf_size = arm_depthwise_conv_wrapper_s16_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims, &output_dims);
+    ctx.buf = malloc(buf_size);
+
+    result = arm_depthwise_conv_wrapper_s16(&ctx,
+                                            &dw_conv_params,
+                                            &quant_params,
+                                            &input_dims,
+                                            input_data,
+                                            &filter_dims,
+                                            kernel_data,
+                                            &bias_dims,
+                                            bias_data,
+                                            &output_dims,
+                                            output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+}
+
+/**
+ * Unit test for the 'dw_int16xint8_fast_spill_null_bias' data set.
+ *
+ * Calls arm_depthwise_conv_fast_s16() directly and then arm_depthwise_conv_wrapper_s16() with the same
+ * arguments. Each call must return ARM_CMSIS_NN_SUCCESS and produce output equal to the reference data.
+ */
+void dw_int16xint8_fast_spill_null_bias_arm_depthwise_conv_fast_s16(void)
+{
+    const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
+    q15_t output[DW_INT16XINT8_FAST_SPILL_NULL_BIAS_DST_SIZE] = {0};
+
+    cmsis_nn_context ctx;
+    cmsis_nn_dw_conv_params dw_conv_params;
+    cmsis_nn_per_channel_quant_params quant_params;
+    cmsis_nn_dims input_dims;
+    /* Zero-initialized: not every field of these is assigned below, yet they are passed by pointer. */
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    const q63_t *bias_data =
+        get_bias_s64_address(dw_int16xint8_fast_spill_null_bias_biases, DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUT_CH);
+    const q15_t *input_data = dw_int16xint8_fast_spill_null_bias_input;
+    const q7_t *kernel_data = dw_int16xint8_fast_spill_null_bias_weights;
+    const q15_t *output_ref = dw_int16xint8_fast_spill_null_bias_output_ref;
+    const int32_t output_ref_size = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_DST_SIZE;
+
+    input_dims.n = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_W;
+    input_dims.h = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_H;
+    input_dims.c = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_IN_CH;
+    filter_dims.w = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_FAST_SPILL_NULL_BIAS_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_fast_spill_null_bias_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_fast_spill_null_bias_output_shift;
+
+    /* First pass: call the fast kernel directly with a scratch buffer of the size it requests. */
+    int buf_size = arm_depthwise_conv_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+    ctx.buf = malloc(buf_size);
+
+    arm_cmsis_nn_status result = arm_depthwise_conv_fast_s16(&ctx,
+                                                             &dw_conv_params,
+                                                             &quant_params,
+                                                             &input_dims,
+                                                             input_data,
+                                                             &filter_dims,
+                                                             kernel_data,
+                                                             &bias_dims,
+                                                             bias_data,
+                                                             &output_dims,
+                                                             output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+    /* Clear the output so the wrapper call below is validated on its own result rather than on the
+     * data left behind by the direct call above. */
+    for (int32_t i = 0; i < output_ref_size; i++)
+    {
+        output[i] = 0;
+    }
+
+    /* Second pass: same arguments through the wrapper, with the wrapper's own buffer size. */
+    buf_size = arm_depthwise_conv_wrapper_s16_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims, &output_dims);
+    ctx.buf = malloc(buf_size);
+
+    result = arm_depthwise_conv_wrapper_s16(&ctx,
+                                            &dw_conv_params,
+                                            &quant_params,
+                                            &input_dims,
+                                            input_data,
+                                            &filter_dims,
+                                            kernel_data,
+                                            &bias_dims,
+                                            bias_data,
+                                            &output_dims,
+                                            output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+}
+
+/**
+ * Unit test for the 'dw_int16xint8_fast_test_bias' data set.
+ *
+ * Calls arm_depthwise_conv_fast_s16() directly and then arm_depthwise_conv_wrapper_s16() with the same
+ * arguments. Each call must return ARM_CMSIS_NN_SUCCESS and produce output equal to the reference data.
+ */
+void dw_int16xint8_fast_test_bias_arm_depthwise_conv_fast_s16(void)
+{
+    const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
+    q15_t output[DW_INT16XINT8_FAST_TEST_BIAS_DST_SIZE] = {0};
+
+    cmsis_nn_context ctx;
+    cmsis_nn_dw_conv_params dw_conv_params;
+    cmsis_nn_per_channel_quant_params quant_params;
+    cmsis_nn_dims input_dims;
+    /* Zero-initialized: not every field of these is assigned below, yet they are passed by pointer. */
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    const q63_t *bias_data =
+        get_bias_s64_address(dw_int16xint8_fast_test_bias_biases, DW_INT16XINT8_FAST_TEST_BIAS_OUT_CH);
+    const q15_t *input_data = dw_int16xint8_fast_test_bias_input;
+    const q7_t *kernel_data = dw_int16xint8_fast_test_bias_weights;
+    const q15_t *output_ref = dw_int16xint8_fast_test_bias_output_ref;
+    const int32_t output_ref_size = DW_INT16XINT8_FAST_TEST_BIAS_DST_SIZE;
+
+    input_dims.n = DW_INT16XINT8_FAST_TEST_BIAS_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_FAST_TEST_BIAS_INPUT_W;
+    input_dims.h = DW_INT16XINT8_FAST_TEST_BIAS_INPUT_H;
+    input_dims.c = DW_INT16XINT8_FAST_TEST_BIAS_IN_CH;
+    filter_dims.w = DW_INT16XINT8_FAST_TEST_BIAS_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_FAST_TEST_BIAS_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_FAST_TEST_BIAS_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_FAST_TEST_BIAS_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_FAST_TEST_BIAS_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_FAST_TEST_BIAS_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_FAST_TEST_BIAS_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_FAST_TEST_BIAS_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_FAST_TEST_BIAS_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_FAST_TEST_BIAS_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_FAST_TEST_BIAS_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_FAST_TEST_BIAS_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_FAST_TEST_BIAS_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_FAST_TEST_BIAS_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_FAST_TEST_BIAS_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_FAST_TEST_BIAS_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_fast_test_bias_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_fast_test_bias_output_shift;
+
+    /* First pass: call the fast kernel directly with a scratch buffer of the size it requests. */
+    int buf_size = arm_depthwise_conv_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+    ctx.buf = malloc(buf_size);
+
+    arm_cmsis_nn_status result = arm_depthwise_conv_fast_s16(&ctx,
+                                                             &dw_conv_params,
+                                                             &quant_params,
+                                                             &input_dims,
+                                                             input_data,
+                                                             &filter_dims,
+                                                             kernel_data,
+                                                             &bias_dims,
+                                                             bias_data,
+                                                             &output_dims,
+                                                             output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+    /* Clear the output so the wrapper call below is validated on its own result rather than on the
+     * data left behind by the direct call above. */
+    for (int32_t i = 0; i < output_ref_size; i++)
+    {
+        output[i] = 0;
+    }
+
+    /* Second pass: same arguments through the wrapper, with the wrapper's own buffer size. */
+    buf_size = arm_depthwise_conv_wrapper_s16_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims, &output_dims);
+    ctx.buf = malloc(buf_size);
+
+    result = arm_depthwise_conv_wrapper_s16(&ctx,
+                                            &dw_conv_params,
+                                            &quant_params,
+                                            &input_dims,
+                                            input_data,
+                                            &filter_dims,
+                                            kernel_data,
+                                            &bias_dims,
+                                            bias_data,
+                                            &output_dims,
+                                            output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+}
+
+/**
+ * Unit test for the 'dw_int16xint8_fast_multiple_batches_uneven_buffers' data set.
+ *
+ * Calls arm_depthwise_conv_fast_s16() directly and then arm_depthwise_conv_wrapper_s16() with the same
+ * arguments. Each call must return ARM_CMSIS_NN_SUCCESS and produce output equal to the reference data.
+ */
+void dw_int16xint8_fast_multiple_batches_uneven_buffers_arm_depthwise_conv_fast_s16(void)
+{
+    const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
+    q15_t output[DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_DST_SIZE] = {0};
+
+    cmsis_nn_context ctx;
+    cmsis_nn_dw_conv_params dw_conv_params;
+    cmsis_nn_per_channel_quant_params quant_params;
+    cmsis_nn_dims input_dims;
+    /* Zero-initialized: not every field of these is assigned below, yet they are passed by pointer. */
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    const q63_t *bias_data = get_bias_s64_address(dw_int16xint8_fast_multiple_batches_uneven_buffers_biases,
+                                                  DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUT_CH);
+    const q15_t *input_data = dw_int16xint8_fast_multiple_batches_uneven_buffers_input;
+    const q7_t *kernel_data = dw_int16xint8_fast_multiple_batches_uneven_buffers_weights;
+    const q15_t *output_ref = dw_int16xint8_fast_multiple_batches_uneven_buffers_output_ref;
+    const int32_t output_ref_size = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_DST_SIZE;
+
+    input_dims.n = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_W;
+    input_dims.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_H;
+    input_dims.c = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_IN_CH;
+    filter_dims.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_fast_multiple_batches_uneven_buffers_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_fast_multiple_batches_uneven_buffers_output_shift;
+
+    /* First pass: call the fast kernel directly with a scratch buffer of the size it requests. */
+    int buf_size = arm_depthwise_conv_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+    ctx.buf = malloc(buf_size);
+
+    arm_cmsis_nn_status result = arm_depthwise_conv_fast_s16(&ctx,
+                                                             &dw_conv_params,
+                                                             &quant_params,
+                                                             &input_dims,
+                                                             input_data,
+                                                             &filter_dims,
+                                                             kernel_data,
+                                                             &bias_dims,
+                                                             bias_data,
+                                                             &output_dims,
+                                                             output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+    /* Clear the output so the wrapper call below is validated on its own result rather than on the
+     * data left behind by the direct call above. */
+    for (int32_t i = 0; i < output_ref_size; i++)
+    {
+        output[i] = 0;
+    }
+
+    /* Second pass: same arguments through the wrapper, with the wrapper's own buffer size. */
+    buf_size = arm_depthwise_conv_wrapper_s16_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims, &output_dims);
+    ctx.buf = malloc(buf_size);
+
+    result = arm_depthwise_conv_wrapper_s16(&ctx,
+                                            &dw_conv_params,
+                                            &quant_params,
+                                            &input_dims,
+                                            input_data,
+                                            &filter_dims,
+                                            kernel_data,
+                                            &bias_dims,
+                                            bias_data,
+                                            &output_dims,
+                                            output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+}
+
+/**
+ * Unit test for the 'dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias' data set.
+ *
+ * Calls arm_depthwise_conv_fast_s16() directly and then arm_depthwise_conv_wrapper_s16() with the same
+ * arguments. Each call must return ARM_CMSIS_NN_SUCCESS and produce output equal to the reference data.
+ */
+void dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_arm_depthwise_conv_fast_s16(void)
+{
+    const arm_cmsis_nn_status expected = ARM_CMSIS_NN_SUCCESS;
+    q15_t output[DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_DST_SIZE] = {0};
+
+    cmsis_nn_context ctx;
+    cmsis_nn_dw_conv_params dw_conv_params;
+    cmsis_nn_per_channel_quant_params quant_params;
+    cmsis_nn_dims input_dims;
+    /* Zero-initialized: not every field of these is assigned below, yet they are passed by pointer. */
+    cmsis_nn_dims filter_dims = {0};
+    cmsis_nn_dims bias_dims = {0};
+    cmsis_nn_dims output_dims = {0};
+
+    const q63_t *bias_data = get_bias_s64_address(dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_biases,
+                                                  DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUT_CH);
+    const q15_t *input_data = dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_input;
+    const q7_t *kernel_data = dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_weights;
+    const q15_t *output_ref = dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_output_ref;
+    const int32_t output_ref_size = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_DST_SIZE;
+
+    input_dims.n = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_BATCHES;
+    input_dims.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_W;
+    input_dims.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_H;
+    input_dims.c = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_IN_CH;
+    filter_dims.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_FILTER_X;
+    filter_dims.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_FILTER_Y;
+    output_dims.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUTPUT_W;
+    output_dims.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUTPUT_H;
+    output_dims.c = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUT_CH;
+
+    dw_conv_params.padding.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_PAD_X;
+    dw_conv_params.padding.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_PAD_Y;
+    dw_conv_params.stride.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_STRIDE_X;
+    dw_conv_params.stride.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_STRIDE_Y;
+    dw_conv_params.dilation.w = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_DILATION_X;
+    dw_conv_params.dilation.h = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_DILATION_Y;
+
+    dw_conv_params.ch_mult = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_CH_MULT;
+
+    dw_conv_params.input_offset = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_INPUT_OFFSET;
+    dw_conv_params.output_offset = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUTPUT_OFFSET;
+    dw_conv_params.activation.min = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUT_ACTIVATION_MIN;
+    dw_conv_params.activation.max = DW_INT16XINT8_FAST_MULTIPLE_BATCHES_UNEVEN_BUFFERS_NULL_BIAS_OUT_ACTIVATION_MAX;
+    quant_params.multiplier = (int32_t *)dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_output_mult;
+    quant_params.shift = (int32_t *)dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias_output_shift;
+
+    /* First pass: call the fast kernel directly with a scratch buffer of the size it requests. */
+    int buf_size = arm_depthwise_conv_fast_s16_get_buffer_size(&input_dims, &filter_dims);
+    ctx.buf = malloc(buf_size);
+
+    arm_cmsis_nn_status result = arm_depthwise_conv_fast_s16(&ctx,
+                                                             &dw_conv_params,
+                                                             &quant_params,
+                                                             &input_dims,
+                                                             input_data,
+                                                             &filter_dims,
+                                                             kernel_data,
+                                                             &bias_dims,
+                                                             bias_data,
+                                                             &output_dims,
+                                                             output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+
+    /* Clear the output so the wrapper call below is validated on its own result rather than on the
+     * data left behind by the direct call above. */
+    for (int32_t i = 0; i < output_ref_size; i++)
+    {
+        output[i] = 0;
+    }
+
+    /* Second pass: same arguments through the wrapper, with the wrapper's own buffer size. */
+    buf_size = arm_depthwise_conv_wrapper_s16_get_buffer_size(&dw_conv_params, &input_dims, &filter_dims, &output_dims);
+    ctx.buf = malloc(buf_size);
+
+    result = arm_depthwise_conv_wrapper_s16(&ctx,
+                                            &dw_conv_params,
+                                            &quant_params,
+                                            &input_dims,
+                                            input_data,
+                                            &filter_dims,
+                                            kernel_data,
+                                            &bias_dims,
+                                            bias_data,
+                                            &output_dims,
+                                            output);
+
+    free(ctx.buf);
+    TEST_ASSERT_EQUAL(expected, result);
+    TEST_ASSERT_TRUE(validate_s16(output, output_ref, output_ref_size));
+}

+ 34 - 0
CMSIS/NN/Tests/UnitTest/generate_test_data.py

@@ -1298,16 +1298,50 @@ def load_all_testdatasets():
                                           w_y=2, stride_x=1, stride_y=1, pad=False, randmin=INT16_MIN,
                                           randmax=INT16_MAX, out_activation_min=-17000,
                                           out_activation_max=32767, int16xint8=True)
+    dataset = 'dw_int16xint8_fast_multiple_batches_uneven_buffers'
+    TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=8, out_ch=8, x_in=5, y_in=5, w_x=3,
+                                          w_y=3, stride_x=1, stride_y=1, pad=False, randmin=INT16_MIN,
+                                          randmax=INT16_MAX, out_activation_min=-17000,
+                                          out_activation_max=32767, int16xint8=True, batches=3)
+    dataset = 'dw_int16xint8_fast_multiple_batches_uneven_buffers_null_bias'
+    TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=8, out_ch=8, x_in=4, y_in=4, w_x=3,
+                                          w_y=2, stride_x=1, stride_y=1, pad=False, randmin=INT16_MIN,
+                                          randmax=INT16_MAX, out_activation_min=-17000,
+                                          out_activation_max=32767, int16xint8=True, batches=3, generate_bias=False)
+
+    dataset = 'dw_int16xint8_fast_test_bias'
+    nbr_of_out_channels = 8;
+    bias=[i for i in range(nbr_of_out_channels)];
+    TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=8, out_ch=nbr_of_out_channels, x_in=4, y_in=4, w_x=2,
+                                          w_y=2, stride_x=1, stride_y=1, pad=False, randmin=INT16_MIN,
+                                          randmax=INT16_MAX, out_activation_min=-17000,
+                                          out_activation_max=32767, int16xint8=True, generate_bias=bias)
+
+    dataset = 'dw_int16xint8_fast_null_bias'
+    TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=8, out_ch=8, x_in=4, y_in=4, w_x=2,
+                                          w_y=2, stride_x=1, stride_y=1, pad=False, randmin=INT16_MIN,
+                                          randmax=INT16_MAX, out_activation_min=-17000,
+                                          out_activation_max=32767, int16xint8=True, generate_bias=False)
     dataset = 'dw_int16xint8_fast_stride'
     TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=8, out_ch=8, x_in=4, y_in=4, w_x=2,
                                           w_y=2, stride_x=2, stride_y=2, pad=True, randmin=INT16_MIN,
                                           randmax=INT16_MAX, batches=2, out_activation_min=INT16_MIN,
                                           out_activation_max=16000, int16xint8=True)
+    dataset = 'dw_int16xint8_fast_stride_null_bias'
+    TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=8, out_ch=8, x_in=4, y_in=4, w_x=2,
+                                          w_y=2, stride_x=2, stride_y=2, pad=True, randmin=INT16_MIN,
+                                          randmax=INT16_MAX, batches=2, out_activation_min=INT16_MIN,
+                                          out_activation_max=16000, int16xint8=True, generate_bias=False)
     dataset = 'dw_int16xint8_fast_spill'
     TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=5, out_ch=5, x_in=4, y_in=4, w_x=3,
                                           w_y=3, stride_x=2, stride_y=1, pad=True, randmin=INT16_MIN,
                                           randmax=INT16_MAX, batches=3, out_activation_min=-30000,
                                           out_activation_max=32767, int16xint8=True)
+    dataset = 'dw_int16xint8_fast_spill_null_bias'
+    TESTDATA_SETS[dataset] = ConvSettings(dataset, type_of_test, args, in_ch=5, out_ch=5, x_in=4, y_in=4, w_x=3,
+                                          w_y=3, stride_x=2, stride_y=1, pad=True, randmin=INT16_MIN,
+                                          randmax=INT16_MAX, batches=3, out_activation_min=-30000,
+                                          out_activation_max=32767, int16xint8=True, generate_bias=False)
 
     type_of_test = 'fully_connected'
     dataset = 'fully_connected'