@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2010-2020 Arm Limited or its affiliates. All rights reserved.
+ * Copyright (C) 2010-2021 Arm Limited or its affiliates.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -21,8 +21,8 @@
  * Title: arm_nn_mat_mult_s8.c
  * Description: General Matrix-multiplication function
  *
- * $Date: 09. October 2020
- * $Revision: V.2.0.5
+ * $Date: 27. October 2021
+ * $Revision: V.2.0.6
  *
  * Target Processor: Cortex-M cores
  * -------------------------------------------------------------------- */
@@ -76,24 +76,24 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
  const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
  row_len_tmp -= 8;

- int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
- ip_r0 += 8;
-
- int16x8_t c0 = vldrbq_z_s16(ip_c0, p);
+ int16x8_t c0 = vldrbq_s16(ip_c0);
  ip_c0 += 8;
- c0 = vaddq_m_s16(vuninitializedq_s16(), c0, offset, p);
+ c0 = vaddq_s16(c0, offset);

- int16x8_t c1 = vldrbq_z_s16(ip_c1, p);
+ int16x8_t c1 = vldrbq_s16(ip_c1);
  ip_c1 += 8;
- c1 = vaddq_m_s16(vuninitializedq_s16(), c1, offset, p);
+ c1 = vaddq_s16(c1, offset);

- int16x8_t c2 = vldrbq_z_s16(ip_c2, p);
+ int16x8_t c2 = vldrbq_s16(ip_c2);
  ip_c2 += 8;
- c2 = vaddq_m_s16(vuninitializedq_s16(), c2, offset, p);
+ c2 = vaddq_s16(c2, offset);

- int16x8_t c3 = vldrbq_z_s16(ip_c3, p);
+ int16x8_t c3 = vldrbq_s16(ip_c3);
  ip_c3 += 8;
- c3 = vaddq_m_s16(vuninitializedq_s16(), c3, offset, p);
+ c3 = vaddq_s16(c3, offset);
+
+ int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
+ ip_r0 += 8;

  acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
  acc_1 = vmladavaq_p_s16(acc_1, r0, c1, p);
@@ -136,12 +136,12 @@ q7_t *arm_nn_mat_mult_s8(const q7_t *input_row,
  const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
  row_len_tmp -= 8;

- int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
- ip_r0 += 8;
- int16x8_t c0 = vldrbq_z_s16(ip_c0, p);
+ int16x8_t c0 = vldrbq_s16(ip_c0);
  ip_c0 += 8;
+ c0 = vaddq_s16(c0, offset);

- c0 = vaddq_m_s16(vuninitializedq_s16(), c0, offset, p);
+ int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
+ ip_r0 += 8;
  acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
  }
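
For reference, both hunks reorder the MVE inner loop so that the column data is loaded and offset with plain (unpredicated) vldrbq_s16/vaddq_s16, while only the row load (vldrbq_z_s16) and the dot-product accumulation (vmladavaq_p_s16) stay under the tail predicate p. Below is a minimal sketch of that pattern, not the actual CMSIS-NN function: the loop body mirrors the hunks above and reuses their names (ip_r0, ip_c0, col_offset, row_len_tmp, acc_0), but the wrapper function and its signature are hypothetical, and it assumes the column buffer can be read in full 8-lane chunks.

#include <arm_mve.h>
#include <stdint.h>

/* Hypothetical wrapper around the inner-loop pattern from the diff above.
 * Only the single-column accumulator (acc_0) is shown. */
static int32_t mat_mult_inner_loop_sketch(const int8_t *ip_r0,
                                          const int8_t *ip_c0,
                                          int32_t col_offset,
                                          int32_t row_len)
{
    int32_t acc_0 = 0;
    int32_t row_len_tmp = row_len;
    const int32_t row_loop_cnt = (row_len + 7) / 8;

    for (int32_t i = 0; i < row_loop_cnt; i++)
    {
        /* Predicate covers only the lanes still inside the row. */
        const mve_pred16_t p = vctp16q((uint32_t)row_len_tmp);
        const int16x8_t offset = vdupq_m_n_s16(vuninitializedq_s16(), col_offset, p);
        row_len_tmp -= 8;

        /* Column data: plain widening load plus offset add, no predication
         * (assumes the column buffer is readable in full 8-lane chunks). */
        int16x8_t c0 = vldrbq_s16(ip_c0);
        ip_c0 += 8;
        c0 = vaddq_s16(c0, offset);

        /* Row data: zero-predicated load, so tail lanes become zero. */
        const int16x8_t r0 = vldrbq_z_s16(ip_r0, p);
        ip_r0 += 8;

        /* Predicated multiply-accumulate: lanes beyond row_len never contribute. */
        acc_0 = vmladavaq_p_s16(acc_0, r0, c0, p);
    }
    return acc_0;
}

With this split, lanes past the end of the row are still excluded from the result because the accumulation itself remains predicated; the column lanes outside the row are simply ignored rather than individually masked on every load and add.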