| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640 |
- /******************************************************************************
- * @file matrix_utils.h
- * @brief Public header file for CMSIS DSP Library
- * @version V1.11.0
- * @date 30 May 2022
- * Target Processor: Cortex-M and Cortex-A cores
- ******************************************************************************/
- /*
- * Copyright (c) 2010-2022 Arm Limited or its affiliates. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #ifndef _MATRIX_UTILS_H_
- #define _MATRIX_UTILS_H_
- #include "arm_math_types.h"
- #include "arm_math_memory.h"
- #include "dsp/none.h"
- #include "dsp/utils.h"
- #ifdef __cplusplus
- extern "C"
- {
- #endif
/**
 * @brief Address of the element at (ROW, COL) of a row-major matrix.
 *
 * @param A    Pointer to a matrix instance exposing pData and numCols.
 * @param ROW  Zero-based row index.
 * @param COL  Zero-based column index.
 *
 * The expansion is fully parenthesized so that it composes with postfix
 * operators, e.g. ELEM(A,r,c)[1]. A is evaluated twice, so it must not have
 * side effects.
 */
#define ELEM(A,ROW,COL) (&((A)->pData[(A)->numCols * (ROW) + (COL)]))
/**
 * @brief Scale the lower part of one matrix column in place (type-generic).
 *
 * Starting at row ROW, every element of column i down to the last row of A
 * is multiplied by v.
 *
 * @param T     Element type of A->pData.
 * @param CAST  Cast placed in front of v before the multiply; may be empty.
 * @param A     Pointer to a matrix instance (pData, numRows, numCols).
 * @param ROW   First row to scale.
 * @param v     Scale factor (evaluated once per scaled element).
 * @param i     Column index.
 *
 * All arguments are parenthesized and ROW is read once, so an expression
 * such as "k + 1" yields the expected row count and start offset.
 */
#define SCALE_COL_T(T,CAST,A,ROW,v,i)                  \
{                                                      \
    T *_pElt = (A)->pData;                             \
    const int32_t _numCols = (A)->numCols;             \
    const int32_t _first = (ROW);                      \
    const int32_t _nb = (A)->numRows - _first;         \
    int32_t _w;                                        \
                                                       \
    _pElt += (i) + _numCols * _first;                  \
                                                       \
    for (_w = 0; _w < _nb; _w++)                       \
    {                                                  \
        *_pElt *= CAST (v);                            \
        _pElt += _numCols;                             \
    }                                                  \
}
/**
 * @brief Copy the lower part of one matrix column into a contiguous buffer
 *        (type-generic).
 *
 * Elements (ROW, COL), (ROW+1, COL), ... up to the last row of A are written
 * to DST[0], DST[1], ...
 *
 * @param T    Element type of A->pData and DST.
 * @param A    Pointer to a matrix instance (pData, numRows, numCols).
 * @param ROW  First row to copy.
 * @param COL  Column to copy.
 * @param DST  Destination buffer; must hold at least numRows - ROW elements.
 *
 * ROW, COL and DST are parenthesized so expression arguments such as
 * "k + 1" produce the intended source offset.
 */
#define COPY_COL_T(T,A,ROW,COL,DST)                               \
{                                                                 \
    uint32_t _row;                                                \
    T *_pDst = (DST);                                             \
    T *_pSrc = (A)->pData + ((ROW) * (A)->numCols + (COL));       \
                                                                  \
    for (_row = (ROW); _row < (A)->numRows; _row++)               \
    {                                                             \
        *_pDst++ = *_pSrc;                                        \
        _pSrc += (A)->numCols;                                    \
    }                                                             \
}
- #if defined(ARM_FLOAT16_SUPPORTED)
- #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
/**
 * @brief Swap rows i and j of a float16 matrix from column COL onwards
 *        (Helium/MVE version).
 *
 * @param A    Pointer to a matrix instance with float16_t pData and numCols.
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Columns are processed 8 lanes at a time; the predicate built by vctp16q
 * from the remaining element count gates the loads and stores.
 * The row indices are parenthesized and evaluated once, so expressions such
 * as "k + 1" address the intended row.
 */
#define SWAP_ROWS_F16(A,COL,i,j)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    float16_t *_pRowI = (A)->pData + (i) * _numCols;               \
    float16_t *_pRowJ = (A)->pData + (j) * _numCols;               \
    int _cnt = _numCols - (COL);                                   \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w += 8)                       \
    {                                                              \
        mve_pred16_t _p0 = vctp16q(_cnt);                          \
        f16x8_t _va = vldrhq_z_f16(&_pRowI[_w], _p0);              \
        f16x8_t _vb = vldrhq_z_f16(&_pRowJ[_w], _p0);              \
                                                                   \
        vstrhq_p(&_pRowI[_w], _vb, _p0);                           \
        vstrhq_p(&_pRowJ[_w], _va, _p0);                           \
                                                                   \
        _cnt -= 8;                                                 \
    }                                                              \
}
/**
 * @brief Multiply row i of a float16 matrix by v from column COL onwards
 *        (Helium/MVE version).
 *
 * @param A    Pointer to a matrix instance with float16_t pData and numCols.
 * @param COL  First column to scale.
 * @param v    Scale factor; converted to _Float16 as a whole expression.
 * @param i    Row index.
 *
 * Columns are processed 8 lanes at a time under a vctp16q predicate built
 * from the remaining element count. i is parenthesized and evaluated once.
 */
#define SCALE_ROW_F16(A,COL,v,i)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    float16_t *_pRow = (A)->pData + (i) * _numCols;                \
    int _cnt = _numCols - (COL);                                   \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w += 8)                       \
    {                                                              \
        mve_pred16_t _p0 = vctp16q(_cnt);                          \
        f16x8_t _va = vldrhq_z_f16(&_pRow[_w], _p0);               \
                                                                   \
        _va = vmulq_n_f16(_va, (_Float16)(v));                     \
        vstrhq_p(&_pRow[_w], _va, _p0);                            \
        _cnt -= 8;                                                 \
    }                                                              \
}
/**
 * @brief Row multiply-accumulate, A[i][c] += v * B[j][c] for c >= COL
 *        (float16, Helium/MVE version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Columns are processed 8 lanes at a time under a vctp16q predicate.
 * i and j are parenthesized and evaluated once.
 */
#define MAC_ROW_F16(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    float16_t *_pRowA = (A)->pData + (i) * _numCols;               \
    float16_t *_pRowB = (B)->pData + (j) * _numCols;               \
    int _cnt = _numCols - (COL);                                   \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w += 8)                       \
    {                                                              \
        mve_pred16_t _p0 = vctp16q(_cnt);                          \
        f16x8_t _va = vldrhq_z_f16(&_pRowA[_w], _p0);              \
        f16x8_t _vb = vldrhq_z_f16(&_pRowB[_w], _p0);              \
                                                                   \
        _va = vfmaq_n_f16(_va, _vb, (v));                          \
        vstrhq_p(&_pRowA[_w], _va, _p0);                           \
        _cnt -= 8;                                                 \
    }                                                              \
}
/**
 * @brief Row multiply-subtract, A[i][c] -= v * B[j][c] for c >= COL
 *        (float16, Helium/MVE version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier, broadcast once into a vector.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Columns are processed 8 lanes at a time under a vctp16q predicate.
 * i and j are parenthesized and evaluated once.
 */
#define MAS_ROW_F16(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    float16_t *_pRowA = (A)->pData + (i) * _numCols;               \
    float16_t *_pRowB = (B)->pData + (j) * _numCols;               \
    const f16x8_t _vec = vdupq_n_f16((v));                         \
    int _cnt = _numCols - (COL);                                   \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w += 8)                       \
    {                                                              \
        mve_pred16_t _p0 = vctp16q(_cnt);                          \
        f16x8_t _va = vldrhq_z_f16(&_pRowA[_w], _p0);              \
        f16x8_t _vb = vldrhq_z_f16(&_pRowB[_w], _p0);              \
                                                                   \
        _va = vfmsq_f16(_va, _vb, _vec);                           \
        vstrhq_p(&_pRowA[_w], _va, _p0);                           \
        _cnt -= 8;                                                 \
    }                                                              \
}
- #else
/**
 * @brief Swap rows i and j of a float16 matrix from column COL onwards
 *        (scalar version).
 *
 * @param A    Pointer to a matrix instance with float16_t pData and numCols.
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Every argument is parenthesized, so expressions such as "k + 1" address
 * the intended row.
 */
#define SWAP_ROWS_F16(A,COL,i,j)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float16_t *_pI = (A)->pData + ((i) * _numCols + (COL));        \
    float16_t *_pJ = (A)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        const float16_t _tmp = *_pI;                               \
        *_pI++ = *_pJ;                                             \
        *_pJ++ = _tmp;                                             \
    }                                                              \
}
/**
 * @brief Multiply row i of a float16 matrix by v from column COL onwards
 *        (scalar version).
 *
 * @param A    Pointer to a matrix instance with float16_t pData and numCols.
 * @param COL  First column to scale.
 * @param v    Scale factor; the whole expression is converted to _Float16
 *             (evaluated once per element).
 * @param i    Row index.
 *
 * Every argument is parenthesized, so expressions such as "k + 1" or
 * "a + b" keep their intended meaning.
 */
#define SCALE_ROW_F16(A,COL,v,i)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float16_t *_pRow = (A)->pData + ((i) * _numCols + (COL));      \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pRow++ *= (_Float16)(v);                                 \
    }                                                              \
}
/**
 * @brief Row multiply-accumulate, A[i][c] += v * B[j][c] for c >= COL
 *        (float16, scalar version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier; the whole expression is converted to
 *             _Float16 (evaluated once per element).
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Every argument is parenthesized so expression arguments are safe.
 */
#define MAC_ROW_F16(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float16_t *_pA = (A)->pData + ((i) * _numCols + (COL));        \
    float16_t *_pB = (B)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pA++ += (_Float16)(v) * (_Float16)*_pB++;                \
    }                                                              \
}
/**
 * @brief Row multiply-subtract, A[i][c] -= v * B[j][c] for c >= COL
 *        (float16, scalar version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier; the whole expression is converted to
 *             _Float16 (evaluated once per element).
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Every argument is parenthesized so expression arguments are safe.
 */
#define MAS_ROW_F16(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float16_t *_pA = (A)->pData + ((i) * _numCols + (COL));        \
    float16_t *_pB = (B)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pA++ -= (_Float16)(v) * (_Float16)*_pB++;                \
    }                                                              \
}
- #endif /*defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/
- /* Functions with only a scalar version */
/**
 * @brief float16 instantiation of COPY_COL_T: copy column COL of A, from row
 *        ROW to the last row, into the buffer DST.
 */
#define COPY_COL_F16(A,ROW,COL,DST) COPY_COL_T(float16_t, A, ROW, COL, DST)
/**
 * @brief float16 instantiation of SCALE_COL_T: multiply column i of A by v,
 *        from row ROW to the last row, casting v to _Float16.
 */
#define SCALE_COL_F16(A,ROW,v,i) SCALE_COL_T(float16_t, (_Float16), A, ROW, v, i)
-
- #endif /* defined(ARM_FLOAT16_SUPPORTED)*/
- #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
/**
 * @brief Swap rows i and j of a float32 matrix from column COL onwards
 *        (Helium/MVE version).
 *
 * @param A    Pointer to a matrix instance with float32_t pData and numCols.
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Columns are processed 4 lanes at a time; the predicate built by vctp32q
 * from the remaining element count gates the loads and stores.
 * The row indices are parenthesized and evaluated once, so expressions such
 * as "k + 1" address the intended row.
 */
#define SWAP_ROWS_F32(A,COL,i,j)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    float32_t *_pRowI = (A)->pData + (i) * _numCols;               \
    float32_t *_pRowJ = (A)->pData + (j) * _numCols;               \
    int _cnt = _numCols - (COL);                                   \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w += 4)                       \
    {                                                              \
        mve_pred16_t _p0 = vctp32q(_cnt);                          \
        f32x4_t _va = vldrwq_z_f32(&_pRowI[_w], _p0);              \
        f32x4_t _vb = vldrwq_z_f32(&_pRowJ[_w], _p0);              \
                                                                   \
        vstrwq_p(&_pRowI[_w], _vb, _p0);                           \
        vstrwq_p(&_pRowJ[_w], _va, _p0);                           \
                                                                   \
        _cnt -= 4;                                                 \
    }                                                              \
}
/**
 * @brief Row multiply-accumulate, A[i][c] += v * B[j][c] for c >= COL
 *        (float32, Helium/MVE version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Columns are processed 4 lanes at a time under a vctp32q predicate.
 * i and j are parenthesized and evaluated once.
 */
#define MAC_ROW_F32(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    float32_t *_pRowA = (A)->pData + (i) * _numCols;               \
    float32_t *_pRowB = (B)->pData + (j) * _numCols;               \
    int _cnt = _numCols - (COL);                                   \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w += 4)                       \
    {                                                              \
        mve_pred16_t _p0 = vctp32q(_cnt);                          \
        f32x4_t _va = vldrwq_z_f32(&_pRowA[_w], _p0);              \
        f32x4_t _vb = vldrwq_z_f32(&_pRowB[_w], _p0);              \
                                                                   \
        _va = vfmaq_n_f32(_va, _vb, (v));                          \
        vstrwq_p(&_pRowA[_w], _va, _p0);                           \
        _cnt -= 4;                                                 \
    }                                                              \
}
/**
 * @brief Row multiply-subtract, A[i][c] -= v * B[j][c] for c >= COL
 *        (float32, Helium/MVE version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier, broadcast once into a vector.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Columns are processed 4 lanes at a time under a vctp32q predicate.
 * i and j are parenthesized and evaluated once.
 */
#define MAS_ROW_F32(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    float32_t *_pRowA = (A)->pData + (i) * _numCols;               \
    float32_t *_pRowB = (B)->pData + (j) * _numCols;               \
    const f32x4_t _vec = vdupq_n_f32((v));                         \
    int _cnt = _numCols - (COL);                                   \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w += 4)                       \
    {                                                              \
        mve_pred16_t _p0 = vctp32q(_cnt);                          \
        f32x4_t _va = vldrwq_z_f32(&_pRowA[_w], _p0);              \
        f32x4_t _vb = vldrwq_z_f32(&_pRowB[_w], _p0);              \
                                                                   \
        _va = vfmsq_f32(_va, _vb, _vec);                           \
        vstrwq_p(&_pRowA[_w], _va, _p0);                           \
        _cnt -= 4;                                                 \
    }                                                              \
}
/**
 * @brief Multiply row i of a float32 matrix by v from column COL onwards
 *        (Helium/MVE version).
 *
 * @param A    Pointer to a matrix instance with float32_t pData and numCols.
 * @param COL  First column to scale.
 * @param v    Scale factor.
 * @param i    Row index.
 *
 * Columns are processed 4 lanes at a time under a vctp32q predicate built
 * from the remaining element count. i is parenthesized and evaluated once.
 */
#define SCALE_ROW_F32(A,COL,v,i)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    float32_t *_pRow = (A)->pData + (i) * _numCols;                \
    int _cnt = _numCols - (COL);                                   \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w += 4)                       \
    {                                                              \
        mve_pred16_t _p0 = vctp32q(_cnt);                          \
        f32x4_t _va = vldrwq_z_f32(&_pRow[_w], _p0);               \
                                                                   \
        _va = vmulq_n_f32(_va, (v));                               \
        vstrwq_p(&_pRow[_w], _va, _p0);                            \
        _cnt -= 4;                                                 \
    }                                                              \
}
- #elif defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
/**
 * @brief Swap rows i and j of a float32 matrix from column COL onwards
 *        (variant selected for Neon builds; the swap itself is scalar).
 *
 * @param A    Pointer to a matrix instance with float32_t pData and numCols.
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Every argument is parenthesized, so expressions such as "k + 1" address
 * the intended row and column.
 */
#define SWAP_ROWS_F32(A,COL,i,j)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float32_t *_pI = (A)->pData + ((i) * _numCols + (COL));        \
    float32_t *_pJ = (A)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        const float32_t _tmp = *_pI;                               \
        *_pI++ = *_pJ;                                             \
        *_pJ++ = _tmp;                                             \
    }                                                              \
}
/**
 * @brief Row multiply-accumulate, A[i][c] += v * B[j][c] for c >= COL
 *        (float32, Neon version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Full groups of 4 columns go through the vector loop; the remaining
 * 0..3 columns are handled by the scalar tail.
 *
 * The Neon load/store intrinsics take no predicate argument: the stray
 * "p0" operand (an undeclared identifier carried over from the MVE code)
 * has been removed so that the macro compiles. All arguments are
 * parenthesized.
 */
#define MAC_ROW_F32(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float32_t *_pA = (A)->pData + ((i) * _numCols + (COL));        \
    float32_t *_pB = (B)->pData + ((j) * _numCols + (COL));        \
    const f32x4_t _vec = vdupq_n_f32((v));                         \
    int32_t _blk;                                                  \
                                                                   \
    for (_blk = _nb >> 2; _blk > 0; _blk--)                        \
    {                                                              \
        f32x4_t _va = vld1q_f32(_pA);                              \
        const f32x4_t _vb = vld1q_f32(_pB);                        \
                                                                   \
        _va = vmlaq_f32(_va, _vb, _vec);                           \
        vst1q_f32(_pA, _va);                                       \
        _pA += 4;                                                  \
        _pB += 4;                                                  \
    }                                                              \
                                                                   \
    for (_blk = _nb & 3; _blk > 0; _blk--)                         \
    {                                                              \
        *_pA++ += (v) * *_pB++;                                    \
    }                                                              \
}
/**
 * @brief Row multiply-subtract, A[i][c] -= v * B[j][c] for c >= COL
 *        (float32, Neon version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier.
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Full groups of 4 columns go through the vector loop; the remaining
 * 0..3 columns are handled by the scalar tail.
 * All arguments are parenthesized so expression arguments are safe.
 */
#define MAS_ROW_F32(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float32_t *_pA = (A)->pData + ((i) * _numCols + (COL));        \
    float32_t *_pB = (B)->pData + ((j) * _numCols + (COL));        \
    const f32x4_t _vec = vdupq_n_f32((v));                         \
    int32_t _blk;                                                  \
                                                                   \
    for (_blk = _nb >> 2; _blk > 0; _blk--)                        \
    {                                                              \
        f32x4_t _va = vld1q_f32(_pA);                              \
        const f32x4_t _vb = vld1q_f32(_pB);                        \
                                                                   \
        _va = vmlsq_f32(_va, _vb, _vec);                           \
        vst1q_f32(_pA, _va);                                       \
        _pA += 4;                                                  \
        _pB += 4;                                                  \
    }                                                              \
                                                                   \
    for (_blk = _nb & 3; _blk > 0; _blk--)                         \
    {                                                              \
        *_pA++ -= (v) * *_pB++;                                    \
    }                                                              \
}
/**
 * @brief Multiply row i of a float32 matrix by v from column COL onwards
 *        (Neon version).
 *
 * @param A    Pointer to a matrix instance with float32_t pData and numCols.
 * @param COL  First column to scale.
 * @param v    Scale factor.
 * @param i    Row index.
 *
 * Full groups of 4 columns go through the vector loop; the remaining
 * 0..3 columns are handled by the scalar tail.
 * All arguments are parenthesized so expression arguments are safe.
 */
#define SCALE_ROW_F32(A,COL,v,i)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float32_t *_pRow = (A)->pData + ((i) * _numCols + (COL));      \
    const f32x4_t _vec = vdupq_n_f32((v));                         \
    int32_t _blk;                                                  \
                                                                   \
    for (_blk = _nb >> 2; _blk > 0; _blk--)                        \
    {                                                              \
        f32x4_t _va = vld1q_f32(_pRow);                            \
                                                                   \
        _va = vmulq_f32(_va, _vec);                                \
        vst1q_f32(_pRow, _va);                                     \
        _pRow += 4;                                                \
    }                                                              \
                                                                   \
    for (_blk = _nb & 3; _blk > 0; _blk--)                         \
    {                                                              \
        *_pRow++ *= (v);                                           \
    }                                                              \
}
- #else
/**
 * @brief Swap rows i and j of a float32 matrix from column COL onwards
 *        (scalar version).
 *
 * @param A    Pointer to a matrix instance with float32_t pData and numCols.
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Every argument is parenthesized, so expressions such as "k + 1" address
 * the intended row and column.
 */
#define SWAP_ROWS_F32(A,COL,i,j)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float32_t *_pI = (A)->pData + ((i) * _numCols + (COL));        \
    float32_t *_pJ = (A)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        const float32_t _tmp = *_pI;                               \
        *_pI++ = *_pJ;                                             \
        *_pJ++ = _tmp;                                             \
    }                                                              \
}
/**
 * @brief Multiply row i of a float32 matrix by v from column COL onwards
 *        (scalar version).
 *
 * @param A    Pointer to a matrix instance with float32_t pData and numCols.
 * @param COL  First column to scale.
 * @param v    Scale factor (evaluated once per element).
 * @param i    Row index.
 *
 * Every argument is parenthesized, so expressions such as "k + 1" address
 * the intended row and column.
 */
#define SCALE_ROW_F32(A,COL,v,i)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float32_t *_pRow = (A)->pData + ((i) * _numCols + (COL));      \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pRow++ *= (v);                                           \
    }                                                              \
}
/**
 * @brief Row multiply-accumulate, A[i][c] += v * B[j][c] for c >= COL
 *        (float32, scalar version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier (evaluated once per element).
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Every argument is parenthesized, so "a + b" as v multiplies the whole sum
 * and "k + 1" as a row index addresses the intended row.
 */
#define MAC_ROW_F32(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float32_t *_pA = (A)->pData + ((i) * _numCols + (COL));        \
    float32_t *_pB = (B)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pA++ += (v) * *_pB++;                                    \
    }                                                              \
}
/**
 * @brief Row multiply-subtract, A[i][c] -= v * B[j][c] for c >= COL
 *        (float32, scalar version).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier (evaluated once per element).
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Every argument is parenthesized, so "a + b" as v multiplies the whole sum
 * and "k + 1" as a row index addresses the intended row.
 */
#define MAS_ROW_F32(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float32_t *_pA = (A)->pData + ((i) * _numCols + (COL));        \
    float32_t *_pB = (B)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pA++ -= (v) * *_pB++;                                    \
    }                                                              \
}
- #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
/* Functions with only a scalar version */
/**
 * @brief float32 instantiation of COPY_COL_T: copy column COL of A, from row
 *        ROW to the last row, into the buffer DST.
 */
#define COPY_COL_F32(A,ROW,COL,DST) COPY_COL_T(float32_t, A, ROW, COL, DST)
/**
 * @brief float64 instantiation of COPY_COL_T: copy column COL of A, from row
 *        ROW to the last row, into the buffer DST.
 */
#define COPY_COL_F64(A,ROW,COL,DST) COPY_COL_T(float64_t, A, ROW, COL, DST)
/**
 * @brief Swap columns i and j of a float32 matrix, starting at row COL.
 *
 * @param A    Pointer to a matrix instance with float32_t pData and numCols.
 * @param COL  Index of the first row whose two elements are exchanged
 *             (despite its name it is used as a row index).
 * @param i    Index of the first column.
 * @param j    Index of the second column.
 *
 * The column indices are parenthesized and read once, so expressions with
 * low-precedence operators (e.g. "c ? 0 : 1") select the intended column.
 *
 * NOTE(review): the row loop is bounded by numCols, not numRows, so the
 * whole column is covered only when the matrix is square - confirm that all
 * callers pass square matrices.
 */
#define SWAP_COLS_F32(A,COL,i,j)                                   \
{                                                                  \
    float32_t *_pData = (A)->pData;                                \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _colI = (i);                                     \
    const int32_t _colJ = (j);                                     \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w++)                          \
    {                                                              \
        float32_t *_pRow = _pData + _w * _numCols;                 \
        const float32_t _tmp = _pRow[_colI];                       \
                                                                   \
        _pRow[_colI] = _pRow[_colJ];                               \
        _pRow[_colJ] = _tmp;                                       \
    }                                                              \
}
/**
 * @brief float32 instantiation of SCALE_COL_T: multiply column i of A by v,
 *        from row ROW to the last row. No cast is applied to v (the CAST
 *        argument is intentionally left empty).
 */
#define SCALE_COL_F32(A,ROW,v,i) SCALE_COL_T(float32_t, , A, ROW, v, i)
/**
 * @brief Swap rows i and j of a float64 matrix from column COL onwards.
 *
 * @param A    Pointer to a matrix instance with float64_t pData and numCols.
 * @param COL  First column to swap.
 * @param i    Index of the first row.
 * @param j    Index of the second row.
 *
 * Every argument is parenthesized, so expressions such as "k + 1" address
 * the intended row.
 */
#define SWAP_ROWS_F64(A,COL,i,j)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float64_t *_pI = (A)->pData + ((i) * _numCols + (COL));        \
    float64_t *_pJ = (A)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        const float64_t _tmp = *_pI;                               \
        *_pI++ = *_pJ;                                             \
        *_pJ++ = _tmp;                                             \
    }                                                              \
}
/**
 * @brief Swap columns i and j of a float64 matrix, starting at row COL.
 *
 * @param A    Pointer to a matrix instance with float64_t pData and numCols.
 * @param COL  Index of the first row whose two elements are exchanged
 *             (despite its name it is used as a row index).
 * @param i    Index of the first column.
 * @param j    Index of the second column.
 *
 * The column indices are parenthesized and read once, so expressions with
 * low-precedence operators (e.g. "c ? 0 : 1") select the intended column.
 *
 * NOTE(review): the row loop is bounded by numCols, not numRows, so the
 * whole column is covered only when the matrix is square - confirm that all
 * callers pass square matrices.
 */
#define SWAP_COLS_F64(A,COL,i,j)                                   \
{                                                                  \
    float64_t *_pData = (A)->pData;                                \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _colI = (i);                                     \
    const int32_t _colJ = (j);                                     \
    int32_t _w;                                                    \
                                                                   \
    for (_w = (COL); _w < _numCols; _w++)                          \
    {                                                              \
        float64_t *_pRow = _pData + _w * _numCols;                 \
        const float64_t _tmp = _pRow[_colI];                       \
                                                                   \
        _pRow[_colI] = _pRow[_colJ];                               \
        _pRow[_colJ] = _tmp;                                       \
    }                                                              \
}
/**
 * @brief Multiply row i of a float64 matrix by v from column COL onwards.
 *
 * @param A    Pointer to a matrix instance with float64_t pData and numCols.
 * @param COL  First column to scale.
 * @param v    Scale factor (evaluated once per element).
 * @param i    Row index.
 *
 * Every argument is parenthesized, so expressions such as "k + 1" address
 * the intended row.
 */
#define SCALE_ROW_F64(A,COL,v,i)                                   \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float64_t *_pRow = (A)->pData + ((i) * _numCols + (COL));      \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pRow++ *= (v);                                           \
    }                                                              \
}
/**
 * @brief float64 instantiation of SCALE_COL_T: multiply column i of A by v,
 *        from row ROW to the last row. No cast is applied to v (the CAST
 *        argument is intentionally left empty).
 */
#define SCALE_COL_F64(A,ROW,v,i) SCALE_COL_T(float64_t, , A, ROW, v, i)
/**
 * @brief Row multiply-accumulate, A[i][c] += v * B[j][c] for c >= COL
 *        (float64).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier (evaluated once per element).
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Every argument is parenthesized, so "a + b" as v multiplies the whole sum
 * and "k + 1" as a row index addresses the intended row.
 */
#define MAC_ROW_F64(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float64_t *_pA = (A)->pData + ((i) * _numCols + (COL));        \
    float64_t *_pB = (B)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pA++ += (v) * *_pB++;                                    \
    }                                                              \
}
/**
 * @brief Row multiply-subtract, A[i][c] -= v * B[j][c] for c >= COL
 *        (float64).
 *
 * @param COL  First column to update.
 * @param A    Pointer to the destination matrix instance.
 * @param i    Row of A that is updated.
 * @param v    Scalar multiplier (evaluated once per element).
 * @param B    Pointer to the source matrix instance.
 * @param j    Row of B that is read.
 *
 * Row offsets in both matrices are computed with A's column count.
 * Every argument is parenthesized, so "a + b" as v multiplies the whole sum
 * and "k + 1" as a row index addresses the intended row.
 */
#define MAS_ROW_F64(COL,A,i,v,B,j)                                 \
{                                                                  \
    const int32_t _numCols = (A)->numCols;                         \
    const int32_t _nb = _numCols - (COL);                          \
    float64_t *_pA = (A)->pData + ((i) * _numCols + (COL));        \
    float64_t *_pB = (B)->pData + ((j) * _numCols + (COL));        \
    int32_t _w;                                                    \
                                                                   \
    for (_w = 0; _w < _nb; _w++)                                   \
    {                                                              \
        *_pA++ -= (v) * *_pB++;                                    \
    }                                                              \
}
- #ifdef __cplusplus
- }
- #endif
- #endif /* ifndef _MATRIX_UTILS_H_ */
|