/*
 * Copyright (C) 2010-2021 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nnfunctions.h
 * Description:  Public header file for CMSIS NN Library
 *
 * $Date:        19 March 2021
 * $Revision:    V.7.0.0
 *
 * Target Processor: Cortex-M CPUs
 * -------------------------------------------------------------------- */
/**
 * \mainpage CMSIS NN Software Library
 *
 * Introduction
 * ------------
 *
 * This user manual describes the CMSIS NN software library,
 * a collection of efficient neural network kernels developed to maximize the
 * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
 *
 * The library is divided into a number of functions each covering a specific category:
 * - Convolution Functions
 * - Activation Functions
 * - Fully-connected Layer Functions
 * - SVDF Layer Functions
 * - Pooling Functions
 * - Softmax Functions
 * - Basic math Functions
 *
 * The library has separate functions for operating on different weight and activation data
 * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
 * kernels is included in the function description. The implementation details are also
 * described in this paper [1].
 *
 * Function Classification
 * --------
 * The functions can be classified into two segments:
 * - Legacy functions supporting ARM's internal symmetric quantization (8 bits).
 * - Functions that support the TensorFlow Lite framework with symmetric quantization (8 bits).
 *
 * The legacy functions can be identified by their _q7 or _q15 suffix; no new development is done there.
 * The article in [2] describes in detail how to run a network using the legacy functions.
 *
 * The functions supporting the TensorFlow Lite framework are identified by the _s8 suffix and can be invoked from TFL
 * micro. The functions are bit exact to TensorFlow Lite. Refer to TensorFlow's documentation in [3] on how to run
 * a TensorFlow Lite model using optimized CMSIS-NN kernels.
 *
 * Block Diagram
 * --------
 * \image html CMSIS-NN-OVERVIEW.PNG
 *
 * Examples
 * --------
 *
 * The library ships with a number of examples which demonstrate how to use the library functions.
 *
 * Pre-processor Macros
 * ------------
 *
 * Each library project has different pre-processor macros.
 *
 * - ARM_MATH_DSP:
 *
 * Define macro ARM_MATH_DSP if the silicon supports DSP instructions (DSP extension).
 *
 * - ARM_MATH_MVEI:
 *
 * Define macro ARM_MATH_MVEI if the silicon supports the M-Profile Vector Extension.
 *
 * - ARM_MATH_AUTOVECTORIZE:
 *
 * Used in conjunction with ARM_MATH_MVEI to let the compiler auto-vectorize the functions that use inline
 * assembly. It does not affect functions that use C or intrinsics.
 *
 * - ARM_MATH_BIG_ENDIAN:
 *
 * Define macro ARM_MATH_BIG_ENDIAN to build the library for big-endian targets. This is supported only for the legacy
 * functions, i.e., functions targeted at TensorFlow Lite do not support big endianness. By default the library builds
 * for little-endian targets.
 *
 * - ARM_NN_TRUNCATE:
 *
 * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
 *
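 * For instance, with out_shift = 8 the legacy kernels requantize an accumulator
 * roughly as follows (a sketch of the two rounding modes, not the exact library code):
 * \code
 * out = acc >> 8;              // ARM_NN_TRUNCATE defined: floor
 * out = (acc + (1 << 7)) >> 8; // default: round to nearest
 * \endcode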
 *
 * Copyright Notice
 * ------------
 *
 * Copyright (C) 2010-2019 Arm Limited. All rights reserved.
 *
 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
 *
 * [2] Converting a Neural Network for Arm Cortex-M with CMSIS-NN
 *     https://developer.arm.com/solutions/machine-learning-on-arm/developer-material/how-to-guides/converting-a-neural-network-for-arm-cortex-m-with-cmsis-nn/single-page
 *
 * [3] https://www.tensorflow.org/lite/microcontrollers/library
 *
 * [4] https://github.com/ARM-software/CMSIS_5/tree/develop/CMSIS/NN#legacy-vs-tfl-micro-compliant-apis
 */

/**
 * @defgroup groupNN Neural Network Functions
 * A collection of functions to perform basic operations for neural network layers. Functions with a _s8 suffix support
 * TensorFlow Lite framework.
 */

#ifndef _ARM_NNFUNCTIONS_H
#define _ARM_NNFUNCTIONS_H

#include "arm_math_types.h"
#include "arm_nn_types.h"

#define USE_INTRINSIC

//#define ARM_NN_TRUNCATE /* This configures the rounding mode: floor or round to the nearest int */

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief Struct for specifying activation function types
 *
 */
typedef enum
{
    ARM_SIGMOID = 0, /**< Sigmoid activation function */
    ARM_TANH = 1,    /**< Tanh activation function */
} arm_nn_activation_type;

/**
 * @defgroup NNConv Convolution Functions
 *
 * Collection of convolution, depthwise convolution functions and their variants.
 *
 * The convolution is implemented in 2 steps: im2col and GEMM.
 *
 * im2col is a process of converting each patch of image data into
 * a column. After im2col, the convolution is computed as matrix-matrix
 * multiplication.
 *
 * To reduce the memory footprint, the im2col is performed partially.
 * In each iteration, only a few columns (i.e., patches) are generated and
 * computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
 *
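 * A minimal sketch of the im2col step for a single patch (illustrative
 * pseudo-C; the variable names are assumptions, not the library's internals):
 * \code
 * // Copy one (kh x kw x ch) input patch, anchored at (y, x), into a
 * // contiguous column buffer. The kernels do this for a few patches at a
 * // time and then multiply the columns with the filter matrix (GEMM).
 * for (int ky = 0; ky < kh; ky++)
 *     for (int kx = 0; kx < kw; kx++)
 *         for (int c = 0; c < ch; c++)
 *             *col++ = input[((y + ky) * in_w + (x + kx)) * ch + c];
 * \endcode
 *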
 */

/**
 * @brief s8 convolution layer wrapper function with the main purpose to call the optimal kernel available in
 *        cmsis-nn to perform the convolution.
 *
 * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_wrapper_s8_get_buffer_size will return the buffer_size if required.
 * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                               Range of conv_params->input_offset  : [-127, 128]
 *                               Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-channel quantization info.
 *                               It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data    Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
 *                               spatial filter dimensions
 * @param[in]      filter_data   Filter data pointer. Data type: int8
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer. Data type: int8
 *
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail, or
 *         <code>ARM_MATH_SUCCESS</code> on successful completion.
 *
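 * Example (a minimal sketch; the parameter and dimension structs are assumed
 * to be filled in beforehand, and the scratch buffer may come from any allocator):
 * \code
 * cmsis_nn_context ctx;
 * ctx.size = arm_convolve_wrapper_s8_get_buffer_size(&conv_params, &input_dims,
 *                                                    &filter_dims, &output_dims);
 * ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
 * arm_status status = arm_convolve_wrapper_s8(&ctx, &conv_params, &quant_params,
 *                                             &input_dims, input_data,
 *                                             &filter_dims, filter_data,
 *                                             &bias_dims, bias_data,
 *                                             &output_dims, output_data);
 * \endcode
 *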
 */
arm_status arm_convolve_wrapper_s8(const cmsis_nn_context *ctx,
                                   const cmsis_nn_conv_params *conv_params,
                                   const cmsis_nn_per_channel_quant_params *quant_params,
                                   const cmsis_nn_dims *input_dims,
                                   const q7_t *input_data,
                                   const cmsis_nn_dims *filter_dims,
                                   const q7_t *filter_data,
                                   const cmsis_nn_dims *bias_dims,
                                   const int32_t *bias_data,
                                   const cmsis_nn_dims *output_dims,
                                   q7_t *output_data);

/**
 * @brief Get the required buffer size for arm_convolve_wrapper_s8
 *
 * @param[in]    conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                             Range of conv_params->input_offset  : [-127, 128]
 *                             Range of conv_params->output_offset : [-128, 127]
 * @param[in]    input_dims    Input (activation) dimensions. Format: [N, H, W, C_IN]
 * @param[in]    filter_dims   Filter dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the spatial
 *                             filter dimensions
 * @param[in]    output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 *
 * @return The function returns the required buffer size (bytes)
 *
 */
int32_t arm_convolve_wrapper_s8_get_buffer_size(const cmsis_nn_conv_params *conv_params,
                                                const cmsis_nn_dims *input_dims,
                                                const cmsis_nn_dims *filter_dims,
                                                const cmsis_nn_dims *output_dims);

/**
 * @brief Basic s8 convolution function
 * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_s8_get_buffer_size will return the buffer_size if required.
 * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                               Range of conv_params->input_offset  : [-127, 128]
 *                               Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-channel quantization info.
 *                               It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data    Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK are the
 *                               spatial filter dimensions
 * @param[in]      filter_data   Filter data pointer. Data type: int8
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Optional bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer. Data type: int8
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as the data type even though it is s8 data. This is done to be consistent with existing APIs.
 *    3. Additional memory is required for optimization. Refer to the argument 'ctx' for details.
 *
 */
arm_status arm_convolve_s8(const cmsis_nn_context *ctx,
                           const cmsis_nn_conv_params *conv_params,
                           const cmsis_nn_per_channel_quant_params *quant_params,
                           const cmsis_nn_dims *input_dims,
                           const q7_t *input_data,
                           const cmsis_nn_dims *filter_dims,
                           const q7_t *filter_data,
                           const cmsis_nn_dims *bias_dims,
                           const int32_t *bias_data,
                           const cmsis_nn_dims *output_dims,
                           q7_t *output_data);

/**
 * @brief Get the required buffer size for the s8 convolution function
 *
 * @param[in]    input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]    filter_dims   Filter tensor dimensions. Format: [C_OUT, HK, WK, C_IN] where HK and WK
 *                             are the spatial filter dimensions
 * @return The function returns the required buffer size (bytes)
 *
 */
int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);

/**
 * @brief Basic Q7 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
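 * The legacy fixed-point convention, ignoring rounding (a worked sketch
 * with illustrative numbers, not library code):
 * \code
 * // acc = sum(input * weight) + (bias << bias_shift)
 * // out = saturate_to_q7(acc >> out_shift)
 * // e.g. with bias_shift = 2 and out_shift = 9:
 * //   acc = 4000 + (5 << 2) = 4020;  out = 4020 >> 9 = 7
 * \endcode
 *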
 */
arm_status arm_convolve_HWC_q7_basic(const q7_t *Im_in,
                                     const uint16_t dim_im_in,
                                     const uint16_t ch_im_in,
                                     const q7_t *wt,
                                     const uint16_t ch_im_out,
                                     const uint16_t dim_kernel,
                                     const uint16_t padding,
                                     const uint16_t stride,
                                     const q7_t *bias,
                                     const uint16_t bias_shift,
                                     const uint16_t out_shift,
                                     q7_t *Im_out,
                                     const uint16_t dim_im_out,
                                     q15_t *bufferA,
                                     q7_t *bufferB);

/**
 * @brief Basic Q7 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 */
arm_status arm_convolve_HWC_q7_basic_nonsquare(const q7_t *Im_in,
                                               const uint16_t dim_im_in_x,
                                               const uint16_t dim_im_in_y,
                                               const uint16_t ch_im_in,
                                               const q7_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel_x,
                                               const uint16_t dim_kernel_y,
                                               const uint16_t padding_x,
                                               const uint16_t padding_y,
                                               const uint16_t stride_x,
                                               const uint16_t stride_y,
                                               const q7_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q7_t *Im_out,
                                               const uint16_t dim_im_out_x,
                                               const uint16_t dim_im_out_y,
                                               q15_t *bufferA,
                                               q7_t *bufferB);

/**
 * @brief Basic Q15 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 */
arm_status arm_convolve_HWC_q15_basic(const q15_t *Im_in,
                                      const uint16_t dim_im_in,
                                      const uint16_t ch_im_in,
                                      const q15_t *wt,
                                      const uint16_t ch_im_out,
                                      const uint16_t dim_kernel,
                                      const uint16_t padding,
                                      const uint16_t stride,
                                      const q15_t *bias,
                                      const uint16_t bias_shift,
                                      const uint16_t out_shift,
                                      q15_t *Im_out,
                                      const uint16_t dim_im_out,
                                      q15_t *bufferA,
                                      q7_t *bufferB);

/**
 * @brief Fast Q7 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 4
 *   ch_im_out is a multiple of 2
 */
arm_status arm_convolve_HWC_q7_fast(const q7_t *Im_in,
                                    const uint16_t dim_im_in,
                                    const uint16_t ch_im_in,
                                    const q7_t *wt,
                                    const uint16_t ch_im_out,
                                    const uint16_t dim_kernel,
                                    const uint16_t padding,
                                    const uint16_t stride,
                                    const q7_t *bias,
                                    const uint16_t bias_shift,
                                    const uint16_t out_shift,
                                    q7_t *Im_out,
                                    const uint16_t dim_im_out,
                                    q15_t *bufferA,
                                    q7_t *bufferB);

/**
 * @brief Fast Q7 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 4
 *   ch_im_out is a multiple of 2
 */
arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                              const uint16_t dim_im_in_x,
                                              const uint16_t dim_im_in_y,
                                              const uint16_t ch_im_in,
                                              const q7_t *wt,
                                              const uint16_t ch_im_out,
                                              const uint16_t dim_kernel_x,
                                              const uint16_t dim_kernel_y,
                                              const uint16_t padding_x,
                                              const uint16_t padding_y,
                                              const uint16_t stride_x,
                                              const uint16_t stride_y,
                                              const q7_t *bias,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              q7_t *Im_out,
                                              const uint16_t dim_im_out_x,
                                              const uint16_t dim_im_out_y,
                                              q15_t *bufferA,
                                              q7_t *bufferB);

/**
 * @brief Fast Q7 version of 1x1 convolution (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail, or
 *         <code>ARM_MATH_SUCCESS</code> on successful completion.
 *
 * This function implements convolution with a 1x1 kernel size (i.e., dim_kernel_x=1
 * and dim_kernel_y=1). It can be used for
 * the second half of MobileNets after the depthwise separable convolution.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 4
 *   ch_im_out is a multiple of 2
 */
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t *Im_in,
                                                  const uint16_t dim_im_in_x,
                                                  const uint16_t dim_im_in_y,
                                                  const uint16_t ch_im_in,
                                                  const q7_t *wt,
                                                  const uint16_t ch_im_out,
                                                  const uint16_t dim_kernel_x,
                                                  const uint16_t dim_kernel_y,
                                                  const uint16_t padding_x,
                                                  const uint16_t padding_y,
                                                  const uint16_t stride_x,
                                                  const uint16_t stride_y,
                                                  const q7_t *bias,
                                                  const uint16_t bias_shift,
                                                  const uint16_t out_shift,
                                                  q7_t *Im_out,
                                                  const uint16_t dim_im_out_x,
                                                  const uint16_t dim_im_out_y,
                                                  q15_t *bufferA,
                                                  q7_t *bufferB);

/**
 * @brief Fast s8 version for 1x1 convolution (non-square shape)
 *
 * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_1x1_s8_fast_get_buffer_size will return the buffer_size if required.
 * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                               Range of conv_params->input_offset  : [-127, 128]
 *                               Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-channel quantization info.
 *                               It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data    Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, 1, C_IN]
 * @param[in]      filter_data   Filter data pointer. Data type: int8
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Optional bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer. Data type: int8
 *
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail, or
 *         <code>ARM_MATH_SUCCESS</code> on successful completion.
 *
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *     -# input_dims->c is a multiple of 4
 *     -# conv_params->padding.w = conv_params->padding.h = 0
 *     -# conv_params->stride.w = conv_params->stride.h = 1
 *
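 * Example constraint check before dispatching to this kernel (an illustrative
 * sketch; a caller would fall back to arm_convolve_s8() otherwise):
 * \code
 * if ((input_dims.c % 4 == 0) &&
 *     (conv_params.padding.w == 0) && (conv_params.padding.h == 0) &&
 *     (conv_params.stride.w == 1) && (conv_params.stride.h == 1))
 * {
 *     // arm_convolve_1x1_s8_fast() may be used
 * }
 * \endcode
 *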
 */
arm_status arm_convolve_1x1_s8_fast(const cmsis_nn_context *ctx,
                                    const cmsis_nn_conv_params *conv_params,
                                    const cmsis_nn_per_channel_quant_params *quant_params,
                                    const cmsis_nn_dims *input_dims,
                                    const q7_t *input_data,
                                    const cmsis_nn_dims *filter_dims,
                                    const q7_t *filter_data,
                                    const cmsis_nn_dims *bias_dims,
                                    const int32_t *bias_data,
                                    const cmsis_nn_dims *output_dims,
                                    q7_t *output_data);

/**
 * @brief Get the required buffer size for arm_convolve_1x1_s8_fast
 *
 * @param[in]    input_dims    Input (activation) dimensions
 * @return The function returns the required buffer size in bytes
 *
 */
int32_t arm_convolve_1x1_s8_fast_get_buffer_size(const cmsis_nn_dims *input_dims);

/**
 * @brief 1xn convolution
 *
 * @param[in, out] ctx           Function context that contains the additional buffer if required by the function.
 *                               arm_convolve_1_x_n_s8_get_buffer_size will return the buffer_size if required.
 * @param[in]      conv_params   Convolution parameters (e.g. strides, dilations, pads,...).
 *                               Range of conv_params->input_offset  : [-127, 128]
 *                               Range of conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-channel quantization info.
 *                               It contains the multiplier and shift values to be applied to each output channel
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]      input_data    Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the horizontal
 *                               spatial filter dimension
 * @param[in]      filter_data   Filter data pointer. Data type: int8
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data     Optional bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, H, W, C_OUT]
 * @param[out]     output_data   Output data pointer. Data type: int8
 *
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> if argument constraints fail, or
 *         <code>ARM_MATH_SUCCESS</code> on successful completion.
 *
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *     -# input_dims->n equals 1
 *     -# output_dims->w is a multiple of 4
 *     -# Explicit constraints (since it is for 1xN convolution)
 *        -# input_dims->h equals 1
 *        -# output_dims->h equals 1
 *        -# filter_dims->h equals 1
 * @todo Remove the constraint on output_dims->w to make the function generic.
 *
 */
arm_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
                                 const cmsis_nn_conv_params *conv_params,
                                 const cmsis_nn_per_channel_quant_params *quant_params,
                                 const cmsis_nn_dims *input_dims,
                                 const q7_t *input_data,
                                 const cmsis_nn_dims *filter_dims,
                                 const q7_t *filter_data,
                                 const cmsis_nn_dims *bias_dims,
                                 const int32_t *bias_data,
                                 const cmsis_nn_dims *output_dims,
                                 q7_t *output_data);

/**
 * @brief Get the required additional buffer size for 1xn convolution
 *
 * @param[in]    input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 * @param[in]    filter_dims   Filter tensor dimensions. Format: [C_OUT, 1, WK, C_IN] where WK is the
 *                             horizontal spatial filter dimension
 * @return The function returns the required buffer size (bytes)
 *
 */
int32_t arm_convolve_1_x_n_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);

/**
 * @brief Q7 version of convolution for RGB image
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This kernel is written exclusively for convolution with ch_im_in
 * equal to 3. This applies to the first layer of CNNs, where the input
 * image is in RGB format.
 */
arm_status arm_convolve_HWC_q7_RGB(const q7_t *Im_in,
                                   const uint16_t dim_im_in,
                                   const uint16_t ch_im_in,
                                   const q7_t *wt,
                                   const uint16_t ch_im_out,
                                   const uint16_t dim_kernel,
                                   const uint16_t padding,
                                   const uint16_t stride,
                                   const q7_t *bias,
                                   const uint16_t bias_shift,
                                   const uint16_t out_shift,
                                   q7_t *Im_out,
                                   const uint16_t dim_im_out,
                                   q15_t *bufferA,
                                   q7_t *bufferB);

/**
 * @brief Fast Q15 convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 2
 *   ch_im_out is a multiple of 2
 */
arm_status arm_convolve_HWC_q15_fast(const q15_t *Im_in,
                                     const uint16_t dim_im_in,
                                     const uint16_t ch_im_in,
                                     const q15_t *wt,
                                     const uint16_t ch_im_out,
                                     const uint16_t dim_kernel,
                                     const uint16_t padding,
                                     const uint16_t stride,
                                     const q15_t *bias,
                                     const uint16_t bias_shift,
                                     const uint16_t out_shift,
                                     q15_t *Im_out,
                                     const uint16_t dim_im_out,
                                     q15_t *bufferA,
                                     q7_t *bufferB);

/**
 * @brief Fast Q15 convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding size x
 * @param[in]       padding_y    padding size y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * @details
 *
 * <b>Buffer size:</b>
 *
 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
 *
 * bufferB size: 0
 *
 * <b>Input dimension constraints:</b>
 *
 * ch_im_in is a multiple of 2
 *
 * ch_im_out is a multiple of 2
 *
 */
arm_status arm_convolve_HWC_q15_fast_nonsquare(const q15_t *Im_in,
                                               const uint16_t dim_im_in_x,
                                               const uint16_t dim_im_in_y,
                                               const uint16_t ch_im_in,
                                               const q15_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel_x,
                                               const uint16_t dim_kernel_y,
                                               const uint16_t padding_x,
                                               const uint16_t padding_y,
                                               const uint16_t stride_x,
                                               const uint16_t stride_y,
                                               const q15_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q15_t *Im_out,
                                               const uint16_t dim_im_out_x,
                                               const uint16_t dim_im_out_y,
                                               q15_t *bufferA,
                                               q7_t *bufferB);

/**
 * @brief Q7 depthwise separable convolution function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       wt          pointer to kernel weights
 * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       bias        pointer to bias
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in,out]   Im_out      pointer to output tensor
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   bufferB     pointer to buffer space for output
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 2
 *   ch_im_out is a multiple of 2
 */
arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t *Im_in,
                                               const uint16_t dim_im_in,
                                               const uint16_t ch_im_in,
                                               const q7_t *wt,
                                               const uint16_t ch_im_out,
                                               const uint16_t dim_kernel,
                                               const uint16_t padding,
                                               const uint16_t stride,
                                               const q7_t *bias,
                                               const uint16_t bias_shift,
                                               const uint16_t out_shift,
                                               q7_t *Im_out,
                                               const uint16_t dim_im_out,
                                               q15_t *bufferA,
                                               q7_t *bufferB);

/**
 * @brief Q7 depthwise separable convolution function (non-square shape)
 * @param[in]       Im_in        pointer to input tensor
 * @param[in]       dim_im_in_x  input tensor dimension x
 * @param[in]       dim_im_in_y  input tensor dimension y
 * @param[in]       ch_im_in     number of input tensor channels
 * @param[in]       wt           pointer to kernel weights
 * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
 * @param[in]       dim_kernel_x filter kernel size x
 * @param[in]       dim_kernel_y filter kernel size y
 * @param[in]       padding_x    padding sizes x
 * @param[in]       padding_y    padding sizes y
 * @param[in]       stride_x     convolution stride x
 * @param[in]       stride_y     convolution stride y
 * @param[in]       bias         pointer to bias
 * @param[in]       bias_shift   amount of left-shift for bias
 * @param[in]       out_shift    amount of right-shift for output
 * @param[in,out]   Im_out       pointer to output tensor
 * @param[in]       dim_im_out_x output tensor dimension x
 * @param[in]       dim_im_out_y output tensor dimension y
 * @param[in,out]   bufferA      pointer to buffer space for input
 * @param[in,out]   bufferB      pointer to buffer space for output
 * @return The function returns either
 *         <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
 *
 * This function is the version with the full list of optimization tricks, but with
 * some constraints:
 *   ch_im_in is a multiple of 2
 *   ch_im_out is a multiple of 2
 */
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t *Im_in,
                                                         const uint16_t dim_im_in_x,
                                                         const uint16_t dim_im_in_y,
                                                         const uint16_t ch_im_in,
                                                         const q7_t *wt,
                                                         const uint16_t ch_im_out,
                                                         const uint16_t dim_kernel_x,
                                                         const uint16_t dim_kernel_y,
                                                         const uint16_t padding_x,
                                                         const uint16_t padding_y,
                                                         const uint16_t stride_x,
                                                         const uint16_t stride_y,
                                                         const q7_t *bias,
                                                         const uint16_t bias_shift,
                                                         const uint16_t out_shift,
                                                         q7_t *Im_out,
                                                         const uint16_t dim_im_out_x,
                                                         const uint16_t dim_im_out_y,
                                                         q15_t *bufferA,
                                                         q7_t *bufferB);

/**
 * @brief Wrapper function to pick the right optimized s8 depthwise convolution function
 *
 * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
 *                                 definition file to see if an additional buffer is required.
 *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if required.
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->dilation is not used.
 *                                 Range of dw_conv_params->input_offset  : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params    Per-channel quantization info.
 *                                 It contains the multiplier and shift values to be applied to each
 *                                 output channel
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                                 Batch argument N is not used and assumed to be 1.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      filter_data     Filter data pointer. Data type: int8
 * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data       Bias data pointer. Data type: int32
 * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in, out] output_data     Output data pointer. Data type: int8
 * @return The function returns
 *         <code>ARM_MATH_SUCCESS</code> - Successful completion.
 *
 * @details
 *   - Supported framework: TensorFlow Lite
 *   - Picks one of the following functions
 *     -# arm_depthwise_conv_s8()
 *     -# arm_depthwise_conv_3x3_s8() - Cortex-M CPUs with DSP extension only
 *     -# arm_depthwise_conv_s8_opt()
 *   - q7 is used as the data type even though it is s8 data. This is done to be consistent with existing APIs.
 *   - Check details of arm_depthwise_conv_s8_opt() for potential data that can be accessed outside of the
 *     boundary.
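 *
 * Example (a minimal sketch that mirrors the convolution wrapper usage; the
 * parameter structs are assumed to be filled in beforehand):
 * \code
 * cmsis_nn_context ctx;
 * ctx.size = arm_depthwise_conv_wrapper_s8_get_buffer_size(&dw_conv_params,
 *                                                          &input_dims,
 *                                                          &filter_dims,
 *                                                          &output_dims);
 * ctx.buf = (ctx.size > 0) ? malloc(ctx.size) : NULL;
 * (void)arm_depthwise_conv_wrapper_s8(&ctx, &dw_conv_params, &quant_params,
 *                                     &input_dims, input_data, &filter_dims,
 *                                     filter_data, &bias_dims, bias_data,
 *                                     &output_dims, output_data);
 * \endcode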
 */
arm_status arm_depthwise_conv_wrapper_s8(const cmsis_nn_context *ctx,
                                         const cmsis_nn_dw_conv_params *dw_conv_params,
                                         const cmsis_nn_per_channel_quant_params *quant_params,
                                         const cmsis_nn_dims *input_dims,
                                         const q7_t *input_data,
                                         const cmsis_nn_dims *filter_dims,
                                         const q7_t *filter_data,
                                         const cmsis_nn_dims *bias_dims,
                                         const int32_t *bias_data,
                                         const cmsis_nn_dims *output_dims,
                                         q7_t *output_data);

/**
 * @brief Get size of additional buffer required by arm_depthwise_conv_wrapper_s8()
 *
 * @param[in]   dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                              dw_conv_params->dilation is not used.
 *                              Range of dw_conv_params->input_offset  : [-127, 128]
 *                              Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]   input_dims      Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                              Batch argument N is not used and assumed to be 1.
 * @param[in]   filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]   output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
 * @return Size of additional memory required for optimizations in bytes.
 *
 */
int32_t arm_depthwise_conv_wrapper_s8_get_buffer_size(const cmsis_nn_dw_conv_params *dw_conv_params,
                                                      const cmsis_nn_dims *input_dims,
                                                      const cmsis_nn_dims *filter_dims,
                                                      const cmsis_nn_dims *output_dims);

/**
 * @brief Basic s8 depthwise convolution function that doesn't have any constraints on the input dimensions.
 *
 * @param[in, out] ctx             Function context (e.g. temporary buffer). Check the function
 *                                 definition file to see if an additional buffer is required.
 *                                 Optional function {API}_get_buffer_size() provides the buffer
 *                                 size if an additional buffer is required.
 * @param[in]      dw_conv_params  Depthwise convolution parameters (e.g. strides, dilations, pads,...)
 *                                 dw_conv_params->dilation is not used.
 *                                 Range of dw_conv_params->input_offset  : [-127, 128]
 *                                 Range of dw_conv_params->output_offset : [-128, 127]
 * @param[in]      quant_params    Per-channel quantization info.
 *                                 It contains the multiplier and shift values to be applied to each
 *                                 output channel
 * @param[in]      input_dims      Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
 *                                 Batch argument N is not used.
 * @param[in]      input_data      Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims     Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in]      filter_data     Filter data pointer. Data type: int8
 * @param[in]      bias_dims       Bias tensor dimensions. Format: [C_OUT]
 * @param[in]      bias_data       Bias data pointer. Data type: int32
 * @param[in]      output_dims     Output tensor dimensions. Format: [1, H, W, C_OUT]
 * @param[in, out] output_data     Output data pointer. Data type: int8
 * @return The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *   - Supported framework: TensorFlow Lite
 *   - q7 is used as the data type even though it is s8 data. This is done to be consistent with existing APIs.
 */
arm_status arm_depthwise_conv_s8(const cmsis_nn_context *ctx,
                                 const cmsis_nn_dw_conv_params *dw_conv_params,
                                 const cmsis_nn_per_channel_quant_params *quant_params,
                                 const cmsis_nn_dims *input_dims,
                                 const q7_t *input_data,
                                 const cmsis_nn_dims *filter_dims,
                                 const q7_t *filter_data,
                                 const cmsis_nn_dims *bias_dims,
                                 const int32_t *bias_data,
                                 const cmsis_nn_dims *output_dims,
                                 q7_t *output_data);

/**
 * @brief Optimized s8 depthwise convolution function for 3x3 kernel size with some constraints on
 *        the input arguments (documented below). Refer to arm_depthwise_conv_s8() for function
 *        argument details.
 *
 * @return The function returns one of the following
 *         <code>ARM_MATH_SIZE_MISMATCH</code> - Unsupported dimension of tensors
 *         <code>ARM_MATH_ARGUMENT_ERROR</code> - Unsupported pad size along the x axis
 *         <code>ARM_MATH_SUCCESS</code> - Successful operation
 *
 * @details
 *   - Supported framework : TensorFlow Lite Micro
 *   - The following constraints on the arguments apply
 *     -# Number of input channels equals number of output channels
 *     -# Filter height and width equal 3
 *     -# Padding along x is either 0 or 1.
 *
 */
arm_status arm_depthwise_conv_3x3_s8(const cmsis_nn_context *ctx,
                                     const cmsis_nn_dw_conv_params *dw_conv_params,
                                     const cmsis_nn_per_channel_quant_params *quant_params,
                                     const cmsis_nn_dims *input_dims,
                                     const q7_t *input_data,
                                     const cmsis_nn_dims *filter_dims,
                                     const q7_t *filter_data,
                                     const cmsis_nn_dims *bias_dims,
                                     const int32_t *bias_data,
                                     const cmsis_nn_dims *output_dims,
                                     q7_t *output_data);

/**
 * @brief Optimized s8 depthwise convolution function with the constraint that in_channel equals out_channel.
 *        Refer to arm_depthwise_conv_s8() for function argument details.
 *
 * @return The function returns one of the following
 *         <code>ARM_MATH_SIZE_MISMATCH</code> - input channel != output channel or
 *                                               ch_mult != 1
 *         <code>ARM_MATH_SUCCESS</code> - Successful operation
 *
 * @note If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read
 *       for the following if MVE optimizations (Arm Helium Technology) are used.
 *       - Output shift
 *       - Output multiplier
 *       - Output bias
 *       - kernel
 * @details
 *   - Supported framework: TensorFlow Lite
 *   - The following constraints on the arguments apply
 *     -# Number of input channels equals number of output channels or ch_mult equals 1
 *   - q7 is used as the data type even though it is s8 data. This is done to be consistent with existing APIs.
 *   - Recommended when the number of channels is 4 or greater.
 *
 */
arm_status arm_depthwise_conv_s8_opt(const cmsis_nn_context *ctx,
                                     const cmsis_nn_dw_conv_params *dw_conv_params,
                                     const cmsis_nn_per_channel_quant_params *quant_params,
                                     const cmsis_nn_dims *input_dims,
                                     const q7_t *input_data,
                                     const cmsis_nn_dims *filter_dims,
                                     const q7_t *filter_data,
                                     const cmsis_nn_dims *bias_dims,
                                     const int32_t *bias_data,
                                     const cmsis_nn_dims *output_dims,
                                     q7_t *output_data);

/**
 * @brief Get the required buffer size for the optimized s8 depthwise convolution
 *        function with the constraint that in_channel equals out_channel.
 * @param[in]    input_dims    Input (activation) tensor dimensions. Format: [1, H, W, C_IN]
 *                             Batch argument N is not used.
 * @param[in]    filter_dims   Filter tensor dimensions. Format: [1, H, W, C_OUT]
 * @return The function returns the required buffer size in bytes
 *
 */
int32_t arm_depthwise_conv_s8_opt_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims);

/**
 * @defgroup FC Fully-connected Layer Functions
 *
 * Collection of fully-connected and matrix multiplication functions.
 *
 * A fully-connected layer is basically a matrix-vector multiplication
 * with bias. The matrix is the weights and the input/output vectors
 * are the activation values. Supported {weight, activation} precisions
 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
 *
 * Here we have two types of kernel functions. The basic function
 * implements the function using a regular GEMV approach. The opt functions
 * operate with weights in interleaved formats.
 *
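 * A fully-connected layer as a matrix-vector multiplication (an illustrative
 * sketch only; the fixed-point shifts and saturation are omitted):
 * \code
 * for (int r = 0; r < num_of_rows; r++)
 * {
 *     int32_t acc = bias[r];
 *     for (int c = 0; c < dim_vec; c++)
 *     {
 *         acc += weights[r * dim_vec + c] * input[c];
 *     }
 *     output[r] = (q7_t)acc; // real kernels shift, round and saturate here
 * }
 * \endcode
 *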
 */
/**
 * @brief Q7 basic fully-connected layer function
 * @param[in]       pV          pointer to input vector
 * @param[in]       pM          pointer to matrix weights
 * @param[in]       dim_vec     length of the vector
 * @param[in]       num_of_rows number of rows in weight matrix
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        pointer to bias
 * @param[in,out]   pOut        pointer to output vector
 * @param[in,out]   vec_buffer  pointer to buffer space for input
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 */
arm_status arm_fully_connected_q7(const q7_t *pV,
                                  const q7_t *pM,
                                  const uint16_t dim_vec,
                                  const uint16_t num_of_rows,
                                  const uint16_t bias_shift,
                                  const uint16_t out_shift,
                                  const q7_t *bias,
                                  q7_t *pOut,
                                  q15_t *vec_buffer);
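/*
 * Usage sketch (illustrative): a q7 fully-connected layer mapping a 64-element
 * input to 10 outputs. The bias_shift/out_shift values are hypothetical
 * fixed-point choices, and sizing vec_buffer to dim_vec q15 elements is an
 * assumption here - check the function definition file for the exact requirement.
 *
 *     #define DIM_VEC     64
 *     #define NUM_OF_ROWS 10
 *     static q7_t in_vec[DIM_VEC];
 *     static q7_t weights[NUM_OF_ROWS * DIM_VEC];  // filled elsewhere
 *     static q7_t biases[NUM_OF_ROWS];             // filled elsewhere
 *     static q7_t out_vec[NUM_OF_ROWS];
 *     static q15_t scratch[DIM_VEC];
 *     arm_fully_connected_q7(in_vec, weights, DIM_VEC, NUM_OF_ROWS,
 *                            0, 7,            // bias_shift, out_shift
 *                            biases, out_vec, scratch);
 */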
/**
 * @brief Basic s8 Fully Connected function.
 *
 * @param[in, out] ctx           Function context (e.g. temporary buffer). Check the function
 *                               definition file to see if an additional buffer is required.
 *                               Optional function {API}_get_buffer_size() provides the buffer
 *                               size if an additional buffer is required.
 * @param[in]      fc_params     Fully Connected layer parameters (e.g. strides, dilations, pads,...)
 *                               Range of fc_params->input_offset  : [-127, 128]
 *                               fc_params->filter_offset : 0
 *                               Range of fc_params->output_offset : [-128, 127]
 * @param[in]      quant_params  Per-tensor quantization info.
 *                               It contains the multiplier and shift values to be applied to the output tensor.
 * @param[in]      input_dims    Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
 *                               Input dimension is taken as Nx(H * W * C_IN)
 * @param[in]      input_data    Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims   Two dimensional filter dimensions. Format: [N, C]
 *                               N : accumulation depth and equals (H * W * C_IN) from input_dims
 *                               C : output depth and equals C_OUT in output_dims
 *                               H & W : Not used
 * @param[in]      filter_data   Filter data pointer. Data type: int8
 * @param[in]      bias_dims     Bias tensor dimensions. Format: [C_OUT]
 *                               N, H, W : Not used
 * @param[in]      bias_data     Bias data pointer. Data type: int32
 * @param[in]      output_dims   Output tensor dimensions. Format: [N, C_OUT]
 *                               N : Batches
 *                               C_OUT : Output depth
 *                               H & W : Not used.
 * @param[in, out] output_data   Output data pointer. Data type: int8
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    - Supported framework: TensorFlow Lite
 *    - q7 is used as the data type even though it is s8 data, to be consistent with existing APIs.
 */
arm_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
                                  const cmsis_nn_fc_params *fc_params,
                                  const cmsis_nn_per_tensor_quant_params *quant_params,
                                  const cmsis_nn_dims *input_dims,
                                  const q7_t *input_data,
                                  const cmsis_nn_dims *filter_dims,
                                  const q7_t *filter_data,
                                  const cmsis_nn_dims *bias_dims,
                                  const int32_t *bias_data,
                                  const cmsis_nn_dims *output_dims,
                                  q7_t *output_data);

/**
 * @brief Get the required buffer size for the s8 basic fully-connected and
 *        matrix multiplication layer function for TF Lite
 * @param[in]  filter_dims  dimension of filter
 * @return     The function returns required buffer size in bytes
 *
 */
int32_t arm_fully_connected_s8_get_buffer_size(const cmsis_nn_dims *filter_dims);
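/*
 * Usage sketch (illustrative): the call sequence for the s8 fully-connected
 * function. The offsets, multiplier and shift below are placeholders; in a
 * real deployment they come from the model's quantization parameters.
 *
 *     const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&filter_dims);
 *     cmsis_nn_context ctx = {buf_size > 0 ? scratch_buf : NULL, buf_size};
 *     cmsis_nn_fc_params fc_params = {
 *         .input_offset = 128, .filter_offset = 0, .output_offset = -1,
 *         .activation = {.min = -128, .max = 127}};
 *     cmsis_nn_per_tensor_quant_params quant_params = {.multiplier = 1073741824, .shift = -3};
 *     arm_status status = arm_fully_connected_s8(&ctx, &fc_params, &quant_params,
 *                                                &input_dims, input_data,
 *                                                &filter_dims, filter_data,
 *                                                &bias_dims, bias_data,
 *                                                &output_dims, output_data);
 */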
/**
 * @brief Q7 opt fully-connected layer function
 * @param[in]       pV          pointer to input vector
 * @param[in]       pM          pointer to matrix weights
 * @param[in]       dim_vec     length of the vector
 * @param[in]       num_of_rows number of rows in weight matrix
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        pointer to bias
 * @param[in,out]   pOut        pointer to output vector
 * @param[in,out]   vec_buffer  pointer to buffer space for input
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 */
arm_status arm_fully_connected_q7_opt(const q7_t *pV,
                                      const q7_t *pM,
                                      const uint16_t dim_vec,
                                      const uint16_t num_of_rows,
                                      const uint16_t bias_shift,
                                      const uint16_t out_shift,
                                      const q7_t *bias,
                                      q7_t *pOut,
                                      q15_t *vec_buffer);

/**
 * @brief Q15 basic fully-connected layer function
 * @param[in]       pV          pointer to input vector
 * @param[in]       pM          pointer to matrix weights
 * @param[in]       dim_vec     length of the vector
 * @param[in]       num_of_rows number of rows in weight matrix
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        pointer to bias
 * @param[in,out]   pOut        pointer to output vector
 * @param[in,out]   vec_buffer  pointer to buffer space for input
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 */
arm_status arm_fully_connected_q15(const q15_t *pV,
                                   const q15_t *pM,
                                   const uint16_t dim_vec,
                                   const uint16_t num_of_rows,
                                   const uint16_t bias_shift,
                                   const uint16_t out_shift,
                                   const q15_t *bias,
                                   q15_t *pOut,
                                   q15_t *vec_buffer);

/**
 * @brief Q15 opt fully-connected layer function
 * @param[in]       pV          pointer to input vector
 * @param[in]       pM          pointer to matrix weights
 * @param[in]       dim_vec     length of the vector
 * @param[in]       num_of_rows number of rows in weight matrix
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        pointer to bias
 * @param[in,out]   pOut        pointer to output vector
 * @param[in,out]   vec_buffer  pointer to buffer space for input
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 */
arm_status arm_fully_connected_q15_opt(const q15_t *pV,
                                       const q15_t *pM,
                                       const uint16_t dim_vec,
                                       const uint16_t num_of_rows,
                                       const uint16_t bias_shift,
                                       const uint16_t out_shift,
                                       const q15_t *bias,
                                       q15_t *pOut,
                                       q15_t *vec_buffer);

/**
 * @brief Mixed Q15-Q7 fully-connected layer function
 * @param[in]       pV          pointer to input vector
 * @param[in]       pM          pointer to matrix weights
 * @param[in]       dim_vec     length of the vector
 * @param[in]       num_of_rows number of rows in weight matrix
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        pointer to bias
 * @param[in,out]   pOut        pointer to output vector
 * @param[in,out]   vec_buffer  pointer to buffer space for input
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 */
arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t *pV,
                                              const q7_t *pM,
                                              const uint16_t dim_vec,
                                              const uint16_t num_of_rows,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              const q7_t *bias,
                                              q15_t *pOut,
                                              q15_t *vec_buffer);

/**
 * @brief Mixed Q15-Q7 opt fully-connected layer function
 * @param[in]       pV          pointer to input vector
 * @param[in]       pM          pointer to matrix weights
 * @param[in]       dim_vec     length of the vector
 * @param[in]       num_of_rows number of rows in weight matrix
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        pointer to bias
 * @param[in,out]   pOut        pointer to output vector
 * @param[in,out]   vec_buffer  pointer to buffer space for input
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 */
arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t *pV,
                                                  const q7_t *pM,
                                                  const uint16_t dim_vec,
                                                  const uint16_t num_of_rows,
                                                  const uint16_t bias_shift,
                                                  const uint16_t out_shift,
                                                  const q7_t *bias,
                                                  q15_t *pOut,
                                                  q15_t *vec_buffer);
/**
 * @brief Matrix-Multiplication Kernels for Convolution
 *
 * These functions are used within convolution layer functions for
 * matrix multiplication.
 *
 * The implementation is similar to the CMSIS-DSP arm_mat_mult functions
 * with one Q7 and one Q15 operand. The Q15 operand is the im2col
 * output, which always has 2 columns.
 *
 */

/**
 * @brief Matrix-multiplication function for convolution
 * @param[in]       pA          pointer to operand A
 * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
 * @param[in]       ch_im_out   numRow of A
 * @param[in]       numCol_A    numCol of A
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        the bias
 * @param[in,out]   pOut        pointer to output
 * @return     The function returns the incremented output pointer
 */
q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t *pA,
                                    const q15_t *pInBuffer,
                                    const uint16_t ch_im_out,
                                    const uint16_t numCol_A,
                                    const uint16_t bias_shift,
                                    const uint16_t out_shift,
                                    const q7_t *bias,
                                    q7_t *pOut);

/**
 * @brief Matrix-multiplication function for convolution with per-channel requantization.
 * @param[in]       input_a         pointer to operand A
 * @param[in]       input_b         pointer to operand B, always consists of 2 vectors.
 * @param[in]       output_ch       number of rows of A
 * @param[in]       out_shift       pointer to per output channel requantization shift parameter.
 * @param[in]       out_mult        pointer to per output channel requantization multiplier parameter.
 * @param[in]       out_offset      output tensor offset.
 * @param[in]       activation_min  minimum value to clamp the output to. Range : int8
 * @param[in]       activation_max  maximum value to clamp the output to. Range : int8
 * @param[in]       num_col_a       number of columns of A
 * @param[in]       output_bias     per output channel bias. Range : int32
 * @param[in,out]   out_0           pointer to output
 * @return     The function returns one of the two
 *             1. The incremented output pointer for a successful operation or
 *             2. NULL if implementation is not available.
 *
 * @details This function does the matrix multiplication of the weight matrix for all output channels
 *          with 2 columns from the im2col buffer and produces two elements per output channel. The
 *          outputs are clamped in the range provided by activation min and max.
 *          Supported framework: TensorFlow Lite micro.
 */
q7_t *arm_nn_mat_mult_kernel_s8_s16(const q7_t *input_a,
                                    const q15_t *input_b,
                                    const uint16_t output_ch,
                                    const int32_t *out_shift,
                                    const int32_t *out_mult,
                                    const int32_t out_offset,
                                    const int16_t activation_min,
                                    const int16_t activation_max,
                                    const uint16_t num_col_a,
                                    const int32_t *const output_bias,
                                    q7_t *out_0);
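/*
 * Illustrative sketch (not the actual arm_convolve_s8 implementation) of how a
 * convolution layer might drive this kernel: the im2col buffer is filled two
 * columns at a time and the returned pointer walks through the output.
 *
 *     q7_t *out = output_data;
 *     // for each pair of output pixels:
 *     //     fill im2col_buf with 2 columns of num_col_a q15 elements each
 *     out = arm_nn_mat_mult_kernel_s8_s16(filter_data, im2col_buf, output_ch,
 *                                         out_shift, out_mult, out_offset,
 *                                         act_min, act_max, num_col_a,
 *                                         bias_data, out);
 *     // a NULL return means no optimized implementation is available and a
 *     // reference fallback must be used instead
 */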
/**
 * @brief Matrix-multiplication of re-ordered input B with A.
 *
 * @details For arguments, refer to arm_nn_mat_mult_kernel_s8_s16. The re-ordering is a consequence
 *          of the sign extension done by the SXTB16 instruction on input_b. The outputs are clamped
 *          in the range provided by activation min and max.
 *          - Supported framework : TensorFlow Lite Micro
 *          - The following constraints on the arguments apply
 *            -# num_col_a is a multiple of 4
 *            -# output_ch is a multiple of 2
 *
 */
q7_t *arm_nn_mat_mult_kernel_s8_s16_reordered(const q7_t *input_a,
                                              const q15_t *input_b,
                                              const uint16_t output_ch,
                                              const int32_t *out_shift,
                                              const int32_t *out_mult,
                                              const int32_t out_offset,
                                              const int16_t activation_min,
                                              const int16_t activation_max,
                                              const uint16_t num_col_a,
                                              const int32_t *const output_bias,
                                              q7_t *out_0);
/**
 * @brief Matrix-multiplication function for convolution with reordered columns
 * @param[in]       pA          pointer to operand A
 * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
 * @param[in]       ch_im_out   numRow of A
 * @param[in]       numCol_A    numCol of A
 * @param[in]       bias_shift  amount of left-shift for bias
 * @param[in]       out_shift   amount of right-shift for output
 * @param[in]       bias        the bias
 * @param[in,out]   pOut        pointer to output
 * @return     The function returns the incremented output pointer
 *
 * @details This function assumes that the data in pInBuffer is reordered
 */
q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t *pA,
                                              const q15_t *pInBuffer,
                                              const uint16_t ch_im_out,
                                              const uint16_t numCol_A,
                                              const uint16_t bias_shift,
                                              const uint16_t out_shift,
                                              const q7_t *bias,
                                              q7_t *pOut);
#ifdef __cplusplus
}
#endif

/*
 * Other functions
 * These layers are typically not timing critical
 * Basic implementation is supported here
 */

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @defgroup BasicMath Basic math functions
 *
 * Element wise add and multiplication functions.
 *
 */
/**
 * @brief s8 element wise add of two vectors
 * @param[in]       input_1_vect        pointer to input vector 1
 * @param[in]       input_2_vect        pointer to input vector 2
 * @param[in]       input_1_offset      offset for input 1. Range: -127 to 128
 * @param[in]       input_1_mult        multiplier for input 1
 * @param[in]       input_1_shift       shift for input 1
 * @param[in]       input_2_offset      offset for input 2. Range: -127 to 128
 * @param[in]       input_2_mult        multiplier for input 2
 * @param[in]       input_2_shift       shift for input 2
 * @param[in]       left_shift          input left shift
 * @param[in,out]   output              pointer to output vector
 * @param[in]       out_offset          output offset
 * @param[in]       out_mult            output multiplier
 * @param[in]       out_shift           output shift
 * @param[in]       out_activation_min  minimum value to clamp output to
 * @param[in]       out_activation_max  maximum value to clamp output to
 * @param[in]       block_size          number of samples
 * @return     The function returns ARM_MATH_SUCCESS
 */
arm_status arm_elementwise_add_s8(const int8_t *input_1_vect,
                                  const int8_t *input_2_vect,
                                  const int32_t input_1_offset,
                                  const int32_t input_1_mult,
                                  const int32_t input_1_shift,
                                  const int32_t input_2_offset,
                                  const int32_t input_2_mult,
                                  const int32_t input_2_shift,
                                  const int32_t left_shift,
                                  int8_t *output,
                                  const int32_t out_offset,
                                  const int32_t out_mult,
                                  const int32_t out_shift,
                                  const int32_t out_activation_min,
                                  const int32_t out_activation_max,
                                  const uint32_t block_size);
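/*
 * Usage sketch (illustrative): adding two s8 tensors with TensorFlow Lite
 * style requantization. The offsets, multipliers and shifts below are
 * placeholders; in practice they are derived from the input/output scales
 * of the quantized model.
 *
 *     arm_elementwise_add_s8(in1, in2,
 *                            5, 1073741824, 0,    // input 1: offset, mult, shift
 *                            -3, 1073741824, 0,   // input 2: offset, mult, shift
 *                            20,                  // left_shift
 *                            out,
 *                            -128, 1073741824, -19,
 *                            -128, 127,           // clamp to int8 range
 *                            block_size);
 */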
/**
 * @brief s8 element wise multiplication
 * @param[in]       input_1_vect        pointer to input vector 1
 * @param[in]       input_2_vect        pointer to input vector 2
 * @param[in]       input_1_offset      offset for input 1. Range: -127 to 128
 * @param[in]       input_2_offset      offset for input 2. Range: -127 to 128
 * @param[in,out]   output              pointer to output vector
 * @param[in]       out_offset          output offset
 * @param[in]       out_mult            output multiplier
 * @param[in]       out_shift           output shift
 * @param[in]       out_activation_min  minimum value to clamp output to
 * @param[in]       out_activation_max  maximum value to clamp output to
 * @param[in]       block_size          number of samples
 * @return     The function returns ARM_MATH_SUCCESS
 *
 * @details Supported framework: TensorFlow Lite micro
 */
arm_status arm_elementwise_mul_s8(const int8_t *input_1_vect,
                                  const int8_t *input_2_vect,
                                  const int32_t input_1_offset,
                                  const int32_t input_2_offset,
                                  int8_t *output,
                                  const int32_t out_offset,
                                  const int32_t out_mult,
                                  const int32_t out_shift,
                                  const int32_t out_activation_min,
                                  const int32_t out_activation_max,
                                  const uint32_t block_size);
/**
 * @defgroup Acti Activation Functions
 *
 * Perform activation layers, including ReLU (Rectified Linear Unit),
 * sigmoid and tanh
 *
 */

/**
 * @brief Q7 RELU function
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 * @return none.
 */
void arm_relu_q7(q7_t *data, uint16_t size);

/**
 * @brief s8 ReLU6 function
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 */
void arm_relu6_s8(q7_t *data, uint16_t size);
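/*
 * Usage sketch: both ReLU variants operate in place on the data buffer.
 *
 *     q7_t activations[128];
 *     // ... previous layer writes into activations ...
 *     arm_relu_q7(activations, 128);    // x = max(x, 0)
 *     // or, for ReLU6 on s8 data:
 *     arm_relu6_s8(activations, 128);   // clamp to [0, 6]
 */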
/**
 * @brief Q15 RELU function
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 * @return none.
 */
void arm_relu_q15(q15_t *data, uint16_t size);

/**
 * @brief Q7 neural network activation function using direct table look-up
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
 * @param[in]       type        type of activation functions
 * @return none.
 */
void arm_nn_activations_direct_q7(q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);

/**
 * @brief Q15 neural network activation function using direct table look-up
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
 * @param[in]       type        type of activation functions
 * @return none.
 *
 * @details
 *
 * This is the direct table look-up approach.
 *
 * The integer part of the fixed-point value is assumed to be <= 3, since
 * larger values make little difference: saturation followed by any of these
 * activation functions gives the same result.
 */
void arm_nn_activations_direct_q15(q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type);
/**
 * @defgroup Pooling Pooling Functions
 *
 * Perform pooling functions, including max pooling and average pooling
 *
 */

/**
 * @brief Q7 max pooling function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   Im_out      pointer to output tensor
 * @return none.
 *
 */
void arm_maxpool_q7_HWC(q7_t *Im_in,
                        const uint16_t dim_im_in,
                        const uint16_t ch_im_in,
                        const uint16_t dim_kernel,
                        const uint16_t padding,
                        const uint16_t stride,
                        const uint16_t dim_im_out,
                        q7_t *bufferA,
                        q7_t *Im_out);

/**
 * @brief Q7 average pooling function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   Im_out      pointer to output tensor
 * @return none.
 *
 */
void arm_avepool_q7_HWC(q7_t *Im_in,
                        const uint16_t dim_im_in,
                        const uint16_t ch_im_in,
                        const uint16_t dim_kernel,
                        const uint16_t padding,
                        const uint16_t stride,
                        const uint16_t dim_im_out,
                        q7_t *bufferA,
                        q7_t *Im_out);
/**
 * @brief s8 average pooling function.
 *
 * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
 *                              definition file to see if an additional buffer is required.
 *                              Optional function {API}_get_buffer_size() provides the buffer
 *                              size if an additional buffer is required.
 * @param[in]      pool_params  Pooling parameters
 * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                              Argument 'N' is not used.
 * @param[in]      input_data   Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
 *                              Arguments N and C are not used.
 * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
 *                              Argument N is not used.
 *                              C_OUT equals C_IN.
 * @param[in, out] output_data  Output data pointer. Data type: int8
 * @return     The function returns
 *             <code>ARM_MATH_SUCCESS</code> - Successful operation
 *
 * @details
 *    - Supported Framework: TensorFlow Lite
 *
 */
arm_status arm_avgpool_s8(const cmsis_nn_context *ctx,
                          const cmsis_nn_pool_params *pool_params,
                          const cmsis_nn_dims *input_dims,
                          const q7_t *input_data,
                          const cmsis_nn_dims *filter_dims,
                          const cmsis_nn_dims *output_dims,
                          q7_t *output_data);

/**
 * @brief Get the required buffer size for the s8 average pooling function
 * @param[in]  dim_dst_width  output tensor dimension
 * @param[in]  ch_src         number of input tensor channels
 * @return     The function returns required buffer size in bytes
 *
 */
int32_t arm_avgpool_s8_get_buffer_size(const int dim_dst_width, const int ch_src);
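/*
 * Usage sketch (illustrative): query the scratch requirement, then pool.
 * The buffer is only needed on some targets, so a size of zero is possible.
 *
 *     const int32_t buf_size = arm_avgpool_s8_get_buffer_size(output_dims.w, input_dims.c);
 *     cmsis_nn_context ctx = {buf_size > 0 ? scratch_buf : NULL, buf_size};
 *     arm_status status = arm_avgpool_s8(&ctx, &pool_params, &input_dims, input_data,
 *                                        &filter_dims, &output_dims, output_data);
 */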
/**
 * @brief s8 max pooling function.
 *
 * @param[in, out] ctx          Function context (e.g. temporary buffer). Check the function
 *                              definition file to see if an additional buffer is required.
 *                              Optional function {API}_get_buffer_size() provides the buffer
 *                              size if an additional buffer is required.
 * @param[in]      pool_params  Pooling parameters
 * @param[in]      input_dims   Input (activation) tensor dimensions. Format: [H, W, C_IN]
 *                              Argument 'N' is not used.
 * @param[in]      input_data   Input (activation) data pointer. Data type: int8
 * @param[in]      filter_dims  Filter tensor dimensions. Format: [H, W]
 *                              Arguments N and C are not used.
 * @param[in]      output_dims  Output tensor dimensions. Format: [H, W, C_OUT]
 *                              Argument N is not used.
 *                              C_OUT equals C_IN.
 * @param[in, out] output_data  Output data pointer. Data type: int8
 * @return     The function returns
 *             <code>ARM_MATH_SUCCESS</code> - Successful operation
 *
 * @details
 *    - Supported Framework: TensorFlow Lite
 *
 */
arm_status arm_max_pool_s8(const cmsis_nn_context *ctx,
                           const cmsis_nn_pool_params *pool_params,
                           const cmsis_nn_dims *input_dims,
                           const q7_t *input_data,
                           const cmsis_nn_dims *filter_dims,
                           const cmsis_nn_dims *output_dims,
                           q7_t *output_data);
/**
 * @defgroup Softmax Softmax Functions
 *
 * EXP(2) based softmax functions.
 *
 */

/**
 * @brief Q7 softmax function
 * @param[in]  vec_in   pointer to input vector
 * @param[in]  dim_vec  input vector dimension
 * @param[out] p_out    pointer to output vector
 *
 * @note This function is an optimized version which is not bit-accurate with
 *       TensorFlow Lite's kernel
 *
 */
void arm_softmax_q7(const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out);

/**
 * @brief Q7 softmax function with batch parameter
 * @param[in]  vec_in      pointer to input vector
 * @param[in]  nb_batches  number of batches
 * @param[in]  dim_vec     input vector dimension
 * @param[out] p_out       pointer to output vector
 * @return none.
 *
 * @note This function is an optimized version which is not bit-accurate with
 *       TensorFlow Lite's kernel
 *
 */
void arm_softmax_with_batch_q7(const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out);

/**
 * @brief Q15 softmax function
 * @param[in]  vec_in   pointer to input vector
 * @param[in]  dim_vec  input vector dimension
 * @param[out] p_out    pointer to output vector
 * @return none.
 *
 * @note This function is an optimized version which is not bit-accurate with
 *       TensorFlow Lite's kernel
 *
 */
void arm_softmax_q15(const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out);

/**
 * @brief S8 softmax function
 * @param[in]  input     Pointer to the input tensor
 * @param[in]  num_rows  Number of rows in the input tensor
 * @param[in]  row_size  Number of elements in each input row
 * @param[in]  mult      Input quantization multiplier
 * @param[in]  shift     Input quantization shift within the range [0, 31]
 * @param[in]  diff_min  Minimum difference with max in row. Used to check if
 *                       the quantized exponential operation can be performed
 * @param[out] output    Pointer to the output tensor
 *
 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
 *
 */
void arm_softmax_s8(const int8_t *input,
                    const int32_t num_rows,
                    const int32_t row_size,
                    const int32_t mult,
                    const int32_t shift,
                    const int32_t diff_min,
                    int8_t *output);
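/*
 * Usage sketch (illustrative): softmax over each row of a [num_rows x row_size]
 * int8 tensor. mult, shift and diff_min are computed offline from the input
 * scale, as TensorFlow Lite micro does; the names below are placeholders.
 *
 *     arm_softmax_s8(logits, 2, 10, softmax_mult, softmax_shift, diff_min, probs);
 */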
/**
 * @brief U8 softmax function
 * @param[in]  input     Pointer to the input tensor
 * @param[in]  num_rows  Number of rows in the input tensor
 * @param[in]  row_size  Number of elements in each input row
 * @param[in]  mult      Input quantization multiplier
 * @param[in]  shift     Input quantization shift within the range [0, 31]
 * @param[in]  diff_min  Minimum difference with max in row. Used to check if
 *                       the quantized exponential operation can be performed
 * @param[out] output    Pointer to the output tensor
 *
 * @note Supported framework: TensorFlow Lite micro (bit-accurate)
 *
 */
void arm_softmax_u8(const uint8_t *input,
                    const int32_t num_rows,
                    const int32_t row_size,
                    const int32_t mult,
                    const int32_t shift,
                    const int32_t diff_min,
                    uint8_t *output);
/**
 * @brief uint8 depthwise convolution function with asymmetric quantization
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @param[in]     input          Pointer to input tensor
 * @param[in]     input_x        Width of input tensor
 * @param[in]     input_y        Height of input tensor
 * @param[in]     input_ch       Channels in input tensor
 * @param[in]     kernel         Pointer to kernel weights
 * @param[in]     kernel_x       Width of kernel
 * @param[in]     kernel_y       Height of kernel
 * @param[in]     ch_mult        Channel multiplier, i.e. number of output channels per input channel
 * @param[in]     pad_x          Padding size along the width
 * @param[in]     pad_y          Padding size along the height
 * @param[in]     stride_x       Stride along the width
 * @param[in]     stride_y       Stride along the height
 * @param[in]     dilation_x     Dilation along the width. Not used and intended for future enhancement.
 * @param[in]     dilation_y     Dilation along the height. Not used and intended for future enhancement.
 * @param[in]     bias           Pointer to optional bias values. If no bias is
 *                               available, NULL is expected
 * @param[in]     input_offset   Input tensor zero offset
 * @param[in]     filter_offset  Kernel tensor zero offset
 * @param[in]     output_offset  Output tensor zero offset
 * @param[in,out] output         Pointer to output tensor
 * @param[in]     output_x       Width of output tensor
 * @param[in]     output_y       Height of output tensor
 * @param[in]     output_activation_min  Minimum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_activation_max  Maximum value to clamp the output to. Range : {0, 255}
 * @param[in]     out_shift      Amount of right-shift for output
 * @param[in]     out_mult       Output multiplier for requantization
 * @return     The function returns the following
 *             <code>ARM_MATH_SUCCESS</code> - Successful operation
 *
 */
arm_status arm_depthwise_conv_u8_basic_ver1(const uint8_t *input,
                                            const uint16_t input_x,
                                            const uint16_t input_y,
                                            const uint16_t input_ch,
                                            const uint8_t *kernel,
                                            const uint16_t kernel_x,
                                            const uint16_t kernel_y,
                                            const int16_t ch_mult,
                                            const int16_t pad_x,
                                            const int16_t pad_y,
                                            const int16_t stride_x,
                                            const int16_t stride_y,
                                            const int16_t dilation_x,
                                            const int16_t dilation_y,
                                            const int32_t *bias,
                                            const int32_t input_offset,
                                            const int32_t filter_offset,
                                            const int32_t output_offset,
                                            uint8_t *output,
                                            const uint16_t output_x,
                                            const uint16_t output_y,
                                            const int32_t output_activation_min,
                                            const int32_t output_activation_max,
                                            const int32_t out_shift,
                                            const int32_t out_mult);
/**
 * @defgroup Reshape Reshape Functions
 *
 */

/**
 * @brief Reshape an s8 vector into another with a different shape
 * @param[in]  input       points to the s8 input vector
 * @param[out] output      points to the s8 output vector
 * @param[in]  total_size  total size of the input and output vectors in bytes
 *
 * @note The output is expected to be in a memory area that does not overlap with the input's
 *
 */
void arm_reshape_s8(const int8_t *input, int8_t *output, const uint32_t total_size);
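/*
 * Usage sketch: since a reshape does not touch the data, this is effectively a
 * copy of total_size bytes into a non-overlapping buffer.
 *
 *     // flatten a [1, 4, 4, 8] tensor into a 128-element vector
 *     arm_reshape_s8(input, output, 4 * 4 * 8);
 */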
/**
 * @defgroup Concatenation Concatenation Functions
 *
 */

/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis
 *        This function should be called for each input tensor to concatenate. The argument offset_x
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_x = 0
 *             for(i = 0; i < num_input_tensors; ++i)
 *             {
 *                 arm_concatenation_s8_x(&input[i], ..., &output, ..., ..., offset_x)
 *                 offset_x += input_x[i]
 *             }
 *
 *        This function assumes that the output tensor has:
 *        -# The same height as the input tensor
 *        -# The same number of channels as the input tensor
 *        -# The same batch size as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8
 *       tensors because it does not involve any arithmetic operation
 *
 * @param[in]  input     Pointer to input tensor
 * @param[in]  input_x   Width of input tensor
 * @param[in]  input_y   Height of input tensor
 * @param[in]  input_z   Channels in input tensor
 * @param[in]  input_w   Batch size in input tensor
 * @param[out] output    Pointer to output tensor
 * @param[in]  output_x  Width of output tensor
 * @param[in]  offset_x  The offset (in number of elements) on the X axis to start concatenating the input tensor
 *                       It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_x is less than output_x
 *
 */
void arm_concatenation_s8_x(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_x,
                            const uint32_t offset_x);
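/*
 * Concrete instance of the loop sketched above (illustrative values):
 * concatenating two tensors of widths 4 and 6 into an output of width 10.
 *
 *     uint32_t offset_x = 0;
 *     arm_concatenation_s8_x(in0, 4, input_y, input_z, input_w, out, 10, offset_x);
 *     offset_x += 4;
 *     arm_concatenation_s8_x(in1, 6, input_y, input_z, input_w, out, 10, offset_x);
 */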
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis
 *        This function should be called for each input tensor to concatenate. The argument offset_y
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_y = 0
 *             for(i = 0; i < num_input_tensors; ++i)
 *             {
 *                 arm_concatenation_s8_y(&input[i], ..., &output, ..., ..., offset_y)
 *                 offset_y += input_y[i]
 *             }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same number of channels as the input tensor
 *        -# The same batch size as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8
 *       tensors because it does not involve any arithmetic operation
 *
 * @param[in]  input     Pointer to input tensor
 * @param[in]  input_x   Width of input tensor
 * @param[in]  input_y   Height of input tensor
 * @param[in]  input_z   Channels in input tensor
 * @param[in]  input_w   Batch size in input tensor
 * @param[out] output    Pointer to output tensor
 * @param[in]  output_y  Height of output tensor
 * @param[in]  offset_y  The offset on the Y axis to start concatenating the input tensor
 *                       It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_y is less than output_y
 *
 */
void arm_concatenation_s8_y(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_y,
                            const uint32_t offset_y);
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis
 *        This function should be called for each input tensor to concatenate. The argument offset_z
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_z = 0
 *             for(i = 0; i < num_input_tensors; ++i)
 *             {
 *                 arm_concatenation_s8_z(&input[i], ..., &output, ..., ..., offset_z)
 *                 offset_z += input_z[i]
 *             }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same batch size as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8
 *       tensors because it does not involve any arithmetic operation
 *
 * @param[in]  input     Pointer to input tensor
 * @param[in]  input_x   Width of input tensor
 * @param[in]  input_y   Height of input tensor
 * @param[in]  input_z   Channels in input tensor
 * @param[in]  input_w   Batch size in input tensor
 * @param[out] output    Pointer to output tensor
 * @param[in]  output_z  Channels in output tensor
 * @param[in]  offset_z  The offset on the Z axis to start concatenating the input tensor
 *                       It is the user's responsibility to provide the correct value
 *
 * <b> Input constraints</b>
 * offset_z is less than output_z
 *
 */
void arm_concatenation_s8_z(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint16_t output_z,
                            const uint32_t offset_z);
/**
 * @brief int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (Batch size)
 *        This function should be called for each input tensor to concatenate. The argument offset_w
 *        will be used to store the input tensor in the correct position in the output tensor
 *
 *        i.e. offset_w = 0
 *             for(i = 0; i < num_input_tensors; ++i)
 *             {
 *                 arm_concatenation_s8_w(&input[i], ..., &output, ..., ..., offset_w)
 *                 offset_w += input_w[i]
 *             }
 *
 *        This function assumes that the output tensor has:
 *        -# The same width as the input tensor
 *        -# The same height as the input tensor
 *        -# The same number of channels as the input tensor
 *
 *        Unless specified otherwise, arguments are mandatory.
 *
 * @note This function is data-layout independent and can be used to concatenate either int8 or uint8
 *       tensors because it does not involve any arithmetic operation
 *
 * @param[in]  input     Pointer to input tensor
 * @param[in]  input_x   Width of input tensor
 * @param[in]  input_y   Height of input tensor
 * @param[in]  input_z   Channels in input tensor
 * @param[in]  input_w   Batch size in input tensor
 * @param[out] output    Pointer to output tensor
 * @param[in]  offset_w  The offset on the W axis to start concatenating the input tensor
 *                       It is the user's responsibility to provide the correct value
 *
 */
void arm_concatenation_s8_w(const int8_t *input,
                            const uint16_t input_x,
                            const uint16_t input_y,
                            const uint16_t input_z,
                            const uint16_t input_w,
                            int8_t *output,
                            const uint32_t offset_w);
/**
 * @defgroup SVDF SVDF Layer Functions
 *
 */

/**
 * @brief s8 SVDF function
 *
 * @param[in]  input_ctx             Temporary scratch buffer
 * @param[in]  output_ctx            Temporary output scratch buffer
 * @param[in]  svdf_params           SVDF Parameters
 *                                   Range of svdf_params->input_offset  : [-128, 127]
 *                                   Range of svdf_params->output_offset : [-128, 127]
 * @param[in]  input_quant_params    Input quantization parameters
 * @param[in]  output_quant_params   Output quantization parameters
 * @param[in]  input_dims            Input tensor dimensions
 * @param[in]  input_data            Pointer to input tensor
 * @param[in]  state_dims            State tensor dimensions
 * @param[in]  state_data            Pointer to state tensor
 * @param[in]  weights_feature_dims  Weights (feature) tensor dimensions
 * @param[in]  weights_feature_data  Pointer to the weights (feature) tensor
 * @param[in]  weights_time_dims     Weights (time) tensor dimensions
 * @param[in]  weights_time_data     Pointer to the weights (time) tensor
 * @param[in]  bias_dims             Bias tensor dimensions
 * @param[in]  bias_data             Pointer to bias tensor
 * @param[in]  output_dims           Output tensor dimensions
 * @param[out] output_data           Pointer to the output tensor
 *
 * @return     The function returns <code>ARM_MATH_SUCCESS</code>
 *
 * @details
 *    1. Supported framework: TensorFlow Lite micro
 *    2. q7 is used as the data type even though it is s8 data, to be consistent with existing APIs.
 *
 */
arm_status arm_svdf_s8(const cmsis_nn_context *input_ctx,
                       const cmsis_nn_context *output_ctx,
                       const cmsis_nn_svdf_params *svdf_params,
                       const cmsis_nn_per_tensor_quant_params *input_quant_params,
                       const cmsis_nn_per_tensor_quant_params *output_quant_params,
                       const cmsis_nn_dims *input_dims,
                       const q7_t *input_data,
                       const cmsis_nn_dims *state_dims,
                       q15_t *state_data,
                       const cmsis_nn_dims *weights_feature_dims,
                       const q7_t *weights_feature_data,
                       const cmsis_nn_dims *weights_time_dims,
                       const q15_t *weights_time_data,
                       const cmsis_nn_dims *bias_dims,
                       const q31_t *bias_data,
                       const cmsis_nn_dims *output_dims,
                       q7_t *output_data);
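/*
 * Usage sketch (illustrative): the two scratch contexts are sized according to
 * the function definition file; every dimension struct and data pointer below
 * is a placeholder set up by the caller.
 *
 *     cmsis_nn_context input_ctx  = {input_scratch, input_scratch_size};
 *     cmsis_nn_context output_ctx = {output_scratch, output_scratch_size};
 *     arm_status status = arm_svdf_s8(&input_ctx, &output_ctx, &svdf_params,
 *                                     &input_qp, &output_qp,
 *                                     &input_dims, input_data,
 *                                     &state_dims, state_data,
 *                                     &weights_feature_dims, weights_feature_data,
 *                                     &weights_time_dims, weights_time_data,
 *                                     &bias_dims, bias_data,
 *                                     &output_dims, output_data);
 */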
#ifdef __cplusplus
}
#endif

#endif