/*
 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nnfunctions.h
 * Description:  Public header file for CMSIS NN Library
 *
 * $Date:        17. January 2018
 * $Revision:    V.1.0.0
 *
 * Target Processor:  Cortex-M cores
 * -------------------------------------------------------------------- */
/**
   \mainpage CMSIS NN Software Library
   *
   * Introduction
   * ------------
   *
   * This user manual describes the CMSIS NN software library,
   * a collection of efficient neural network kernels developed to maximize the
   * performance and minimize the memory footprint of neural networks on Cortex-M processor cores.
   *
   * The library is divided into a number of functions each covering a specific category:
   * - Neural Network Convolution Functions
   * - Neural Network Activation Functions
   * - Fully-connected Layer Functions
   * - Neural Network Pooling Functions
   * - Softmax Functions
   * - Neural Network Support Functions
   *
   * The library has separate functions for operating on different weight and activation data
   * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
   * kernels is included in the function description. The implementation details are also
   * described in this paper [1].
   *
   * Block Diagram
   * --------
   * \image html CMSIS-NN-OVERVIEW.PNG
   *
   * Examples
   * --------
   *
   * The library ships with a number of examples which demonstrate how to use the library functions.
   *
   * Pre-processor Macros
   * ------------
   *
   * Each library project has different pre-processor macros.
   *
   * - ARM_MATH_DSP:
   *
   * Define macro ARM_MATH_DSP if the silicon supports DSP instructions.
   *
   * - ARM_MATH_BIG_ENDIAN:
   *
   * Define macro ARM_MATH_BIG_ENDIAN to build the library for big endian targets. By default the library builds for little endian targets.
   *
   * - ARM_NN_TRUNCATE:
   *
   * Define macro ARM_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
   *
   * Copyright Notice
   * ------------
   *
   * Copyright (C) 2010-2018 Arm Limited. All rights reserved.
   *
   * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
   */

/**
 * @defgroup groupNN Neural Network Functions
 * These functions perform basic operations for neural network layers.
 */
  88. #ifndef _ARM_NNFUNCTIONS_H
  89. #define _ARM_NNFUNCTIONS_H
  90. #include "arm_nnsupportfunctions.h"
  91. #include "arm_nn_tables.h"
  92. #define USE_INTRINSIC
  93. //#define ARM_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */
  94. #ifdef __cplusplus
  95. extern "C"
  96. {
  97. #endif
  98. /**
  99. * @defgroup NNConv Neural Network Convolution Functions
  100. *
  101. * Perform convolution layer
  102. *
  103. * The convolution is implemented in 2 steps: im2col and GEMM
  104. *
  105. * im2col is a process of converting each patch of image data into
  106. * a column. After im2col, the convolution is computed as matrix-matrix
  107. * multiplication.
  108. *
  109. * To reduce the memory footprint, the im2col is performed partially.
  110. * Each iteration, only a few column (i.e., patches) are generated and
  111. * computed with GEMM kernels similar to CMSIS-DSP arm_mat_mult functions.
  112. *
  113. */
  114. /**
  115. * @brief Basic Q7 convolution function
  116. * @param[in] Im_in pointer to input tensor
  117. * @param[in] dim_im_in input tensor dimention
  118. * @param[in] ch_im_in number of input tensor channels
  119. * @param[in] wt pointer to kernel weights
  120. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  121. * @param[in] dim_kernel filter kernel size
  122. * @param[in] padding padding sizes
  123. * @param[in] stride convolution stride
  124. * @param[in] bias pointer to bias
  125. * @param[in] bias_shift amount of left-shift for bias
  126. * @param[in] out_shift amount of right-shift for output
  127. * @param[in,out] Im_out pointer to output tensor
  128. * @param[in] dim_im_out output tensor dimension
  129. * @param[in,out] bufferA pointer to buffer space for input
  130. * @param[in,out] bufferB pointer to buffer space for output
  131. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  132. *
  133. */
  134. arm_status arm_convolve_HWC_q7_basic(const q7_t * Im_in,
  135. const uint16_t dim_im_in,
  136. const uint16_t ch_im_in,
  137. const q7_t * wt,
  138. const uint16_t ch_im_out,
  139. const uint16_t dim_kernel,
  140. const uint16_t padding,
  141. const uint16_t stride,
  142. const q7_t * bias,
  143. const uint16_t bias_shift,
  144. const uint16_t out_shift,
  145. q7_t * Im_out,
  146. const uint16_t dim_im_out,
  147. q15_t * bufferA,
  148. q7_t * bufferB);
  149. /**
  150. * @brief Basic Q15 convolution function
  151. * @param[in] Im_in pointer to input tensor
  152. * @param[in] dim_im_in input tensor dimention
  153. * @param[in] ch_im_in number of input tensor channels
  154. * @param[in] wt pointer to kernel weights
  155. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  156. * @param[in] dim_kernel filter kernel size
  157. * @param[in] padding padding sizes
  158. * @param[in] stride convolution stride
  159. * @param[in] bias pointer to bias
  160. * @param[in] bias_shift amount of left-shift for bias
  161. * @param[in] out_shift amount of right-shift for output
  162. * @param[in,out] Im_out pointer to output tensor
  163. * @param[in] dim_im_out output tensor dimension
  164. * @param[in,out] bufferA pointer to buffer space for input
  165. * @param[in,out] bufferB pointer to buffer space for output
  166. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  167. *
  168. */
  169. arm_status arm_convolve_HWC_q15_basic(const q15_t * Im_in,
  170. const uint16_t dim_im_in,
  171. const uint16_t ch_im_in,
  172. const q15_t * wt,
  173. const uint16_t ch_im_out,
  174. const uint16_t dim_kernel,
  175. const uint16_t padding,
  176. const uint16_t stride,
  177. const q15_t * bias,
  178. const uint16_t bias_shift,
  179. const uint16_t out_shift,
  180. q15_t * Im_out,
  181. const uint16_t dim_im_out,
  182. q15_t * bufferA,
  183. q7_t * bufferB);
  184. /**
  185. * @brief Fast Q7 convolution function
  186. * @param[in] Im_in pointer to input tensor
  187. * @param[in] dim_im_in input tensor dimention
  188. * @param[in] ch_im_in number of input tensor channels
  189. * @param[in] wt pointer to kernel weights
  190. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  191. * @param[in] dim_kernel filter kernel size
  192. * @param[in] padding padding sizes
  193. * @param[in] stride convolution stride
  194. * @param[in] bias pointer to bias
  195. * @param[in] bias_shift amount of left-shift for bias
  196. * @param[in] out_shift amount of right-shift for output
  197. * @param[in,out] Im_out pointer to output tensor
  198. * @param[in] dim_im_out output tensor dimension
  199. * @param[in,out] bufferA pointer to buffer space for input
  200. * @param[in,out] bufferB pointer to buffer space for output
  201. * @return The function returns either
  202. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  203. *
  204. * This function is the version with full list of optimization tricks, but with
  205. * some contraints:
  206. * ch_im_in is multiple of 4
  207. * ch_im_out is multiple of 2
  208. */
  209. arm_status arm_convolve_HWC_q7_fast(const q7_t * Im_in,
  210. const uint16_t dim_im_in,
  211. const uint16_t ch_im_in,
  212. const q7_t * wt,
  213. const uint16_t ch_im_out,
  214. const uint16_t dim_kernel,
  215. const uint16_t padding,
  216. const uint16_t stride,
  217. const q7_t * bias,
  218. const uint16_t bias_shift,
  219. const uint16_t out_shift,
  220. q7_t * Im_out,
  221. const uint16_t dim_im_out,
  222. q15_t * bufferA,
  223. q7_t * bufferB);
  224. /**
  225. * @brief Fast Q7 convolution function (non-sqaure shape)
  226. * @param[in] Im_in pointer to input tensor
  227. * @param[in] dim_im_in_x input tensor dimention x
  228. * @param[in] dim_im_in_y input tensor dimention y
  229. * @param[in] ch_im_in number of input tensor channels
  230. * @param[in] wt pointer to kernel weights
  231. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  232. * @param[in] dim_kernel_x filter kernel size x
  233. * @param[in] dim_kernel_y filter kernel size y
  234. * @param[in] padding_x padding size x
  235. * @param[in] padding_y padding size y
  236. * @param[in] stride_x convolution stride x
  237. * @param[in] stride_y convolution stride y
  238. * @param[in] bias pointer to bias
  239. * @param[in] bias_shift amount of left-shift for bias
  240. * @param[in] out_shift amount of right-shift for output
  241. * @param[in,out] Im_out pointer to output tensor
  242. * @param[in] dim_im_out_x output tensor dimension x
  243. * @param[in] dim_im_out_y output tensor dimension y
  244. * @param[in,out] bufferA pointer to buffer space for input
  245. * @param[in,out] bufferB pointer to buffer space for output
  246. * @return The function returns either
  247. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  248. *
  249. * This function is the version with full list of optimization tricks, but with
  250. * some contraints:
  251. * ch_im_in is multiple of 4
  252. * ch_im_out is multiple of 2
  253. */
  254. arm_status arm_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
  255. const uint16_t dim_im_in_x,
  256. const uint16_t dim_im_in_y,
  257. const uint16_t ch_im_in,
  258. const q7_t * wt,
  259. const uint16_t ch_im_out,
  260. const uint16_t dim_kernel_x,
  261. const uint16_t dim_kernel_y,
  262. const uint16_t padding_x,
  263. const uint16_t padding_y,
  264. const uint16_t stride_x,
  265. const uint16_t stride_y,
  266. const q7_t * bias,
  267. const uint16_t bias_shift,
  268. const uint16_t out_shift,
  269. q7_t * Im_out,
  270. const uint16_t dim_im_out_x,
  271. const uint16_t dim_im_out_y,
  272. q15_t * bufferA,
  273. q7_t * bufferB);
  274. /**
  275. * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape)
  276. * @param[in] Im_in pointer to input tensor
  277. * @param[in] dim_im_in_x input tensor dimention x
  278. * @param[in] dim_im_in_y input tensor dimention y
  279. * @param[in] ch_im_in number of input tensor channels
  280. * @param[in] wt pointer to kernel weights
  281. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  282. * @param[in] dim_kernel_x filter kernel size x
  283. * @param[in] dim_kernel_y filter kernel size y
  284. * @param[in] padding_x padding size x
  285. * @param[in] padding_y padding size y
  286. * @param[in] stride_x convolution stride x
  287. * @param[in] stride_y convolution stride y
  288. * @param[in] bias pointer to bias
  289. * @param[in] bias_shift amount of left-shift for bias
  290. * @param[in] out_shift amount of right-shift for output
  291. * @param[in,out] Im_out pointer to output tensor
  292. * @param[in] dim_im_out_x output tensor dimension x
  293. * @param[in] dim_im_out_y output tensor dimension y
  294. * @param[in,out] bufferA pointer to buffer space for input
  295. * @param[in,out] bufferB pointer to buffer space for output
  296. * @return The function returns either
  297. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  298. *
  299. * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1
  300. * and dim_kernel_y=1). It can be used for
  301. * second half of MobileNets after depthwise separable convolution.
  302. *
  303. * This function is the version with full list of optimization tricks, but with
  304. * some contraints:
  305. * ch_im_in is multiple of 4
  306. * ch_im_out is multiple of 2
  307. */
  308. arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
  309. const uint16_t dim_im_in_x,
  310. const uint16_t dim_im_in_y,
  311. const uint16_t ch_im_in,
  312. const q7_t * wt,
  313. const uint16_t ch_im_out,
  314. const uint16_t dim_kernel_x,
  315. const uint16_t dim_kernel_y,
  316. const uint16_t padding_x,
  317. const uint16_t padding_y,
  318. const uint16_t stride_x,
  319. const uint16_t stride_y,
  320. const q7_t * bias,
  321. const uint16_t bias_shift,
  322. const uint16_t out_shift,
  323. q7_t * Im_out,
  324. const uint16_t dim_im_out_x,
  325. const uint16_t dim_im_out_y,
  326. q15_t * bufferA,
  327. q7_t * bufferB);
  328. /**
  329. * @brief Q7 version of convolution for RGB image
  330. * @param[in] Im_in pointer to input tensor
  331. * @param[in] dim_im_in input tensor dimention
  332. * @param[in] ch_im_in number of input tensor channels
  333. * @param[in] wt pointer to kernel weights
  334. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  335. * @param[in] dim_kernel filter kernel size
  336. * @param[in] padding padding sizes
  337. * @param[in] stride convolution stride
  338. * @param[in] bias pointer to bias
  339. * @param[in] bias_shift amount of left-shift for bias
  340. * @param[in] out_shift amount of right-shift for output
  341. * @param[in,out] Im_out pointer to output tensor
  342. * @param[in] dim_im_out output tensor dimension
  343. * @param[in,out] bufferA pointer to buffer space for input
  344. * @param[in,out] bufferB pointer to buffer space for output
  345. * @return The function returns either
  346. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  347. *
  348. * This kernel is written exclusively for convolution with ch_im_in
  349. * equals 3. This applies on the first layer of CNNs which has input
  350. * image with RGB format.
  351. */
  352. arm_status arm_convolve_HWC_q7_RGB(const q7_t * Im_in,
  353. const uint16_t dim_im_in,
  354. const uint16_t ch_im_in,
  355. const q7_t * wt,
  356. const uint16_t ch_im_out,
  357. const uint16_t dim_kernel,
  358. const uint16_t padding,
  359. const uint16_t stride,
  360. const q7_t * bias,
  361. const uint16_t bias_shift,
  362. const uint16_t out_shift,
  363. q7_t * Im_out,
  364. const uint16_t dim_im_out,
  365. q15_t * bufferA,
  366. q7_t * bufferB);
  367. /**
  368. * @brief Fast Q15 convolution function
  369. * @param[in] Im_in pointer to input tensor
  370. * @param[in] dim_im_in input tensor dimention
  371. * @param[in] ch_im_in number of input tensor channels
  372. * @param[in] wt pointer to kernel weights
  373. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  374. * @param[in] dim_kernel filter kernel size
  375. * @param[in] padding padding sizes
  376. * @param[in] stride convolution stride
  377. * @param[in] bias pointer to bias
  378. * @param[in] bias_shift amount of left-shift for bias
  379. * @param[in] out_shift amount of right-shift for output
  380. * @param[in,out] Im_out pointer to output tensor
  381. * @param[in] dim_im_out output tensor dimension
  382. * @param[in,out] bufferA pointer to buffer space for input
  383. * @param[in,out] bufferB pointer to buffer space for output
  384. * @return The function returns either
  385. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  386. *
  387. * This function is the version with full list of optimization tricks, but with
  388. * some contraints:
  389. * ch_im_in is multiple of 2
  390. * ch_im_out is multiple of 2
  391. */
  392. arm_status arm_convolve_HWC_q15_fast(const q15_t * Im_in,
  393. const uint16_t dim_im_in,
  394. const uint16_t ch_im_in,
  395. const q15_t * wt,
  396. const uint16_t ch_im_out,
  397. const uint16_t dim_kernel,
  398. const uint16_t padding,
  399. const uint16_t stride,
  400. const q15_t * bias,
  401. const uint16_t bias_shift,
  402. const uint16_t out_shift,
  403. q15_t * Im_out,
  404. const uint16_t dim_im_out,
  405. q15_t * bufferA,
  406. q7_t * bufferB);
  407. /**
  408. * @brief Q7 depthwise separable convolution function
  409. * @param[in] Im_in pointer to input tensor
  410. * @param[in] dim_im_in input tensor dimention
  411. * @param[in] ch_im_in number of input tensor channels
  412. * @param[in] wt pointer to kernel weights
  413. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  414. * @param[in] dim_kernel filter kernel size
  415. * @param[in] padding padding sizes
  416. * @param[in] stride convolution stride
  417. * @param[in] bias pointer to bias
  418. * @param[in] bias_shift amount of left-shift for bias
  419. * @param[in] out_shift amount of right-shift for output
  420. * @param[in,out] Im_out pointer to output tensor
  421. * @param[in] dim_im_out output tensor dimension
  422. * @param[in,out] bufferA pointer to buffer space for input
  423. * @param[in,out] bufferB pointer to buffer space for output
  424. * @return The function returns either
  425. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  426. *
  427. * This function is the version with full list of optimization tricks, but with
  428. * some contraints:
  429. * ch_im_in is multiple of 2
  430. * ch_im_out is multiple of 2
  431. */
  432. arm_status arm_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
  433. const uint16_t dim_im_in,
  434. const uint16_t ch_im_in,
  435. const q7_t * wt,
  436. const uint16_t ch_im_out,
  437. const uint16_t dim_kernel,
  438. const uint16_t padding,
  439. const uint16_t stride,
  440. const q7_t * bias,
  441. const uint16_t bias_shift,
  442. const uint16_t out_shift,
  443. q7_t * Im_out,
  444. const uint16_t dim_im_out,
  445. q15_t * bufferA,
  446. q7_t * bufferB);
  447. /**
  448. * @brief Q7 depthwise separable convolution function (non-square shape)
  449. * @param[in] Im_in pointer to input tensor
  450. * @param[in] dim_im_in_x input tensor dimention x
  451. * @param[in] dim_im_in_y input tensor dimention y
  452. * @param[in] ch_im_in number of input tensor channels
  453. * @param[in] wt pointer to kernel weights
  454. * @param[in] ch_im_out number of filters, i.e., output tensor channels
  455. * @param[in] dim_kernel_x filter kernel size x
  456. * @param[in] dim_kernel_y filter kernel size y
  457. * @param[in] padding_x padding sizes x
  458. * @param[in] padding_y padding sizes y
  459. * @param[in] stride_x convolution stride x
  460. * @param[in] stride_y convolution stride y
  461. * @param[in] bias pointer to bias
  462. * @param[in] bias_shift amount of left-shift for bias
  463. * @param[in] out_shift amount of right-shift for output
  464. * @param[in,out] Im_out pointer to output tensor
  465. * @param[in] dim_im_out_x output tensor dimension x
  466. * @param[in] dim_im_out_y output tensor dimension y
  467. * @param[in,out] bufferA pointer to buffer space for input
  468. * @param[in,out] bufferB pointer to buffer space for output
  469. * @return The function returns either
  470. * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
  471. *
  472. * This function is the version with full list of optimization tricks, but with
  473. * some contraints:
  474. * ch_im_in is multiple of 2
  475. * ch_im_out is multiple of 2
  476. */
  477. arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
  478. const uint16_t dim_im_in_x,
  479. const uint16_t dim_im_in_y,
  480. const uint16_t ch_im_in,
  481. const q7_t * wt,
  482. const uint16_t ch_im_out,
  483. const uint16_t dim_kernel_x,
  484. const uint16_t dim_kernel_y,
  485. const uint16_t padding_x,
  486. const uint16_t padding_y,
  487. const uint16_t stride_x,
  488. const uint16_t stride_y,
  489. const q7_t * bias,
  490. const uint16_t bias_shift,
  491. const uint16_t out_shift,
  492. q7_t * Im_out,
  493. const uint16_t dim_im_out_x,
  494. const uint16_t dim_im_out_y,
  495. q15_t * bufferA,
  496. q7_t * bufferB);
  497. /**
  498. * @defgroup FC Fully-connected Layer Functions
  499. *
  500. * Perform fully-connected layer
  501. *
  502. * Fully-connected layer is basically a matrix-vector multiplication
  503. * with bias. The matrix is the weights and the input/output vectors
  504. * are the activation values. Supported {weight, activation} precisions
  505. * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
  506. *
  507. * Here we have two types of kernel functions. The basic function
  508. * implements the function using regular GEMV approach. The opt functions
  509. * operates with weights in interleaved formats.
  510. *
  511. */
  512. /**
  513. * @brief Q7 basic fully-connected layer function
  514. * @param[in] pV pointer to input vector
  515. * @param[in] pM pointer to matrix weights
  516. * @param[in] dim_vec length of the vector
  517. * @param[in] num_of_rows number of rows in weight matrix
  518. * @param[in] bias_shift amount of left-shift for bias
  519. * @param[in] out_shift amount of right-shift for output
  520. * @param[in] bias pointer to bias
  521. * @param[in,out] pOut pointer to output vector
  522. * @param[in,out] vec_buffer pointer to buffer space for input
  523. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  524. *
  525. */
  526. arm_status arm_fully_connected_q7(const q7_t * pV,
  527. const q7_t * pM,
  528. const uint16_t dim_vec,
  529. const uint16_t num_of_rows,
  530. const uint16_t bias_shift,
  531. const uint16_t out_shift,
  532. const q7_t * bias,
  533. q7_t * pOut,
  534. q15_t * vec_buffer);
  535. /**
  536. * @brief Q7 opt fully-connected layer function
  537. * @param[in] pV pointer to input vector
  538. * @param[in] pM pointer to matrix weights
  539. * @param[in] dim_vec length of the vector
  540. * @param[in] num_of_rows number of rows in weight matrix
  541. * @param[in] bias_shift amount of left-shift for bias
  542. * @param[in] out_shift amount of right-shift for output
  543. * @param[in] bias pointer to bias
  544. * @param[in,out] pOut pointer to output vector
  545. * @param[in,out] vec_buffer pointer to buffer space for input
  546. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  547. *
  548. */
  549. arm_status arm_fully_connected_q7_opt(const q7_t * pV,
  550. const q7_t * pM,
  551. const uint16_t dim_vec,
  552. const uint16_t num_of_rows,
  553. const uint16_t bias_shift,
  554. const uint16_t out_shift,
  555. const q7_t * bias,
  556. q7_t * pOut,
  557. q15_t * vec_buffer);
  558. /**
  559. * @brief Q15 basic fully-connected layer function
  560. * @param[in] pV pointer to input vector
  561. * @param[in] pM pointer to matrix weights
  562. * @param[in] dim_vec length of the vector
  563. * @param[in] num_of_rows number of rows in weight matrix
  564. * @param[in] bias_shift amount of left-shift for bias
  565. * @param[in] out_shift amount of right-shift for output
  566. * @param[in] bias pointer to bias
  567. * @param[in,out] pOut pointer to output vector
  568. * @param[in,out] vec_buffer pointer to buffer space for input
  569. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  570. *
  571. */
  572. arm_status arm_fully_connected_q15(const q15_t * pV,
  573. const q15_t * pM,
  574. const uint16_t dim_vec,
  575. const uint16_t num_of_rows,
  576. const uint16_t bias_shift,
  577. const uint16_t out_shift,
  578. const q15_t * bias,
  579. q15_t * pOut,
  580. q15_t * vec_buffer);
  581. /**
  582. * @brief Q15 opt fully-connected layer function
  583. * @param[in] pV pointer to input vector
  584. * @param[in] pM pointer to matrix weights
  585. * @param[in] dim_vec length of the vector
  586. * @param[in] num_of_rows number of rows in weight matrix
  587. * @param[in] bias_shift amount of left-shift for bias
  588. * @param[in] out_shift amount of right-shift for output
  589. * @param[in] bias pointer to bias
  590. * @param[in,out] pOut pointer to output vector
  591. * @param[in,out] vec_buffer pointer to buffer space for input
  592. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  593. *
  594. */
  595. arm_status arm_fully_connected_q15_opt(const q15_t * pV,
  596. const q15_t * pM,
  597. const uint16_t dim_vec,
  598. const uint16_t num_of_rows,
  599. const uint16_t bias_shift,
  600. const uint16_t out_shift,
  601. const q15_t * bias,
  602. q15_t * pOut,
  603. q15_t * vec_buffer);
  604. /**
  605. * @brief Mixed Q15-Q7 fully-connected layer function
  606. * @param[in] pV pointer to input vector
  607. * @param[in] pM pointer to matrix weights
  608. * @param[in] dim_vec length of the vector
  609. * @param[in] num_of_rows number of rows in weight matrix
  610. * @param[in] bias_shift amount of left-shift for bias
  611. * @param[in] out_shift amount of right-shift for output
  612. * @param[in] bias pointer to bias
  613. * @param[in,out] pOut pointer to output vector
  614. * @param[in,out] vec_buffer pointer to buffer space for input
  615. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  616. *
  617. */
  618. arm_status arm_fully_connected_mat_q7_vec_q15(const q15_t * pV,
  619. const q7_t * pM,
  620. const uint16_t dim_vec,
  621. const uint16_t num_of_rows,
  622. const uint16_t bias_shift,
  623. const uint16_t out_shift,
  624. const q7_t * bias,
  625. q15_t * pOut,
  626. q15_t * vec_buffer);
  627. /**
  628. * @brief Mixed Q15-Q7 opt fully-connected layer function
  629. * @param[in] pV pointer to input vector
  630. * @param[in] pM pointer to matrix weights
  631. * @param[in] dim_vec length of the vector
  632. * @param[in] num_of_rows number of rows in weight matrix
  633. * @param[in] bias_shift amount of left-shift for bias
  634. * @param[in] out_shift amount of right-shift for output
  635. * @param[in] bias pointer to bias
  636. * @param[in,out] pOut pointer to output vector
  637. * @param[in,out] vec_buffer pointer to buffer space for input
  638. * @return The function returns <code>ARM_MATH_SUCCESS</code>
  639. *
  640. */
  641. arm_status arm_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
  642. const q7_t * pM,
  643. const uint16_t dim_vec,
  644. const uint16_t num_of_rows,
  645. const uint16_t bias_shift,
  646. const uint16_t out_shift,
  647. const q7_t * bias,
  648. q15_t * pOut,
  649. q15_t * vec_buffer);
  650. /**
  651. * @brief Matrix-Multiplication Kernels for Convolution
  652. *
  653. * These functions are used within convolution layer functions for
  654. * matrix multiplication.
  655. *
  656. * The implementation is similar to CMSIS-DSP arm_mat_mult functions
  657. * with one Q7 and one Q15 operands. The Q15 operand is the im2col
  658. * output which is always with 2 columns.
  659. *
  660. */
  661. /**
  662. * @brief Matrix-multiplication function for convolution
  663. * @param[in] pA pointer to operand A
  664. * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors
  665. * @param[in] ch_im_out numRow of A
  666. * @param[in] numCol_A numCol of A
  667. * @param[in] bias_shift amount of left-shift for bias
  668. * @param[in] out_shift amount of right-shift for output
  669. * @param[in] bias the bias
  670. * @param[in,out] pOut pointer to output
  671. * @return The function returns the incremented output pointer
  672. */
  673. q7_t *arm_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
  674. const q15_t * pInBuffer,
  675. const uint16_t ch_im_out,
  676. const uint16_t numCol_A,
  677. const uint16_t bias_shift,
  678. const uint16_t out_shift,
  679. const q7_t * bias,
  680. q7_t * pOut);
  681. /**
  682. * @brief Matrix-multiplication function for convolution with reordered columns
  683. * @param[in] pA pointer to operand A
  684. * @param[in] pInBuffer pointer to operand B, always consists of 2 vectors
  685. * @param[in] ch_im_out numRow of A
  686. * @param[in] numCol_A numCol of A
  687. * @param[in] bias_shift amount of left-shift for bias
  688. * @param[in] out_shift amount of right-shift for output
  689. * @param[in] bias the bias
  690. * @param[in,out] pOut pointer to output
  691. * @return The function returns the incremented output pointer
  692. */
  693. q7_t *arm_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
  694. const q15_t * pInBuffer,
  695. const uint16_t ch_im_out,
  696. const uint16_t numCol_A,
  697. const uint16_t bias_shift,
  698. const uint16_t out_shift,
  699. const q7_t * bias,
  700. q7_t * pOut);
  701. #ifdef __cplusplus
  702. }
  703. #endif
  704. /*
  705. * Other functions
  706. * These layers are typically not timing critical
  707. * Basic implementation is supported here
  708. */
  709. #ifdef __cplusplus
  710. extern "C"
  711. {
  712. #endif
  713. /**
  714. * @defgroup Acti Neural Network Activation Functions
  715. *
  716. * Perform activation layers, including ReLU (Rectified Linear Unit),
  717. * sigmoid and tanh
  718. *
  719. */
  720. /**
  721. * @brief Q7 RELU function
  722. * @param[in,out] data pointer to input; modified in place
  723. * @param[in] size number of elements
  724. * @return none.
  725. */
  726. void arm_relu_q7(q7_t * data, uint16_t size);
  727. /**
  728. * @brief Q15 RELU function
  729. * @param[in,out] data pointer to input; modified in place
  730. * @param[in] size number of elements
  731. * @return none.
  732. */
  733. void arm_relu_q15(q15_t * data, uint16_t size);
  734. /**
  735. * @brief Q7 neural network activation function using direct table look-up
  736. * @param[in,out] data pointer to input; modified in place
  737. * @param[in] size number of elements
  738. * @param[in] int_width bit-width of the integer part, assumed to be smaller than 3
  739. * @param[in] type type of activation functions
  740. * @return none.
  741. */
  742. void arm_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
  743. arm_nn_activation_type type);
  744. /**
  745. * @brief Q15 neural network activation function using direct table look-up
  746. * @param[in,out] data pointer to input; modified in place
  747. * @param[in] size number of elements
  748. * @param[in] int_width bit-width of the integer part, assumed to be smaller than 3
  749. * @param[in] type type of activation functions
  750. * @return none.
  751. */
  752. void arm_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
  753. arm_nn_activation_type type);
  754. /**
  755. * @defgroup Pooling Neural Network Pooling Functions
  756. *
  757. * Perform pooling functions, including max pooling and average pooling
  758. *
  759. */
  760. /**
  761. * @brief Q7 max pooling function
  762. * @param[in] Im_in pointer to input tensor
  763. * @param[in] dim_im_in input tensor dimension
  764. * @param[in] ch_im_in number of input tensor channels
  765. * @param[in] dim_kernel filter kernel size
  766. * @param[in] padding padding sizes
  767. * @param[in] stride pooling stride
  768. * @param[in] dim_im_out output tensor dimension
  769. * @param[in,out] bufferA pointer to buffer space for input
  770. * @param[in,out] Im_out pointer to output tensor
  771. * @return none.
  772. *
  773. */
  774. void arm_maxpool_q7_HWC(q7_t * Im_in,
  775. const uint16_t dim_im_in,
  776. const uint16_t ch_im_in,
  777. const uint16_t dim_kernel,
  778. const uint16_t padding,
  779. const uint16_t stride,
  780. const uint16_t dim_im_out,
  781. q7_t * bufferA,
  782. q7_t * Im_out);
  783. /**
  784. * @brief Q7 average pooling function
  785. * @param[in] Im_in pointer to input tensor
  786. * @param[in] dim_im_in input tensor dimension
  787. * @param[in] ch_im_in number of input tensor channels
  788. * @param[in] dim_kernel filter kernel size
  789. * @param[in] padding padding sizes
  790. * @param[in] stride pooling stride
  791. * @param[in] dim_im_out output tensor dimension
  792. * @param[in,out] bufferA pointer to buffer space for input
  793. * @param[in,out] Im_out pointer to output tensor
  794. * @return none.
  795. *
  796. */
  797. void arm_avepool_q7_HWC(q7_t * Im_in,
  798. const uint16_t dim_im_in,
  799. const uint16_t ch_im_in,
  800. const uint16_t dim_kernel,
  801. const uint16_t padding,
  802. const uint16_t stride,
  803. const uint16_t dim_im_out,
  804. q7_t * bufferA,
  805. q7_t * Im_out);
  806. /**
  807. * @defgroup Softmax Softmax Functions
  808. *
  809. * EXP(2) based softmax function
  810. *
  811. */
  812. /**
  813. * @brief Q7 softmax function
  814. * @param[in] vec_in pointer to input vector
  815. * @param[in] dim_vec input vector dimension
  816. * @param[out] p_out pointer to output vector
  817. * @return none.
  818. *
  819. */
  820. void arm_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
  821. /**
  822. * @brief Q15 softmax function
  823. * @param[in] vec_in pointer to input vector
  824. * @param[in] dim_vec input vector dimension
  825. * @param[out] p_out pointer to output vector
  826. * @return none.
  827. *
  828. */
  829. void arm_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
  830. #ifdef __cplusplus
  831. }
  832. #endif
  833. #endif