/*
 * SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* ----------------------------------------------------------------------
 * Project:      CMSIS NN Library
 * Title:        arm_nn_compiler.h
 * Description:  Generic compiler header
 *
 * $Date:        16 January 2024
 * $Revision:    V.1.2.2
 *
 * Target :  Arm(R) M-Profile Architecture
 * -------------------------------------------------------------------- */
  28. #ifndef ARM_NN_COMPILER_H
  29. #define ARM_NN_COMPILER_H
  30. /**
  31. *
  32. * @brief Arm C-Language Extension(ACLE) Includes
  33. *
  34. */
  35. #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050)
  36. #ifndef __ASM
  37. #define __ASM __asm
  38. #endif
  39. #ifndef __INLINE
  40. #define __INLINE __inline
  41. #endif
  42. #ifndef __STATIC_INLINE
  43. #define __STATIC_INLINE static __inline
  44. #endif
  45. #ifndef __STATIC_FORCEINLINE
  46. #define __STATIC_FORCEINLINE __attribute__((always_inline)) static __inline
  47. #endif
  48. #ifndef __RESTRICT
  49. #define __RESTRICT __restrict
  50. #endif
  51. #elif defined(__ICCARM__)
  52. #warning IAR support is not tested
  53. #ifndef __ASM
  54. #define __ASM __asm
  55. #endif
  56. #ifndef __INLINE
  57. #define __INLINE inline
  58. #endif
  59. #ifndef __STATIC_INLINE
  60. #define __STATIC_INLINE static inline
  61. #endif
  62. #ifndef __FORCEINLINE
  63. #define __FORCEINLINE _Pragma("inline=forced")
  64. #endif
  65. #ifndef __STATIC_FORCEINLINE
  66. #define __STATIC_FORCEINLINE __FORCEINLINE __STATIC_INLINE
  67. #endif
  68. #ifndef __RESTRICT
  69. #define __RESTRICT __restrict
  70. #endif
  71. #elif defined(_MSC_VER)
  72. // Build for non Arm Cortex-M processors is not tested or supported.
  73. // Use this section to stub any macros or intrinsics
  74. #warning Unsupported compiler
  75. #ifndef __STATIC_FORCEINLINE
  76. #define __STATIC_FORCEINLINE static __forceinline
  77. #endif
  78. #ifndef __STATIC_INLINE
  79. #define __STATIC_INLINE static __inline
  80. #endif
  81. #ifndef __ALIGNED
  82. #define __ALIGNED(x) __declspec(align(x))
  83. #endif
  84. #elif defined(__GNUC__)
  85. #ifndef __ASM
  86. #define __ASM __asm
  87. #endif
  88. #ifndef __INLINE
  89. #define __INLINE inline
  90. #endif
  91. #ifndef __STATIC_INLINE
  92. #define __STATIC_INLINE static inline
  93. #endif
  94. #ifndef __STATIC_FORCEINLINE
  95. #define __STATIC_FORCEINLINE __attribute__((always_inline)) static inline
  96. #endif
  97. #ifndef __RESTRICT
  98. #define __RESTRICT __restrict
  99. #endif
  100. #else
  101. #error Unsupported compiler. Add support as needed
  102. #endif
  103. /**
  104. *
  105. * @brief Compiler specific diagnostic adjustment / fixes if applicable
  106. *
  107. */
  108. // Note: __ARM_ARCH is used with M-profile architecture as the target here.
  109. #if defined(__GNUC__)
  110. #if (__GNUC__ == 12 && (__GNUC_MINOR__ <= 2)) && defined(__ARM_ARCH)
  111. // Workaround for 'Internal Compiler Error' on Arm GNU Toolchain rel 12.2.x
  112. // https://gcc.gnu.org/pipermail/gcc-patches/2022-December/607963.html
  113. #define ARM_GCC_12_2_ICE
  114. #endif
  115. #endif
  116. #if defined(__ARM_FEATURE_MVE) && ((__ARM_FEATURE_MVE & 3) == 3) || (__ARM_FEATURE_MVE & 1)
  117. #include <arm_mve.h>
  118. #endif
  119. #if defined(__ARM_ARCH) || defined(__ARM_ACLE)
  120. #include <arm_acle.h>
  121. #endif
  122. #if defined(__GNUC__)
  123. #include <stdint.h>
  124. #endif
  125. /**
  126. *
  127. * @brief ACLE and Intrinsics
  128. *
  129. */
  130. // Note: Have __GNUC__, that is used to check for GCC , checks at the end
  131. // as __GNUC__ is defined by non-GCC compilers as well
  132. /* Common intrinsics for all architectures */
  133. #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)
  134. #define CLZ __clz
  135. #elif defined(__GNUC__)
  136. /**
  137. \brief Count leading zeros
  138. \details Counts the number of leading zeros of a data value.
  139. \param [in] value Value to count the leading zeros
  140. \return number of leading zeros in value
  141. */
  142. __STATIC_FORCEINLINE uint8_t CLZ(uint32_t value)
  143. {
  144. /* Even though __builtin_clz produces a CLZ instruction on ARM, formally
  145. __builtin_clz(0) is undefined behaviour, so handle this case specially.
  146. This guarantees Arm-compatible results if compiling on a non-Arm
  147. target, and ensures the compiler doesn't decide to activate any
  148. optimisations using the logic "value was passed to __builtin_clz, so it
  149. is non-zero".
  150. ARM GCC 7.3 and possibly earlier will optimise this test away, leaving a
  151. single CLZ instruction.
  152. */
  153. if (value == 0U)
  154. {
  155. return 32U;
  156. }
  157. return __builtin_clz(value);
  158. }
  159. #endif
  160. // ACLE intrinsics under groups __ARM_FEATURE_QBIT, __ARM_FEATURE_DSP , __ARM_FEATURE_SAT, __ARM_FEATURE_SIMD32
  161. // Note: Just __ARM_FEATURE_DSP is checked to collect all intrinsics from the above mentioned groups
  162. #if (defined(__ARM_FEATURE_DSP) && (__ARM_FEATURE_DSP == 1))
  163. // Common intrinsics
  164. #define SMLABB __smlabb
  165. #define SMLATT __smlatt
  166. #define QADD __qadd
  167. #define QSUB8 __qsub8
  168. #define QSUB16 __qsub16
  169. #define SADD16 __sadd16
  170. // Compiler specifc variants of intrinsics. Create a new section or file for IAR if needed
  171. #if defined(__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) || defined(__ICCARM__)
  172. #define SMULBB __smulbb
  173. #define SMULTT __smultt
  174. #define ROR __ror
  175. #define SXTB16 __sxtb16
  176. #define SXTAB16 __sxtab16
  177. #define SXTB16_RORn(ARG1, ARG2) SXTB16(ROR(ARG1, ARG2))
  178. #define SXTAB16_RORn(ARG1, ARG2, ARG3) SXTAB16(ARG1, ROR(ARG2, ARG3))
  179. #define SMLAD __smlad
  180. // PKH<XY> translates into pkh<xy> on AC6
  181. #define PKHBT(ARG1, ARG2, ARG3) \
  182. (((((uint32_t)(ARG1))) & 0x0000FFFFUL) | ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL))
  183. #define PKHTB(ARG1, ARG2, ARG3) \
  184. (((((uint32_t)(ARG1))) & 0xFFFF0000UL) | ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL))
  185. #elif defined(__GNUC__)
  186. #define PKHBT(ARG1, ARG2, ARG3) \
  187. __extension__({ \
  188. uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
  189. __ASM("pkhbt %0, %1, %2, lsl %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \
  190. __RES; \
  191. })
  192. #define PKHTB(ARG1, ARG2, ARG3) \
  193. __extension__({ \
  194. uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
  195. if (ARG3 == 0) \
  196. __ASM("pkhtb %0, %1, %2" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2)); \
  197. else \
  198. __ASM("pkhtb %0, %1, %2, asr %3" : "=r"(__RES) : "r"(__ARG1), "r"(__ARG2), "I"(ARG3)); \
  199. __RES; \
  200. })
  201. __STATIC_FORCEINLINE uint32_t SXTAB16(uint32_t op1, uint32_t op2)
  202. {
  203. uint32_t result;
  204. __ASM("sxtab16 %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
  205. return (result);
  206. }
  207. __STATIC_FORCEINLINE uint32_t SXTB16(uint32_t op1)
  208. {
  209. uint32_t result;
  210. __ASM("sxtb16 %0, %1" : "=r"(result) : "r"(op1));
  211. return (result);
  212. }
  213. // __smlad is defined by GCC, but results in a performance drop(Tested on Arm GNU Toolchain version 11.x and 12.x)
  214. __STATIC_FORCEINLINE uint32_t SMLAD(uint32_t op1, uint32_t op2, uint32_t op3)
  215. {
  216. uint32_t result;
  217. __ASM volatile("smlad %0, %1, %2, %3" : "=r"(result) : "r"(op1), "r"(op2), "r"(op3));
  218. return (result);
  219. }
  220. __STATIC_FORCEINLINE uint32_t ROR(uint32_t op1, uint32_t op2)
  221. {
  222. op2 %= 32U;
  223. if (op2 == 0U)
  224. {
  225. return op1;
  226. }
  227. return (op1 >> op2) | (op1 << (32U - op2));
  228. }
  229. __STATIC_FORCEINLINE uint32_t SXTB16_RORn(uint32_t op1, uint32_t rotate)
  230. {
  231. uint32_t result;
  232. if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
  233. {
  234. __ASM volatile("sxtb16 %0, %1, ROR %2" : "=r"(result) : "r"(op1), "i"(rotate));
  235. }
  236. else
  237. {
  238. result = SXTB16(ROR(op1, rotate));
  239. }
  240. return result;
  241. }
  242. __STATIC_FORCEINLINE uint32_t SXTAB16_RORn(uint32_t op1, uint32_t op2, uint32_t rotate)
  243. {
  244. uint32_t result;
  245. if (__builtin_constant_p(rotate) && ((rotate == 8U) || (rotate == 16U) || (rotate == 24U)))
  246. {
  247. __ASM volatile("sxtab16 %0, %1, %2, ROR %3" : "=r"(result) : "r"(op1), "r"(op2), "i"(rotate));
  248. }
  249. else
  250. {
  251. result = SXTAB16(op1, ROR(op2, rotate));
  252. }
  253. return result;
  254. }
  255. // Inline assembly routines for ACLE intrinsics that are not defined by GCC toolchain
  256. __STATIC_FORCEINLINE uint32_t SMULBB(uint32_t op1, uint32_t op2)
  257. {
  258. uint32_t result;
  259. __ASM volatile("smulbb %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
  260. return (result);
  261. }
  262. __STATIC_FORCEINLINE uint32_t SMULTT(uint32_t op1, uint32_t op2)
  263. {
  264. uint32_t result;
  265. __ASM volatile("smultt %0, %1, %2" : "=r"(result) : "r"(op1), "r"(op2));
  266. return (result);
  267. }
  268. #endif
  269. #endif
  270. #endif /* #ifndef ARM_NN_COMPILER_H */