// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.
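//
// For illustration only (not part of the original header): with these
// specializations in place, the generic FixedPoint<tRawType, tIntegerBits>
// class from fixedpoint.h can be instantiated on NEON vector types, along
// the lines of
//
//   using F0 = FixedPoint<int32x4_t, 0>;  // Q0.31, four lanes at a time
//   F0 x = F0::FromRaw(vld1q_s32(src));   // src: int32 buffer of raw values
//   F0 y = gemmlowp::tanh(x);             // fixedpoint.h math, vectorized
//   vst1q_s32(dst, y.raw());
//
// where src and dst are assumed to be int32 buffers holding Q0.31 values.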

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>

namespace gemmlowp {

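// Raw-type traits telling the generic fixedpoint.h machinery how to treat
// each NEON vector type: an int32x4_t carries four int32 lanes, an
// int16x8_t eight int16 lanes.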
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  typedef std::int32_t ScalarRawType;
  static constexpr int kLanes = 4;
};

template <>
struct FixedPointRawTypeTraits<int16x8_t> {
  typedef std::int16_t ScalarRawType;
  static constexpr int kLanes = 8;
};

template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

template <>
inline int16x8_t BitAnd(int16x8_t a, int16x8_t b) {
  return vandq_s16(a, b);
}

template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

template <>
inline int16x8_t BitOr(int16x8_t a, int16x8_t b) {
  return vorrq_s16(a, b);
}

template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

template <>
inline int16x8_t BitXor(int16x8_t a, int16x8_t b) {
  return veorq_s16(a, b);
}

template <>
inline int32x4_t BitNot(int32x4_t a) {
  return veorq_s32(a, vdupq_n_s32(-1));
}

template <>
inline int16x8_t BitNot(int16x8_t a) {
  return veorq_s16(a, vdupq_n_s16(-1));
}

template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

template <>
inline int16x8_t Add(int16x8_t a, int16x8_t b) {
  return vaddq_s16(a, b);
}

template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

template <>
inline int16x8_t Sub(int16x8_t a, int16x8_t b) {
  return vsubq_s16(a, b);
}

template <>
inline int32x4_t Neg(int32x4_t a) {
  return vnegq_s32(a);
}

template <>
inline int16x8_t Neg(int16x8_t a) {
  return vnegq_s16(a);
}

template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(offset));
}

template <>
inline int16x8_t ShiftLeft(int16x8_t a, int offset) {
  return vshlq_s16(a, vdupq_n_s16(offset));
}

template <>
inline int32x4_t ShiftLeft(int32x4_t a, int32x4_t offset) {
  return vshlq_s32(a, offset);
}

template <>
inline int16x8_t ShiftLeft(int16x8_t a, int16x8_t offset) {
  return vshlq_s16(a, offset);
}
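
// NEON's VSHL shifts left by a per-lane signed amount; a negative amount
// yields an arithmetic right shift for signed types, which is how
// ShiftRight is expressed here.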
template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(-offset));
}

template <>
inline int16x8_t ShiftRight(int16x8_t a, int offset) {
  return vshlq_s16(a, vdupq_n_s16(-offset));
}
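
// Comparisons return their results as signed "mask" vectors, all-ones or
// all-zeros in each lane, reinterpreted from NEON's unsigned comparison
// results. SelectUsingMask consumes such masks via a per-bit select.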
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val);
}

template <>
inline int16x8_t SelectUsingMask(int16x8_t if_mask, int16x8_t then_val,
                                 int16x8_t else_val) {
  return vbslq_s16(vreinterpretq_u16_s16(if_mask), then_val, else_val);
}

template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vceqq_s32(a, b));
}

template <>
inline int16x8_t MaskIfEqual(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vceqq_s16(a, b));
}

template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int16x8_t MaskIfNotEqual(int16x8_t a, int16x8_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  return MaskIfEqual(a, vdupq_n_s32(0));
}

template <>
inline int16x8_t MaskIfZero(int16x8_t a) {
  return MaskIfEqual(a, vdupq_n_s16(0));
}

template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}

template <>
inline int16x8_t MaskIfNonZero(int16x8_t a) {
  return vreinterpretq_s16_u16(vtstq_s16(a, a));
}

template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
}

template <>
inline int16x8_t MaskIfGreaterThan(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vcgtq_s16(a, b));
}

template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgeq_s32(a, b));
}

template <>
inline int16x8_t MaskIfGreaterThanOrEqual(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vcgeq_s16(a, b));
}

template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcltq_s32(a, b));
}

template <>
inline int16x8_t MaskIfLessThan(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vcltq_s16(a, b));
}

template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcleq_s32(a, b));
}

template <>
inline int16x8_t MaskIfLessThanOrEqual(int16x8_t a, int16x8_t b) {
  return vreinterpretq_s16_u16(vcleq_s16(a, b));
}
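
// Horizontal all/any reductions over mask vectors. Since each lane is
// all-ones or all-zeros, AND-ing (or OR-ing) the vector with rotated
// copies of itself (vext by 1, 2, and, for eight lanes, 4) folds every
// lane into lane 0 in log2(lanes) steps.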
template <>
inline bool All(int32x4_t a) {
  a = vandq_s32(a, vextq_s32(a, a, 1));
  a = vandq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

template <>
inline bool All(int16x8_t a) {
  a = vandq_s16(a, vextq_s16(a, a, 1));
  a = vandq_s16(a, vextq_s16(a, a, 2));
  a = vandq_s16(a, vextq_s16(a, a, 4));
  return vgetq_lane_s16(a, 0);
}

template <>
inline bool Any(int32x4_t a) {
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

template <>
inline bool Any(int16x8_t a) {
  a = vorrq_s16(a, vextq_s16(a, a, 1));
  a = vorrq_s16(a, vextq_s16(a, a, 2));
  a = vorrq_s16(a, vextq_s16(a, a, 4));
  return vgetq_lane_s16(a, 0);
}

template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

template <>
inline int16x8_t RoundingHalfSum(int16x8_t a, int16x8_t b) {
  return vrhaddq_s16(a, b);
}
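
// Fixed-point multiplication: NEON's VQRDMULH performs the saturating
// rounding doubling high multiply in a single instruction.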
template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

template <>
inline int16x8_t SaturatingRoundingDoublingHighMul(int16x8_t a, int16x8_t b) {
  return vqrdmulhq_s16(a, b);
}
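
// Rounding divide by a power of two, with ties rounded away from zero.
// VRSHL with a negative shift amount rounds ties upward, so negative
// inputs are first nudged down by one (the "fixup"); the AND with the
// (negative) shift vector makes the fixup zero when the exponent is zero,
// and vqadd keeps the nudge from wrapping at the INT_MIN boundary.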
template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  const int32x4_t shift_vec = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}

template <>
inline int16x8_t RoundingDivideByPOT(int16x8_t x, int exponent) {
  const int16x8_t shift_vec = vdupq_n_s16(-exponent);
  const int16x8_t fixup = vshrq_n_s16(vandq_s16(x, shift_vec), 15);
  const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);
  return vrshlq_s16(fixed_up_x, shift_vec);
}

template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int32x4_t exponent) {
  const int32x4_t shift_vec = vnegq_s32(exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}

template <>
inline int16x8_t RoundingDivideByPOT(int16x8_t x, int16x8_t exponent) {
  const int16x8_t shift_vec = vnegq_s16(exponent);
  const int16x8_t fixup = vshrq_n_s16(vandq_s16(x, shift_vec), 15);
  const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);
  return vrshlq_s16(fixed_up_x, shift_vec);
}
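
// Saturating, rounding multiplication by a power of two. The third
// template argument is the sign of Exponent: +1 selects a saturating left
// shift, -1 a rounding right shift with the same negative-input fixup as
// above (unconditional here, since the exponent is known to be nonzero).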
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t fixup = vshrq_n_s32(x, 31);
    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
    return vrshrq_n_s32(fixed_up_x, -Exponent);
  }
};

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, 1> {
  static int16x8_t eval(int16x8_t x) { return vqshlq_n_s16(x, Exponent); }
};

template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, -1> {
  static int16x8_t eval(int16x8_t x) {
    const int16x8_t fixup = vshrq_n_s16(x, 15);
    const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);
    return vrshrq_n_s16(fixed_up_x, -Exponent);
  }
};

template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  return vdupq_n_s32(x);
}

template <>
inline int16x8_t Dup<int16x8_t>(std::int16_t x) {
  return vdupq_n_s16(x);
}

// So far this is only needed for int16.
template <>
inline int16x8_t SaturatingAdd(int16x8_t a, int16x8_t b) {
  return vqaddq_s16(a, b);
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_