// simd_neon.h
  1. // Simd NEON specific implementations -*- C++ -*-
  2. // Copyright (C) 2020-2021 Free Software Foundation, Inc.
  3. //
  4. // This file is part of the GNU ISO C++ Library. This library is free
  5. // software; you can redistribute it and/or modify it under the
  6. // terms of the GNU General Public License as published by the
  7. // Free Software Foundation; either version 3, or (at your option)
  8. // any later version.
  9. // This library is distributed in the hope that it will be useful,
  10. // but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. // GNU General Public License for more details.
  13. // Under Section 7 of GPL version 3, you are granted additional
  14. // permissions described in the GCC Runtime Library Exception, version
  15. // 3.1, as published by the Free Software Foundation.
  16. // You should have received a copy of the GNU General Public License and
  17. // a copy of the GCC Runtime Library Exception along with this program;
  18. // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
  19. // <http://www.gnu.org/licenses/>.
  20. #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
  21. #define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
  22. #if __cplusplus >= 201703L
  23. #if !_GLIBCXX_SIMD_HAVE_NEON
  24. #error "simd_neon.h may only be included when NEON on ARM is available"
  25. #endif
  26. _GLIBCXX_SIMD_BEGIN_NAMESPACE
// _CommonImplNeon {{{
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  // No NEON-specific store optimization; inherit the generic builtin
  // implementation unchanged.
  using _CommonImplBuiltin::_S_store;
  // }}}
};
// }}}
// _SimdImplNeon {{{
// NEON-specific simd operations.  Derives from the generic builtin
// implementation and overrides only the operations where NEON intrinsics
// (or a NEON-friendly strategy) improve on the generic code; everything
// else is inherited from _Base.
template <typename _Abi>
  struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
  {
    using _Base = _SimdImplBuiltin<_Abi>;

    template <typename _Tp>
      using _MaskMember = typename _Base::template _MaskMember<_Tp>;

    // A NEON Q register holds 16 bytes, so never store more at once.
    template <typename _Tp>
      static constexpr size_t _S_max_store_size = 16;

    // _S_masked_load {{{
    // For every lane where __k is non-zero, overwrite the corresponding
    // lane of __merge with __mem[i] converted to _Tp.  NEON has no
    // masked-load instruction, so this expands to per-lane conditionals.
    template <typename _Tp, size_t _Np, typename _Up>
      static inline _SimdWrapper<_Tp, _Np>
      _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
		     const _Up* __mem) noexcept
      {
	__execute_n_times<_Np>([&](auto __i) {
	  if (__k[__i] != 0)
	    __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
	});
	return __merge;
      }

    // }}}
    // _S_masked_store_nocvt {{{
    // Store the lanes of __v selected by __k to __mem (no type
    // conversion).  Per-lane loop again, since NEON lacks masked stores.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static void
      _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
			    _MaskMember<_Tp> __k)
      {
	__execute_n_times<_Np>([&](auto __i) {
	  if (__k[__i] != 0)
	    __mem[__i] = __v[__i];
	});
      }

    // }}}
    // _S_reduce {{{
    // Horizontal reduction of __x with __binary_op.  Strategy: a full
    // 16-byte vector is first split into its two 8-byte halves, which are
    // combined and then reduced recursively in the 8-byte ABI; 8-byte
    // vectors are reduced in log2(_Np) steps by combining the vector with
    // a lane-permuted copy of itself (butterfly reduction), so
    // __binary_op always operates on whole vectors.
    template <typename _Tp, typename _BinaryOperation>
      _GLIBCXX_SIMD_INTRINSIC static _Tp
      _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
      {
	constexpr size_t _Np = __x.size();
	if constexpr (sizeof(__x) == 16 && _Np >= 4
			&& !_Abi::template _S_is_partial<_Tp>)
	  {
	    // Split the Q register into two D registers, combine, recurse.
	    const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
	    const auto __y = __binary_op(__halves[0], __halves[1]);
	    return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
	      __y, static_cast<_BinaryOperation&&>(__binary_op));
	  }
	else if constexpr (_Np == 8)
	  {
	    // Butterfly: combine with neighbor-swapped, pair-reversed, and
	    // fully reversed permutations; lane 0 then holds the result.
	    __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
				     __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
				       __x._M_data)));
	    __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
				     __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
				       __x._M_data)));
	    __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
				     __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
				       __x._M_data)));
	    return __x[0];
	  }
	else if constexpr (_Np == 4)
	  {
	    __x
	      = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
				   __vector_permute<1, 0, 3, 2>(__x._M_data)));
	    __x
	      = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
				   __vector_permute<3, 2, 1, 0>(__x._M_data)));
	    return __x[0];
	  }
	else if constexpr (_Np == 2)
	  {
	    __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
				     __vector_permute<1, 0>(__x._M_data)));
	    return __x[0];
	  }
	else
	  // Partial registers and odd sizes: generic scalar fallback.
	  return _Base::_S_reduce(__x,
				  static_cast<_BinaryOperation&&>(__binary_op));
      }

    // }}}
    // math {{{
    // _S_sqrt {{{
    // Vectorized sqrt.  The vsqrt* instructions exist only in the A64
    // instruction set; otherwise defer to the generic implementation.
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
      {
	if constexpr (__have_neon_a64)
	  {
	    const auto __intrin = __to_intrin(__x);
	    if constexpr (_TVT::template _S_is<float, 2>)
	      return vsqrt_f32(__intrin);
	    else if constexpr (_TVT::template _S_is<float, 4>)
	      return vsqrtq_f32(__intrin);
	    else if constexpr (_TVT::template _S_is<double, 1>)
	      return vsqrt_f64(__intrin);
	    else if constexpr (_TVT::template _S_is<double, 2>)
	      return vsqrtq_f64(__intrin);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
	  return _Base::_S_sqrt(__x);
      }

    // }}}
    // _S_trunc {{{
    // Round toward zero.  With the A32 (ARMv8) vrnd instructions this is a
    // single instruction.  Otherwise, for float, emulate via a
    // float->int->float round trip; the |x| < 0x1p23f select keeps values
    // whose magnitude is >= 2^23 unchanged (they are already integral and
    // would overflow the int conversion).
    template <typename _TW, typename _TVT = _VectorTraits<_TW>>
      _GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
      {
	using _Tp = typename _TVT::value_type;
	if constexpr (__have_neon_a32)
	  {
	    const auto __intrin = __to_intrin(__x);
	    if constexpr (_TVT::template _S_is<float, 2>)
	      return vrnd_f32(__intrin);
	    else if constexpr (_TVT::template _S_is<float, 4>)
	      return vrndq_f32(__intrin);
	    else if constexpr (_TVT::template _S_is<double, 1>)
	      return vrnd_f64(__intrin);
	    else if constexpr (_TVT::template _S_is<double, 2>)
	      return vrndq_f64(__intrin);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else if constexpr (is_same_v<_Tp, float>)
	  {
	    auto __intrin = __to_intrin(__x);
	    if constexpr (sizeof(__x) == 16)
	      __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
	    else
	      __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
	    return _Base::_S_abs(__x)._M_data < 0x1p23f
		     ? __vector_bitcast<float>(__intrin)
		     : __x._M_data;
	  }
	else
	  return _Base::_S_trunc(__x);
      }

    // }}}
    // _S_round {{{
    // Round to nearest, ties away from zero (std::round semantics) — the
    // vrnda* instructions implement exactly that rounding mode.
    template <typename _Tp, size_t _Np>
      _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
      _S_round(_SimdWrapper<_Tp, _Np> __x)
      {
	if constexpr (__have_neon_a32)
	  {
	    const auto __intrin = __to_intrin(__x);
	    if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
	      return vrnda_f32(__intrin);
	    else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
	      return vrndaq_f32(__intrin);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
	      return vrnda_f64(__intrin);
	    else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
	      return vrndaq_f64(__intrin);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
	  return _Base::_S_round(__x);
      }

    // }}}
    // _S_floor {{{
    // Round toward negative infinity (vrndm* = "round to minus infinity").
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
      {
	if constexpr (__have_neon_a32)
	  {
	    const auto __intrin = __to_intrin(__x);
	    if constexpr (_TVT::template _S_is<float, 2>)
	      return vrndm_f32(__intrin);
	    else if constexpr (_TVT::template _S_is<float, 4>)
	      return vrndmq_f32(__intrin);
	    else if constexpr (_TVT::template _S_is<double, 1>)
	      return vrndm_f64(__intrin);
	    else if constexpr (_TVT::template _S_is<double, 2>)
	      return vrndmq_f64(__intrin);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
	  return _Base::_S_floor(__x);
      }

    // }}}
    // _S_ceil {{{
    // Round toward positive infinity (vrndp* = "round to plus infinity").
    template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
      _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
      {
	if constexpr (__have_neon_a32)
	  {
	    const auto __intrin = __to_intrin(__x);
	    if constexpr (_TVT::template _S_is<float, 2>)
	      return vrndp_f32(__intrin);
	    else if constexpr (_TVT::template _S_is<float, 4>)
	      return vrndpq_f32(__intrin);
	    else if constexpr (_TVT::template _S_is<double, 1>)
	      return vrndp_f64(__intrin);
	    else if constexpr (_TVT::template _S_is<double, 2>)
	      return vrndpq_f64(__intrin);
	    else
	      __assert_unreachable<_Tp>();
	  }
	else
	  return _Base::_S_ceil(__x);
      }

    //}}} }}}
  }; // }}}
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  // Convert a vector mask (every lane all-ones or all-zeros) into a
  // bitmask with one bit per lane.  Strategy: AND each lane with its
  // per-lane bit value (1 << lane), then sum all lanes horizontally —
  // vaddv on AArch64, chains of pairwise vpadd on AArch32.  Because the
  // selected bits are disjoint, the sum is exactly the bitmask.
  template <typename _Tp, size_t _Np>
    _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
    _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
    {
      if (__builtin_is_constant_evaluated())
	return _Base::_S_to_bits(__x); // intrinsics are not constexpr

      using _I = __int_for_sizeof_t<_Tp>;
      if constexpr (sizeof(__x) == 16)
	{
	  auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
	  [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
	  // AArch32 vpadd works on 64-bit (D) registers only.
	  [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
	  if constexpr (sizeof(_Tp) == 1)
	    {
	      // 16 x i8 lanes: since an i8 cannot hold bits above 1 << 7,
	      // both halves use bit values 1..128; the pairwise-add tree
	      // keeps the halves in adjacent bytes, so reading the result
	      // as _UShort yields the low half in bits 0-7 and the high
	      // half in bits 8-15.
	      constexpr auto __bitsel
		= __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
		  [&](auto __i) {
		    return static_cast<_I>(
		      __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
		  });
	      __asint &= __bitsel;
#ifdef __aarch64__
	      return __vector_bitcast<_UShort>(
		vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
			  __zero))[0];
#else
	      return __vector_bitcast<_UShort>(
		vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
				  __zero),
			 __zero))[0];
#endif
	    }
	  else if constexpr (sizeof(_Tp) == 2)
	    {
	      // 8 x i16 lanes: bit values 1..128 fit, a single horizontal
	      // sum produces the 8-bit mask.
	      constexpr auto __bitsel
		= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
		  [&](auto __i) {
		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
		  });
	      __asint &= __bitsel;
#ifdef __aarch64__
	      return vaddvq_s16(__asint);
#else
	      return vpadd_s16(
		vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
		__zero)[0];
#endif
	    }
	  else if constexpr (sizeof(_Tp) == 4)
	    {
	      // 4 x i32 lanes.
	      constexpr auto __bitsel
		= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
		  [&](auto __i) {
		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
		  });
	      __asint &= __bitsel;
#ifdef __aarch64__
	      return vaddvq_s32(__asint);
#else
	      return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
			       __zero)[0];
#endif
	    }
	  else if constexpr (sizeof(_Tp) == 8)
	    // 2 x i64 lanes, each 0 or -1: pick bit 0 of lane 0 and bit 1
	    // of lane 1 directly, no horizontal add needed.
	    return (__asint[0] & 1) | (__asint[1] & 2);
	  else
	    __assert_unreachable<_Tp>();
	}
      else if constexpr (sizeof(__x) == 8)
	{
	  // 8-byte (D register) masks: same scheme, half the lanes.
	  auto __asint = __vector_bitcast<_I>(__x);
	  [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
	  if constexpr (sizeof(_Tp) == 1)
	    {
	      constexpr auto __bitsel
		= __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
		  [&](auto __i) {
		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
		  });
	      __asint &= __bitsel;
#ifdef __aarch64__
	      return vaddv_s8(__asint);
#else
	      return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
			      __zero)[0];
#endif
	    }
	  else if constexpr (sizeof(_Tp) == 2)
	    {
	      constexpr auto __bitsel
		= __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
		  [&](auto __i) {
		    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
		  });
	      __asint &= __bitsel;
#ifdef __aarch64__
	      return vaddv_s16(__asint);
#else
	      return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
	    }
	  else if constexpr (sizeof(_Tp) == 4)
	    {
	      // Only two lanes: select bits 0 and 1 and sum.
	      __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
	      return vaddv_s32(__asint);
#else
	      return vpadd_s32(__asint, __zero)[0];
#endif
	    }
	  else
	    __assert_unreachable<_Tp>();
	}
      else
	return _Base::_S_to_bits(__x);
    }
};
// }}}
// _MaskImplNeon {{{
// NEON-specific mask operations; combines the bitmask-extraction mixin
// with the generic builtin mask implementation.
template <typename _Abi>
  struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
  {
    using _MaskImplBuiltinMixin::_S_to_maskvector;
    using _MaskImplNeonMixin::_S_to_bits;
    using _Base = _MaskImplBuiltin<_Abi>;
    using _Base::_S_convert;

    // _S_all_of {{{
    // True iff every active lane of __k is set.  OR-ing with the inverted
    // implicit mask forces inactive (padding) lanes to all-ones; then for
    // a 16-byte mask both 64-bit halves must be -1, i.e. their sum is -2.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
      {
	const auto __kk
	  = __vector_bitcast<char>(__k._M_data)
	      | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
	if constexpr (sizeof(__k) == 16)
	  {
	    const auto __x = __vector_bitcast<long long>(__kk);
	    return __x[0] + __x[1] == -2;
	  }
	else if constexpr (sizeof(__k) <= 8)
	  // Small masks fit in one integer: all-ones means the whole
	  // integer compares equal to -1.
	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
	else
	  __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_any_of {{{
    // True iff at least one active lane of __k is set.
    // NOTE(review): padding lanes are forced to all-ones here as in
    // _S_all_of, so the != 0 test presumably relies on partial 16-byte
    // masks not reaching this implementation — confirm against the ABI
    // dispatch.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
      {
	const auto __kk
	  = __vector_bitcast<char>(__k._M_data)
	      | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
	if constexpr (sizeof(__k) == 16)
	  {
	    const auto __x = __vector_bitcast<long long>(__kk);
	    return (__x[0] | __x[1]) != 0;
	  }
	else if constexpr (sizeof(__k) <= 8)
	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
	else
	  __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_none_of {{{
    // True iff no active lane of __k is set.  _S_masked zeroes the
    // inactive lanes, so the whole vector must compare equal to zero.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
      {
	const auto __kk = _Abi::_S_masked(__k._M_data);
	if constexpr (sizeof(__k) == 16)
	  {
	    const auto __x = __vector_bitcast<long long>(__kk);
	    return (__x[0] | __x[1]) == 0;
	  }
	else if constexpr (sizeof(__k) <= 8)
	  return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
	else
	  __assert_unreachable<_Tp>();
      }

    // }}}
    // _S_some_of {{{
    // True iff some but not all active lanes are set.  With padding lanes
    // forced to all-ones, "none set" makes the integer image of __kk equal
    // to ~implicit_mask and "all set" makes it all-ones; the unsigned
    // __kk + 1 > 1 test is false exactly for the values 0 and -1, i.e.
    // for the none/all cases of a full mask.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
      {
	if constexpr (sizeof(__k) <= 8)
	  {
	    const auto __kk = __vector_bitcast<char>(__k._M_data)
				| ~__vector_bitcast<char>(
				    _Abi::template _S_implicit_mask<_Tp>());
	    using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
	    return __bit_cast<_Up>(__kk) + 1 > 1;
	  }
	else
	  return _Base::_S_some_of(__k);
      }

    // }}}
    // _S_popcount {{{
    // Number of set lanes.  Each set lane reads as -1 in the signed
    // integer view, so the (pairwise-added) sum of all lanes is minus the
    // popcount; negate at the end.  __hi64z yields the high 64-bit half,
    // or zero when the mask is only 8 bytes wide.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
      {
	if constexpr (sizeof(_Tp) == 1)
	  {
	    const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
	    int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
	    return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
			     int8x8_t())[0];
	  }
	else if constexpr (sizeof(_Tp) == 2)
	  {
	    const auto __s16 = __vector_bitcast<short>(__k._M_data);
	    int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
	    return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
	  }
	else if constexpr (sizeof(_Tp) == 4)
	  {
	    const auto __s32 = __vector_bitcast<int>(__k._M_data);
	    int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
	    return -vpadd_s32(__tmp, int32x2_t())[0];
	  }
	else if constexpr (sizeof(_Tp) == 8)
	  {
	    static_assert(sizeof(__k) == 16);
	    // NOTE(review): bitcast to `long` assumes 64-bit long (true on
	    // AArch64); on AArch32, where long is 32 bits, this path is
	    // presumably unreachable for 8-byte elements — confirm.
	    const auto __s64 = __vector_bitcast<long>(__k._M_data);
	    return -(__s64[0] + __s64[1]);
	  }
      }

    // }}}
    // _S_find_first_set {{{
    // Index of the lowest set lane; delegates to the generic bit-scan.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
      {
	// TODO: the _Base implementation is not optimal for NEON
	return _Base::_S_find_first_set(__k);
      }

    // }}}
    // _S_find_last_set {{{
    // Index of the highest set lane; delegates to the generic bit-scan.
    template <typename _Tp>
      _GLIBCXX_SIMD_INTRINSIC static int
      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
      {
	// TODO: the _Base implementation is not optimal for NEON
	return _Base::_S_find_last_set(__k);
      }

    // }}}
  }; // }}}
  493. _GLIBCXX_SIMD_END_NAMESPACE
  494. #endif // __cplusplus >= 201703L
  495. #endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
  496. // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80