| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530 |
- // Simd NEON specific implementations -*- C++ -*-
- // Copyright (C) 2020-2021 Free Software Foundation, Inc.
- //
- // This file is part of the GNU ISO C++ Library. This library is free
- // software; you can redistribute it and/or modify it under the
- // terms of the GNU General Public License as published by the
- // Free Software Foundation; either version 3, or (at your option)
- // any later version.
- // This library is distributed in the hope that it will be useful,
- // but WITHOUT ANY WARRANTY; without even the implied warranty of
- // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- // GNU General Public License for more details.
- // Under Section 7 of GPL version 3, you are granted additional
- // permissions described in the GCC Runtime Library Exception, version
- // 3.1, as published by the Free Software Foundation.
- // You should have received a copy of the GNU General Public License and
- // a copy of the GCC Runtime Library Exception along with this program;
- // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
- // <http://www.gnu.org/licenses/>.
- #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
- #define _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
- #if __cplusplus >= 201703L
- #if !_GLIBCXX_SIMD_HAVE_NEON
- #error "simd_neon.h may only be included when NEON on ARM is available"
- #endif
- _GLIBCXX_SIMD_BEGIN_NAMESPACE
// _CommonImplNeon {{{
// Common (ABI-independent) operations for the NEON backend. NEON offers no
// store variant that improves on the generic builtin implementation, so the
// base class member is simply re-exported.
struct _CommonImplNeon : _CommonImplBuiltin
{
  // _S_store {{{
  // Reuse the builtin-ABI store unchanged; there is no NEON-specific
  // masked/partial store worth specializing here.
  using _CommonImplBuiltin::_S_store;
  // }}}
};
// }}}
// _SimdImplNeon {{{
// simd operations specialized for the NEON instruction set. Everything not
// overridden here falls back to the generic _SimdImplBuiltin implementation.
template <typename _Abi>
struct _SimdImplNeon : _SimdImplBuiltin<_Abi>
{
  using _Base = _SimdImplBuiltin<_Abi>;

  template <typename _Tp>
  using _MaskMember = typename _Base::template _MaskMember<_Tp>;

  // NEON registers are at most 16 bytes wide, so a single store never moves
  // more than 16 bytes.
  template <typename _Tp>
  static constexpr size_t _S_max_store_size = 16;

  // _S_masked_load {{{
  // Load __mem[i] (converted to _Tp) into lane i of __merge for every lane
  // selected by __k; unselected lanes keep their value from __merge.
  // NEON has no masked-load instruction, hence the per-lane loop.
  template <typename _Tp, size_t _Np, typename _Up>
  static inline _SimdWrapper<_Tp, _Np>
  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
                 const _Up* __mem) noexcept
  {
    __execute_n_times<_Np>([&](auto __i) {
      if (__k[__i] != 0)
        __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
    });
    return __merge;
  }

  // }}}
  // _S_masked_store_nocvt {{{
  // Store lane i of __v to __mem[i] for every lane selected by __k, without
  // any value conversion. Per-lane loop for the same reason as above.
  template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC static void
  _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem,
                        _MaskMember<_Tp> __k)
  {
    __execute_n_times<_Np>([&](auto __i) {
      if (__k[__i] != 0)
        __mem[__i] = __v[__i];
    });
  }

  // }}}
  // _S_reduce {{{
  // Horizontal reduction of __x with __binary_op using O(log N) vector
  // permute + op steps instead of a scalar loop.
  template <typename _Tp, typename _BinaryOperation>
  _GLIBCXX_SIMD_INTRINSIC static _Tp
  _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
  {
    constexpr size_t _Np = __x.size();
    if constexpr (sizeof(__x) == 16 && _Np >= 4
                  && !_Abi::template _S_is_partial<_Tp>)
      {
        // Full 128-bit vector: combine the two 64-bit halves, then recurse
        // into the 8-byte-ABI implementation for the remaining steps.
        const auto __halves = split<simd<_Tp, simd_abi::_Neon<8>>>(__x);
        const auto __y = __binary_op(__halves[0], __halves[1]);
        return _SimdImplNeon<simd_abi::_Neon<8>>::_S_reduce(
          __y, static_cast<_BinaryOperation&&>(__binary_op));
      }
    else if constexpr (_Np == 8)
      {
        // Three butterfly steps: adjacent pairs, then pairs of pairs, then
        // the two halves; afterwards lane 0 holds the full reduction.
        __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                 __vector_permute<1, 0, 3, 2, 5, 4, 7, 6>(
                                   __x._M_data)));
        __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                 __vector_permute<3, 2, 1, 0, 7, 6, 5, 4>(
                                   __x._M_data)));
        __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                 __vector_permute<7, 6, 5, 4, 3, 2, 1, 0>(
                                   __x._M_data)));
        return __x[0];
      }
    else if constexpr (_Np == 4)
      {
        // Two butterfly steps for four lanes.
        __x
          = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                               __vector_permute<1, 0, 3, 2>(__x._M_data)));
        __x
          = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                               __vector_permute<3, 2, 1, 0>(__x._M_data)));
        return __x[0];
      }
    else if constexpr (_Np == 2)
      {
        // Single step: combine the two lanes.
        __x = __binary_op(__x, _Base::template _M_make_simd<_Tp, _Np>(
                                 __vector_permute<1, 0>(__x._M_data)));
        return __x[0];
      }
    else
      // Partial ABIs and other sizes: generic implementation.
      return _Base::_S_reduce(__x,
                              static_cast<_BinaryOperation&&>(__binary_op));
  }

  // }}}
  // math {{{
  // _S_sqrt {{{
  // Vector square root. The vsqrt* intrinsics are only available on ARMv8
  // AArch64 (guarded by __have_neon_a64); otherwise fall back to the
  // generic implementation.
  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC static _Tp _S_sqrt(_Tp __x)
  {
    if constexpr (__have_neon_a64)
      {
        const auto __intrin = __to_intrin(__x);
        if constexpr (_TVT::template _S_is<float, 2>)
          return vsqrt_f32(__intrin);
        else if constexpr (_TVT::template _S_is<float, 4>)
          return vsqrtq_f32(__intrin);
        else if constexpr (_TVT::template _S_is<double, 1>)
          return vsqrt_f64(__intrin);
        else if constexpr (_TVT::template _S_is<double, 2>)
          return vsqrtq_f64(__intrin);
        else
          __assert_unreachable<_Tp>();
      }
    else
      return _Base::_S_sqrt(__x);
  }

  // }}}
  // _S_trunc {{{
  // Round toward zero. Uses the vrnd* intrinsics where available (ARMv8,
  // __have_neon_a32); otherwise emulates truncation for float via a
  // round-trip through int32.
  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
  _GLIBCXX_SIMD_INTRINSIC static _TW _S_trunc(_TW __x)
  {
    using _Tp = typename _TVT::value_type;
    if constexpr (__have_neon_a32)
      {
        const auto __intrin = __to_intrin(__x);
        if constexpr (_TVT::template _S_is<float, 2>)
          return vrnd_f32(__intrin);
        else if constexpr (_TVT::template _S_is<float, 4>)
          return vrndq_f32(__intrin);
        else if constexpr (_TVT::template _S_is<double, 1>)
          return vrnd_f64(__intrin);
        else if constexpr (_TVT::template _S_is<double, 2>)
          return vrndq_f64(__intrin);
        else
          __assert_unreachable<_Tp>();
      }
    else if constexpr (is_same_v<_Tp, float>)
      {
        // float -> int32 -> float truncates the fractional part. The
        // round-trip is only valid for |x| < 2^23; beyond that a float has
        // no fractional bits anyway, so the lane-wise (GNU vector ternary)
        // blend below keeps the original value there.
        auto __intrin = __to_intrin(__x);
        if constexpr (sizeof(__x) == 16)
          __intrin = vcvtq_f32_s32(vcvtq_s32_f32(__intrin));
        else
          __intrin = vcvt_f32_s32(vcvt_s32_f32(__intrin));
        return _Base::_S_abs(__x)._M_data < 0x1p23f
                 ? __vector_bitcast<float>(__intrin)
                 : __x._M_data;
      }
    else
      return _Base::_S_trunc(__x);
  }

  // }}}
  // _S_round {{{
  // Round to nearest, ties away from zero (std::round semantics), via the
  // vrnda* intrinsics (ARMv8). Dispatches on element/vector size instead of
  // _S_is because the argument arrives as a _SimdWrapper here.
  template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
  _S_round(_SimdWrapper<_Tp, _Np> __x)
  {
    if constexpr (__have_neon_a32)
      {
        const auto __intrin = __to_intrin(__x);
        if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 8)
          return vrnda_f32(__intrin);
        else if constexpr (sizeof(_Tp) == 4 && sizeof(__x) == 16)
          return vrndaq_f32(__intrin);
        else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 8)
          return vrnda_f64(__intrin);
        else if constexpr (sizeof(_Tp) == 8 && sizeof(__x) == 16)
          return vrndaq_f64(__intrin);
        else
          __assert_unreachable<_Tp>();
      }
    else
      return _Base::_S_round(__x);
  }

  // }}}
  // _S_floor {{{
  // Round toward negative infinity via the vrndm* intrinsics (ARMv8).
  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC static _Tp _S_floor(_Tp __x)
  {
    if constexpr (__have_neon_a32)
      {
        const auto __intrin = __to_intrin(__x);
        if constexpr (_TVT::template _S_is<float, 2>)
          return vrndm_f32(__intrin);
        else if constexpr (_TVT::template _S_is<float, 4>)
          return vrndmq_f32(__intrin);
        else if constexpr (_TVT::template _S_is<double, 1>)
          return vrndm_f64(__intrin);
        else if constexpr (_TVT::template _S_is<double, 2>)
          return vrndmq_f64(__intrin);
        else
          __assert_unreachable<_Tp>();
      }
    else
      return _Base::_S_floor(__x);
  }

  // }}}
  // _S_ceil {{{
  // Round toward positive infinity via the vrndp* intrinsics (ARMv8).
  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
  _GLIBCXX_SIMD_INTRINSIC static _Tp _S_ceil(_Tp __x)
  {
    if constexpr (__have_neon_a32)
      {
        const auto __intrin = __to_intrin(__x);
        if constexpr (_TVT::template _S_is<float, 2>)
          return vrndp_f32(__intrin);
        else if constexpr (_TVT::template _S_is<float, 4>)
          return vrndpq_f32(__intrin);
        else if constexpr (_TVT::template _S_is<double, 1>)
          return vrndp_f64(__intrin);
        else if constexpr (_TVT::template _S_is<double, 2>)
          return vrndpq_f64(__intrin);
        else
          __assert_unreachable<_Tp>();
      }
    else
      return _Base::_S_ceil(__x);
  }

  //}}} }}}
}; // }}}
// _MaskImplNeonMixin {{{
struct _MaskImplNeonMixin
{
  using _Base = _MaskImplBuiltinMixin;

  // Convert a vector mask (each lane all-ones or all-zeros) into a bitmask
  // with one bit per lane. Strategy: AND each lane with a per-lane power of
  // two ("bit selector"), then sum all lanes horizontally — on AArch64 with
  // a single across-vector add (vaddv*), on AArch32 with a chain of
  // pairwise adds (vpadd*).
  template <typename _Tp, size_t _Np>
  _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
  _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
  {
    // NEON intrinsics cannot be used in constant evaluation; use the
    // generic bit-by-bit implementation there.
    if (__builtin_is_constant_evaluated())
      return _Base::_S_to_bits(__x);

    using _I = __int_for_sizeof_t<_Tp>;
    if constexpr (sizeof(__x) == 16)
      {
        auto __asint = __vector_bitcast<_I>(__x);
#ifdef __aarch64__
        [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
#else
        // AArch32 vpadd* operates on 64-bit vectors, so the zero operand
        // must be the half-width type.
        [[maybe_unused]] constexpr auto __zero = decltype(__lo64(__asint))();
#endif
        if constexpr (sizeof(_Tp) == 1)
          {
            // 16 lanes but only 8 distinct bit positions per byte: the low
            // and high 8 lanes reuse positions 0..7. The pairwise-add tree
            // keeps the two groups in separate result bytes, which the
            // final bitcast to _UShort reassembles into a 16-bit mask.
            constexpr auto __bitsel
              = __generate_from_n_evaluations<16, __vector_type_t<_I, 16>>(
                [&](auto __i) {
                  return static_cast<_I>(
                    __i < _Np ? (__i < 8 ? 1 << __i : 1 << (__i - 8)) : 0);
                });
            __asint &= __bitsel;
#ifdef __aarch64__
            return __vector_bitcast<_UShort>(
              vpaddq_s8(vpaddq_s8(vpaddq_s8(__asint, __zero), __zero),
                        __zero))[0];
#else
            return __vector_bitcast<_UShort>(
              vpadd_s8(vpadd_s8(vpadd_s8(__lo64(__asint), __hi64(__asint)),
                                __zero),
                       __zero))[0];
#endif
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            // 8 lanes -> bits 0..7 (lanes beyond _Np select 0).
            constexpr auto __bitsel
              = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                [&](auto __i) {
                  return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                });
            __asint &= __bitsel;
#ifdef __aarch64__
            return vaddvq_s16(__asint);
#else
            return vpadd_s16(
              vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
              __zero)[0];
#endif
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            // 4 lanes -> bits 0..3.
            constexpr auto __bitsel
              = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                [&](auto __i) {
                  return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                });
            __asint &= __bitsel;
#ifdef __aarch64__
            return vaddvq_s32(__asint);
#else
            return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                             __zero)[0];
#endif
          }
        else if constexpr (sizeof(_Tp) == 8)
          // Two 64-bit lanes: bit 0 from lane 0, bit 1 from lane 1.
          return (__asint[0] & 1) | (__asint[1] & 2);
        else
          __assert_unreachable<_Tp>();
      }
    else if constexpr (sizeof(__x) == 8)
      {
        // 64-bit vectors: same bit-selector scheme, single-width vpadd/vaddv.
        auto __asint = __vector_bitcast<_I>(__x);
        [[maybe_unused]] constexpr auto __zero = decltype(__asint)();
        if constexpr (sizeof(_Tp) == 1)
          {
            constexpr auto __bitsel
              = __generate_from_n_evaluations<8, __vector_type_t<_I, 8>>(
                [&](auto __i) {
                  return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                });
            __asint &= __bitsel;
#ifdef __aarch64__
            return vaddv_s8(__asint);
#else
            return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                            __zero)[0];
#endif
          }
        else if constexpr (sizeof(_Tp) == 2)
          {
            constexpr auto __bitsel
              = __generate_from_n_evaluations<4, __vector_type_t<_I, 4>>(
                [&](auto __i) {
                  return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                });
            __asint &= __bitsel;
#ifdef __aarch64__
            return vaddv_s16(__asint);
#else
            return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
#endif
          }
        else if constexpr (sizeof(_Tp) == 4)
          {
            // Only two lanes possible here; selectors are 1 and 2.
            __asint &= __make_vector<_I>(0x1, 0x2);
#ifdef __aarch64__
            return vaddv_s32(__asint);
#else
            return vpadd_s32(__asint, __zero)[0];
#endif
          }
        else
          __assert_unreachable<_Tp>();
      }
    else
      return _Base::_S_to_bits(__x);
  }
};
// }}}
- // _MaskImplNeon {{{
- template <typename _Abi>
- struct _MaskImplNeon : _MaskImplNeonMixin, _MaskImplBuiltin<_Abi>
- {
- using _MaskImplBuiltinMixin::_S_to_maskvector;
- using _MaskImplNeonMixin::_S_to_bits;
- using _Base = _MaskImplBuiltin<_Abi>;
- using _Base::_S_convert;
- // _S_all_of {{{
- template <typename _Tp>
- _GLIBCXX_SIMD_INTRINSIC static bool _S_all_of(simd_mask<_Tp, _Abi> __k)
- {
- const auto __kk
- = __vector_bitcast<char>(__k._M_data)
- | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
- if constexpr (sizeof(__k) == 16)
- {
- const auto __x = __vector_bitcast<long long>(__kk);
- return __x[0] + __x[1] == -2;
- }
- else if constexpr (sizeof(__k) <= 8)
- return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == -1;
- else
- __assert_unreachable<_Tp>();
- }
- // }}}
- // _S_any_of {{{
- template <typename _Tp>
- _GLIBCXX_SIMD_INTRINSIC static bool _S_any_of(simd_mask<_Tp, _Abi> __k)
- {
- const auto __kk
- = __vector_bitcast<char>(__k._M_data)
- | ~__vector_bitcast<char>(_Abi::template _S_implicit_mask<_Tp>());
- if constexpr (sizeof(__k) == 16)
- {
- const auto __x = __vector_bitcast<long long>(__kk);
- return (__x[0] | __x[1]) != 0;
- }
- else if constexpr (sizeof(__k) <= 8)
- return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) != 0;
- else
- __assert_unreachable<_Tp>();
- }
- // }}}
- // _S_none_of {{{
- template <typename _Tp>
- _GLIBCXX_SIMD_INTRINSIC static bool _S_none_of(simd_mask<_Tp, _Abi> __k)
- {
- const auto __kk = _Abi::_S_masked(__k._M_data);
- if constexpr (sizeof(__k) == 16)
- {
- const auto __x = __vector_bitcast<long long>(__kk);
- return (__x[0] | __x[1]) == 0;
- }
- else if constexpr (sizeof(__k) <= 8)
- return __bit_cast<__int_for_sizeof_t<decltype(__kk)>>(__kk) == 0;
- else
- __assert_unreachable<_Tp>();
- }
- // }}}
- // _S_some_of {{{
- template <typename _Tp>
- _GLIBCXX_SIMD_INTRINSIC static bool _S_some_of(simd_mask<_Tp, _Abi> __k)
- {
- if constexpr (sizeof(__k) <= 8)
- {
- const auto __kk = __vector_bitcast<char>(__k._M_data)
- | ~__vector_bitcast<char>(
- _Abi::template _S_implicit_mask<_Tp>());
- using _Up = make_unsigned_t<__int_for_sizeof_t<decltype(__kk)>>;
- return __bit_cast<_Up>(__kk) + 1 > 1;
- }
- else
- return _Base::_S_some_of(__k);
- }
- // }}}
- // _S_popcount {{{
- template <typename _Tp>
- _GLIBCXX_SIMD_INTRINSIC static int _S_popcount(simd_mask<_Tp, _Abi> __k)
- {
- if constexpr (sizeof(_Tp) == 1)
- {
- const auto __s8 = __vector_bitcast<_SChar>(__k._M_data);
- int8x8_t __tmp = __lo64(__s8) + __hi64z(__s8);
- return -vpadd_s8(vpadd_s8(vpadd_s8(__tmp, int8x8_t()), int8x8_t()),
- int8x8_t())[0];
- }
- else if constexpr (sizeof(_Tp) == 2)
- {
- const auto __s16 = __vector_bitcast<short>(__k._M_data);
- int16x4_t __tmp = __lo64(__s16) + __hi64z(__s16);
- return -vpadd_s16(vpadd_s16(__tmp, int16x4_t()), int16x4_t())[0];
- }
- else if constexpr (sizeof(_Tp) == 4)
- {
- const auto __s32 = __vector_bitcast<int>(__k._M_data);
- int32x2_t __tmp = __lo64(__s32) + __hi64z(__s32);
- return -vpadd_s32(__tmp, int32x2_t())[0];
- }
- else if constexpr (sizeof(_Tp) == 8)
- {
- static_assert(sizeof(__k) == 16);
- const auto __s64 = __vector_bitcast<long>(__k._M_data);
- return -(__s64[0] + __s64[1]);
- }
- }
- // }}}
- // _S_find_first_set {{{
- template <typename _Tp>
- _GLIBCXX_SIMD_INTRINSIC static int
- _S_find_first_set(simd_mask<_Tp, _Abi> __k)
- {
- // TODO: the _Base implementation is not optimal for NEON
- return _Base::_S_find_first_set(__k);
- }
- // }}}
- // _S_find_last_set {{{
- template <typename _Tp>
- _GLIBCXX_SIMD_INTRINSIC static int
- _S_find_last_set(simd_mask<_Tp, _Abi> __k)
- {
- // TODO: the _Base implementation is not optimal for NEON
- return _Base::_S_find_last_set(__k);
- }
- // }}}
- }; // }}}
- _GLIBCXX_SIMD_END_NAMESPACE
- #endif // __cplusplus >= 201703L
- #endif // _GLIBCXX_EXPERIMENTAL_SIMD_NEON_H_
- // vim: foldmethod=marker sw=2 noet ts=8 sts=2 tw=80
|