- // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
- //
- // Licensed under the Apache License, Version 2.0 (the "License");
- // you may not use this file except in compliance with the License.
- // You may obtain a copy of the License at
- //
- // http://www.apache.org/licenses/LICENSE-2.0
- //
- // Unless required by applicable law or agreed to in writing, software
- // distributed under the License is distributed on an "AS IS" BASIS,
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- // See the License for the specific language governing permissions and
- // limitations under the License.
- // fixedpoint_neon.h: optimized NEON specializations of the templates
- // in fixedpoint.h.
- #ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
- #define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
- #include <arm_neon.h>
- namespace gemmlowp {
// Raw-type traits for the NEON vector types: tell the generic FixedPoint
// machinery what scalar type one lane holds and how many lanes there are.
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  using ScalarRawType = std::int32_t;
  static constexpr int kLanes = 4;
};
template <>
struct FixedPointRawTypeTraits<int16x8_t> {
  using ScalarRawType = std::int16_t;
  static constexpr int kLanes = 8;
};
// BitAnd: lane-wise bitwise AND of two raw vectors.
template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  const int32x4_t anded = vandq_s32(a, b);
  return anded;
}
template <>
inline int16x8_t BitAnd(int16x8_t a, int16x8_t b) {
  const int16x8_t anded = vandq_s16(a, b);
  return anded;
}
// BitOr: lane-wise bitwise OR of two raw vectors.
template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  const int32x4_t ored = vorrq_s32(a, b);
  return ored;
}
template <>
inline int16x8_t BitOr(int16x8_t a, int16x8_t b) {
  const int16x8_t ored = vorrq_s16(a, b);
  return ored;
}
// BitXor: lane-wise bitwise XOR of two raw vectors.
template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  const int32x4_t xored = veorq_s32(a, b);
  return xored;
}
template <>
inline int16x8_t BitXor(int16x8_t a, int16x8_t b) {
  const int16x8_t xored = veorq_s16(a, b);
  return xored;
}
// BitNot: lane-wise bitwise complement.
//
// Uses the dedicated NEON bitwise-NOT intrinsic (MVN) instead of
// XOR-ing with an all-ones constant; the result is identical but no
// constant vector needs to be materialized.
template <>
inline int32x4_t BitNot(int32x4_t a) {
  return vmvnq_s32(a);
}
template <>
inline int16x8_t BitNot(int16x8_t a) {
  return vmvnq_s16(a);
}
// Add: lane-wise wrapping integer addition.
template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  const int32x4_t sum = vaddq_s32(a, b);
  return sum;
}
template <>
inline int16x8_t Add(int16x8_t a, int16x8_t b) {
  const int16x8_t sum = vaddq_s16(a, b);
  return sum;
}
// Sub: lane-wise wrapping integer subtraction.
template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  const int32x4_t diff = vsubq_s32(a, b);
  return diff;
}
template <>
inline int16x8_t Sub(int16x8_t a, int16x8_t b) {
  const int16x8_t diff = vsubq_s16(a, b);
  return diff;
}
// Neg: lane-wise integer negation.
template <>
inline int32x4_t Neg(int32x4_t a) {
  const int32x4_t negated = vnegq_s32(a);
  return negated;
}
template <>
inline int16x8_t Neg(int16x8_t a) {
  const int16x8_t negated = vnegq_s16(a);
  return negated;
}
// ShiftLeft: lane-wise left shift by a runtime scalar amount.
// VSHL only accepts a per-lane shift vector, so the scalar amount is
// broadcast first.
template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  const int32x4_t shift_amounts = vdupq_n_s32(offset);
  return vshlq_s32(a, shift_amounts);
}
template <>
inline int16x8_t ShiftLeft(int16x8_t a, int offset) {
  const int16x8_t shift_amounts = vdupq_n_s16(offset);
  return vshlq_s16(a, shift_amounts);
}
// ShiftLeft: lane-wise left shift with an independent per-lane amount;
// maps directly onto VSHL.
template <>
inline int32x4_t ShiftLeft(int32x4_t a, int32x4_t offset) {
  const int32x4_t shifted = vshlq_s32(a, offset);
  return shifted;
}
template <>
inline int16x8_t ShiftLeft(int16x8_t a, int16x8_t offset) {
  const int16x8_t shifted = vshlq_s16(a, offset);
  return shifted;
}
// ShiftRight: lane-wise arithmetic right shift by a runtime amount.
// NEON has no variable right shift, so VSHL with a negated (broadcast)
// count is used instead.
template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  const int32x4_t neg_shift = vdupq_n_s32(-offset);
  return vshlq_s32(a, neg_shift);
}
template <>
inline int16x8_t ShiftRight(int16x8_t a, int offset) {
  const int16x8_t neg_shift = vdupq_n_s16(-offset);
  return vshlq_s16(a, neg_shift);
}
// SelectUsingMask: bit-level select (VBSL) — each result bit comes from
// then_val where the corresponding if_mask bit is 1, else from else_val.
// Intended for masks whose lanes are all-ones or all-zeros.
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  const uint32x4_t bit_mask = vreinterpretq_u32_s32(if_mask);
  return vbslq_s32(bit_mask, then_val, else_val);
}
template <>
inline int16x8_t SelectUsingMask(int16x8_t if_mask, int16x8_t then_val,
                                 int16x8_t else_val) {
  const uint16x8_t bit_mask = vreinterpretq_u16_s16(if_mask);
  return vbslq_s16(bit_mask, then_val, else_val);
}
// MaskIfEqual: all-ones in lanes where a == b, all-zeros elsewhere.
// The comparison yields an unsigned mask, reinterpreted back to signed.
template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  const uint32x4_t eq = vceqq_s32(a, b);
  return vreinterpretq_s32_u32(eq);
}
template <>
inline int16x8_t MaskIfEqual(int16x8_t a, int16x8_t b) {
  const uint16x8_t eq = vceqq_s16(a, b);
  return vreinterpretq_s16_u16(eq);
}
// MaskIfNotEqual: all-ones in lanes where a != b — the complement of the
// equality mask.
template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  const int32x4_t eq_mask = MaskIfEqual(a, b);
  return BitNot(eq_mask);
}
template <>
inline int16x8_t MaskIfNotEqual(int16x8_t a, int16x8_t b) {
  const int16x8_t eq_mask = MaskIfEqual(a, b);
  return BitNot(eq_mask);
}
// MaskIfZero: all-ones in lanes where the value is zero.
template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  const int32x4_t zero = vdupq_n_s32(0);
  return MaskIfEqual(a, zero);
}
template <>
inline int16x8_t MaskIfZero(int16x8_t a) {
  const int16x8_t zero = vdupq_n_s16(0);
  return MaskIfEqual(a, zero);
}
// MaskIfNonZero: all-ones in lanes where the value is nonzero.
template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  // VTST sets a lane to all-ones iff (a & a) != 0, i.e. iff a != 0 —
  // a single instruction, no zero constant needed.
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}
template <>
inline int16x8_t MaskIfNonZero(int16x8_t a) {
  // Same VTST trick for the 16-bit lanes.
  return vreinterpretq_s16_u16(vtstq_s16(a, a));
}
// MaskIfGreaterThan: all-ones in lanes where a > b (signed compare).
template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  const uint32x4_t gt = vcgtq_s32(a, b);
  return vreinterpretq_s32_u32(gt);
}
template <>
inline int16x8_t MaskIfGreaterThan(int16x8_t a, int16x8_t b) {
  const uint16x8_t gt = vcgtq_s16(a, b);
  return vreinterpretq_s16_u16(gt);
}
// MaskIfGreaterThanOrEqual: all-ones in lanes where a >= b (signed).
template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  const uint32x4_t ge = vcgeq_s32(a, b);
  return vreinterpretq_s32_u32(ge);
}
template <>
inline int16x8_t MaskIfGreaterThanOrEqual(int16x8_t a, int16x8_t b) {
  const uint16x8_t ge = vcgeq_s16(a, b);
  return vreinterpretq_s16_u16(ge);
}
// MaskIfLessThan: all-ones in lanes where a < b (signed compare).
template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  const uint32x4_t lt = vcltq_s32(a, b);
  return vreinterpretq_s32_u32(lt);
}
template <>
inline int16x8_t MaskIfLessThan(int16x8_t a, int16x8_t b) {
  const uint16x8_t lt = vcltq_s16(a, b);
  return vreinterpretq_s16_u16(lt);
}
// MaskIfLessThanOrEqual: all-ones in lanes where a <= b (signed).
template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  const uint32x4_t le = vcleq_s32(a, b);
  return vreinterpretq_s32_u32(le);
}
template <>
inline int16x8_t MaskIfLessThanOrEqual(int16x8_t a, int16x8_t b) {
  const uint16x8_t le = vcleq_s16(a, b);
  return vreinterpretq_s16_u16(le);
}
// All: reduces a per-lane mask vector to a single bool.
// Intended for NEON comparison masks (each lane all-zeros or all-ones);
// NOTE(review): for arbitrary non-mask inputs the lane-wise AND below can
// be zero even when every lane is nonzero, so only masks make sense here.
template <>
inline bool All(int32x4_t a) {
  // AND-reduction in log2(lanes) steps: VEXT rotates the vector, so after
  // the rotate-by-1 and rotate-by-2 rounds, lane 0 holds the AND of all
  // four original lanes.
  a = vandq_s32(a, vextq_s32(a, a, 1));
  a = vandq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}
template <>
inline bool All(int16x8_t a) {
  // Same AND-reduction; eight lanes need three rounds (1, 2, 4).
  a = vandq_s16(a, vextq_s16(a, a, 1));
  a = vandq_s16(a, vextq_s16(a, a, 2));
  a = vandq_s16(a, vextq_s16(a, a, 4));
  return vgetq_lane_s16(a, 0);
}
// Any: true iff at least one lane has any bit set.
template <>
inline bool Any(int32x4_t a) {
  // OR-reduction in log2(lanes) steps mirroring All() above: after the
  // rotate-by-1 and rotate-by-2 rounds, lane 0 holds the OR of all lanes.
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}
template <>
inline bool Any(int16x8_t a) {
  // Same OR-reduction; eight lanes need three rounds (1, 2, 4).
  a = vorrq_s16(a, vextq_s16(a, a, 1));
  a = vorrq_s16(a, vextq_s16(a, a, 2));
  a = vorrq_s16(a, vextq_s16(a, a, 4));
  return vgetq_lane_s16(a, 0);
}
// RoundingHalfSum: lane-wise rounding average, (a + b + 1) >> 1, computed
// without intermediate overflow by the VRHADD instruction.
template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  const int32x4_t rounded_avg = vrhaddq_s32(a, b);
  return rounded_avg;
}
template <>
inline int16x8_t RoundingHalfSum(int16x8_t a, int16x8_t b) {
  const int16x8_t rounded_avg = vrhaddq_s16(a, b);
  return rounded_avg;
}
// SaturatingRoundingDoublingHighMul: the fixed-point multiplication
// primitive — saturating, rounding, doubling multiply returning the high
// half, which maps one-to-one onto VQRDMULH.
template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  const int32x4_t high_half = vqrdmulhq_s32(a, b);
  return high_half;
}
template <>
inline int16x8_t SaturatingRoundingDoublingHighMul(int16x8_t a, int16x8_t b) {
  const int16x8_t high_half = vqrdmulhq_s16(a, b);
  return high_half;
}
// RoundingDivideByPOT: rounding signed division by 2^exponent.
// VRSHL with a negative count performs a rounding right shift, and the
// "fixup" term adjusts negative inputs beforehand so that the rounding
// matches the scalar reference implementation in fixedpoint.h.
template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  const int32x4_t shift_vec = vdupq_n_s32(-exponent);
  // For exponent > 0, shift_vec (= -exponent) has its sign bit set, so
  // x & shift_vec preserves x's sign bit; shifting that right by 31
  // yields -1 in lanes where x is negative and 0 elsewhere.
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  // Saturating add nudges negative lanes down by one before the rounding
  // shift (VQADD avoids wraparound at INT32_MIN).
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}
// Same scheme for 8 lanes of int16 (sign bit is bit 15).
template <>
inline int16x8_t RoundingDivideByPOT(int16x8_t x, int exponent) {
  const int16x8_t shift_vec = vdupq_n_s16(-exponent);
  const int16x8_t fixup = vshrq_n_s16(vandq_s16(x, shift_vec), 15);
  const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);
  return vrshlq_s16(fixed_up_x, shift_vec);
}
// RoundingDivideByPOT, per-lane exponent variant: each lane of x is
// divided (with rounding) by 2^(its own lane of exponent).  Same fixup
// scheme as the scalar-exponent overload above.
template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int32x4_t exponent) {
  const int32x4_t shift_vec = vnegq_s32(exponent);
  // -1 in lanes where (x & shift_vec) is negative, 0 elsewhere; nudges
  // those lanes down by one so the rounding shift matches the scalar
  // reference.
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}
template <>
inline int16x8_t RoundingDivideByPOT(int16x8_t x, int16x8_t exponent) {
  const int16x8_t shift_vec = vnegq_s16(exponent);
  const int16x8_t fixup = vshrq_n_s16(vandq_s16(x, shift_vec), 15);
  const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);
  return vrshlq_s16(fixed_up_x, shift_vec);
}
// Specializations of multiply-by-2^Exponent where Exponent is a
// compile-time constant.  The third template argument is the sign of
// Exponent: +1 selects a saturating left shift, -1 a rounding right
// shift.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  // Saturating left shift by immediate: a single VQSHL.
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    // fixup is -1 in lanes where x is negative, 0 elsewhere; the
    // saturating add nudges negative lanes down by one so that the
    // rounding right shift (VRSHR) matches the scalar reference
    // rounding.
    const int32x4_t fixup = vshrq_n_s32(x, 31);
    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
    return vrshrq_n_s32(fixed_up_x, -Exponent);
  }
};
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, 1> {
  // 16-bit variant of the saturating left shift.
  static int16x8_t eval(int16x8_t x) { return vqshlq_n_s16(x, Exponent); }
};
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int16x8_t, -1> {
  static int16x8_t eval(int16x8_t x) {
    // Same negative-lane fixup as the int32 case (sign bit is bit 15).
    const int16x8_t fixup = vshrq_n_s16(x, 15);
    const int16x8_t fixed_up_x = vqaddq_s16(x, fixup);
    return vrshrq_n_s16(fixed_up_x, -Exponent);
  }
};
// Dup: broadcast a single scalar into every lane of the vector.
template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  const int32x4_t broadcast = vdupq_n_s32(x);
  return broadcast;
}
template <>
inline int16x8_t Dup<int16x8_t>(std::int16_t x) {
  const int16x8_t broadcast = vdupq_n_s16(x);
  return broadcast;
}
// So far this is only needed for int16.
// SaturatingAdd: lane-wise addition saturating at the int16 range (VQADD).
template <>
inline int16x8_t SaturatingAdd(int16x8_t a, int16x8_t b) {
  const int16x8_t saturated_sum = vqaddq_s16(a, b);
  return saturated_sum;
}
- } // end namespace gemmlowp
- #endif // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_