| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395 |
- /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==============================================================================*/
- #include "tensorflow/lite/kernels/internal/quantization_util.h"
- #include <algorithm>
- #include <cmath>
- #include <limits>
- #include "tensorflow/lite/kernels/internal/compatibility.h"
- #include "tensorflow/lite/kernels/internal/cppmath.h"
- namespace tflite {
namespace {
// These constants are used to manipulate the binary representation of doubles.
// Double-precision binary64 floating point format is:
// Bit |  63  |  62-52   |   51-0   |
//     | Sign | Exponent | Fraction |
// To avoid 64-bit integers as much as possible, I break this into high and
// low 32-bit chunks. High is:
// Bit |  31  |  30-20   |      19-0     |
//     | Sign | Exponent | High Fraction |
// Low is:
// Bit |      31-0     |
//     |  Low Fraction |
// We then access the components through logical bit-wise operations to
// extract the parts needed, with the positions and masks derived from the
// layout shown above.
// Bit 63: sign of the double.
constexpr uint64_t kSignMask = 0x8000000000000000LL;
// Bits 62-52: the 11-bit biased exponent field.
constexpr uint64_t kExponentMask = 0x7ff0000000000000LL;
// Bit position of the exponent field within the 64-bit word.
constexpr int32_t kExponentShift = 52;
// binary64 exponent bias: stored exponent == true exponent + 1023.
constexpr int32_t kExponentBias = 1023;
// An all-ones exponent field marks a NaN or an infinity.
constexpr uint32_t kExponentIsBadNum = 0x7ff;
// The top 30 of the 52 fraction bits (bits 51-22) -- the portion that
// IntegerFrExp() keeps.
constexpr uint64_t kFractionMask = 0x000fffffffc00000LL;
// How far the kept fraction bits are shifted down.
constexpr uint32_t kFractionShift = 22;
// The 22 low fraction bits that IntegerFrExp() discards.
constexpr uint32_t kFractionRoundingMask = 0x003fffff;
// Halfway point of the discarded bits: anything above this rounds up.
constexpr uint32_t kFractionRoundingThreshold = 0x00200000;
}  // namespace
// Decomposes double_multiplier into a 32-bit fixed-point significand
// (*quantized_multiplier) and a base-2 exponent (*shift) such that
//   double_multiplier ~= *quantized_multiplier * 2^(*shift) / 2^31.
// For non-zero inputs the significand magnitude is normalized into
// [2^30, 2^31). A zero input, or a magnitude below roughly 2^-31, yields
// (0, 0).
void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
                        int* shift) {
  if (double_multiplier == 0.) {
    *quantized_multiplier = 0;
    *shift = 0;
    return;
  }
#ifdef TFLITE_EMULATE_FLOAT
  // If we're trying to avoid the use of floating-point instructions (for
  // example on microcontrollers) then use an alternative implementation
  // that only requires integer and bitwise operations. To enable this, you
  // need to set the define during the build process for your platform.
  int64_t q_fixed = IntegerFrExp(double_multiplier, shift);
#else   // TFLITE_EMULATE_FLOAT
  const double q = std::frexp(double_multiplier, shift);
  auto q_fixed = static_cast<int64_t>(TfLiteRound(q * (1ll << 31)));
#endif  // TFLITE_EMULATE_FLOAT
  TFLITE_CHECK(q_fixed <= (1ll << 31));
  // frexp returns a fraction in [0.5, 1.0) for positive inputs; rounding can
  // push q_fixed to exactly 2^31, which does not fit in int32, so renormalize
  // by halving the significand and bumping the exponent.
  if (q_fixed == (1ll << 31)) {
    q_fixed /= 2;
    ++*shift;
  }
  TFLITE_CHECK_LE(q_fixed, std::numeric_limits<int32_t>::max());
  // A shift amount smaller than -31 would cause all bits to be shifted out
  // and thus all results would be zero. We implement that instead with
  // q_fixed==0, so as to avoid hitting issues with right-shift
  // operations with shift amounts greater than 31. Note that this happens
  // roughly when abs(double_multiplier) < 2^-31 and the present handling means
  // that we're effectively flushing tiny double_multiplier's to zero.
  // We could conceivably handle values in the range (roughly) [32, 63]
  // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view
  // the present handling is just doing 'flush denormals to zero'. We could
  // reconsider and actually generate nonzero denormals if a need arises.
  if (*shift < -31) {
    *shift = 0;
    q_fixed = 0;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
- void QuantizeMultiplierGreaterThanOne(double double_multiplier,
- int32_t* quantized_multiplier,
- int* left_shift) {
- TFLITE_CHECK_GT(double_multiplier, 1.);
- QuantizeMultiplier(double_multiplier, quantized_multiplier, left_shift);
- TFLITE_CHECK_GE(*left_shift, 0);
- }
- void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
- int32_t* quantized_multiplier,
- int* left_shift) {
- TFLITE_CHECK_LT(double_multiplier, 1.);
- TFLITE_CHECK_GT(double_multiplier, 0.);
- int shift;
- QuantizeMultiplier(double_multiplier, quantized_multiplier, &shift);
- TFLITE_CHECK_LE(shift, 0);
- *left_shift = shift;
- }
// Integer-only replacement for std::frexp(): decomposes `input` into an
// integer fraction and a base-2 exponent using only bit operations on the
// IEEE 754 binary64 representation. The returned fraction is scaled by 2^31,
// so for a finite non-zero input
//   input ~= (result / 2^31) * 2^(*shift),
// with |result| in [2^30, 2^31), mirroring frexp's [0.5, 1.0) fraction range.
// Special cases: zero -> (0, *shift=0); NaN -> (0, *shift=INT_MAX);
// +/-infinity -> (INT64_MAX or INT64_MIN, *shift=INT_MAX).
int64_t IntegerFrExp(double input, int* shift) {
  // Make sure our assumptions about the double layout hold.
  TFLITE_CHECK_EQ(8, sizeof(double));
  // We want to access the bits of the input double value directly, which is
  // tricky to do safely, so use a union to handle the casting.
  union {
    double double_value;
    uint64_t double_as_uint;
  } cast_union;
  cast_union.double_value = input;
  const uint64_t u = cast_union.double_as_uint;
  // If the bitfield is all zeros apart from the sign bit, this is a normalized
  // zero value, so return standard values for this special case.
  if ((u & ~kSignMask) == 0) {
    *shift = 0;
    return 0;
  }
  // Deal with NaNs and Infs, which are always indicated with a fixed pattern in
  // the exponent, and distinguished by whether the fractions are zero or
  // non-zero.
  const uint32_t exponent_part = ((u & kExponentMask) >> kExponentShift);
  if (exponent_part == kExponentIsBadNum) {
    *shift = std::numeric_limits<int>::max();
    if (u & kFractionMask) {
      // NaN, so just return zero (with the exponent set to INT_MAX).
      return 0;
    } else {
      // Infinity, so return +/- INT_MAX.
      if (u & kSignMask) {
        return std::numeric_limits<int64_t>::min();
      } else {
        return std::numeric_limits<int64_t>::max();
      }
    }
  }
  // The shift is fairly easy to extract from the high bits of the double value,
  // just by masking it out and applying a bias. The std::frexp() implementation
  // always returns values between 0.5 and 1.0 though, whereas the exponent
  // assumes 1.0 to 2.0 is the standard range, so I add on one to match that
  // interface.
  // NOTE(review): subnormal inputs (exponent field == 0 with a non-zero
  // fraction) fall through to this normal-number path, and the unsigned
  // subtraction below would wrap; presumably such tiny magnitudes never reach
  // this function -- confirm if that assumption changes.
  *shift = (exponent_part - kExponentBias) + 1;
  // There's an implicit high bit in the double format definition, so make sure
  // we include that at the top, and then reconstruct the rest of the fractional
  // value from the remaining fragments. 0x40000000 is that implicit leading
  // one, placed at the 2^30 position.
  int64_t fraction = 0x40000000 + ((u & kFractionMask) >> kFractionShift);
  // We're cutting off some bits at the bottom, so to exactly match the standard
  // frexp implementation here we'll apply rounding by adding one to the least
  // significant bit of the result if the discarded portion is over half of the
  // maximum.
  if ((u & kFractionRoundingMask) > kFractionRoundingThreshold) {
    fraction += 1;
  }
  // Negate the fraction if the sign bit was set.
  if (u & kSignMask) {
    fraction *= -1;
  }
  return fraction;
}
// Inverse of IntegerFrExp(): reassembles a double from an integer fraction
// (scaled by 2^31, as IntegerFrExp() produces) and a base-2 exponent.
// Special cases mirror IntegerFrExp(): shift == INT_MAX encodes NaN when the
// fraction is zero and +/-infinity otherwise; a zero fraction with an
// ordinary shift yields +0.0.
double DoubleFromFractionAndShift(int64_t fraction, int shift) {
  union {
    double double_value;
    uint64_t double_as_uint;
  } result;
  // Detect NaNs and infinities.
  if (shift == std::numeric_limits<int>::max()) {
    if (fraction == 0) {
      return std::numeric_limits<double>::quiet_NaN();
    } else if (fraction > 0) {
      return std::numeric_limits<double>::infinity();
    } else {
      return -std::numeric_limits<double>::infinity();
    }
  }
  // Return a normalized zero for a zero fraction.
  if (fraction == 0) {
    result.double_as_uint = 0;
    return result.double_value;
  }
  bool is_negative = (fraction < 0);
  int64_t encoded_fraction = is_negative ? -fraction : fraction;
  int64_t encoded_shift = (shift - 1);
  // Renormalize so the fraction's leading one sits at the 2^30 position,
  // adjusting the exponent to compensate for each doubling/halving.
  while (encoded_fraction < 0x40000000) {
    encoded_fraction *= 2;
    encoded_shift -= 1;
  }
  while (encoded_fraction > 0x80000000) {
    encoded_fraction /= 2;
    encoded_shift += 1;
  }
  // Drop the implicit leading bit; binary64 stores only the bits below it.
  encoded_fraction -= 0x40000000;
  // Clamp the exponent into the range representable by the 11-bit field.
  if (encoded_shift < -1022) {
    encoded_shift = -1023;
  } else if (encoded_shift > 1022) {
    encoded_shift = 1023;
  }
  // Apply the bias, then pack sign, exponent, and fraction into the bit
  // layout described by the masks at the top of this file.
  encoded_shift += kExponentBias;
  uint64_t encoded_sign = is_negative ? kSignMask : 0;
  result.double_as_uint = encoded_sign | (encoded_shift << kExponentShift) |
                          (encoded_fraction << kFractionShift);
  return result.double_value;
}
- double IntegerDoubleMultiply(double a, double b) {
- int a_shift;
- const int64_t a_fraction = IntegerFrExp(a, &a_shift);
- int b_shift;
- const int64_t b_fraction = IntegerFrExp(b, &b_shift);
- // Detect NaNs and infinities.
- if (a_shift == std::numeric_limits<int>::max() ||
- (b_shift == std::numeric_limits<int>::max())) {
- return std::numeric_limits<double>::quiet_NaN();
- }
- const int result_shift = a_shift + b_shift + 1;
- const int64_t result_fraction = (a_fraction * b_fraction) >> 32;
- return DoubleFromFractionAndShift(result_fraction, result_shift);
- }
- int IntegerDoubleCompare(double a, double b) {
- int a_shift;
- const int64_t a_fraction = IntegerFrExp(a, &a_shift);
- int b_shift;
- const int64_t b_fraction = IntegerFrExp(b, &b_shift);
- // Detect NaNs and infinities.
- if (a_shift == std::numeric_limits<int>::max() ||
- (b_shift == std::numeric_limits<int>::max())) {
- return 1;
- }
- if ((a_fraction == 0) && (b_fraction < 0)) {
- return 1;
- } else if ((a_fraction < 0) && (b_fraction == 0)) {
- return -1;
- } else if (a_shift < b_shift) {
- return -1;
- } else if (a_shift > b_shift) {
- return 1;
- } else if (a_fraction < b_fraction) {
- return -1;
- } else if (a_fraction > b_fraction) {
- return 1;
- } else {
- return 0;
- }
- }
// Computes the quantized multiplier and left shift used to scale softmax
// inputs: effectively quantizes beta * input_scale * 2^(31 -
// input_integer_bits), capped just below 2^31 so it stays representable.
void PreprocessSoftmaxScaling(double beta, double input_scale,
                              int input_integer_bits,
                              int32_t* quantized_multiplier, int* left_shift) {
  // If the overall multiplier (input and beta) is large, then exp() of an
  // input difference of 1 scaled by this will be large. In other words, we
  // can cap the multiplier and know that, when it is used, the output will be
  // (round to) zero wherever the input is not at the maximum value.
  // If the overall scale is less than one, and input_integer_bits=0, then the
  // result is double equivalent of Q0.31 (actually with more precision). Thus
  // this generates a Q(input_integer_bits).(31-input_integer_bits)
  // representation.
#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only path: do the multiply, rescale, and cap without touching
  // floating-point instructions.
  const double input_beta = IntegerDoubleMultiply(beta, input_scale);
  int shift;
  int64_t fraction = IntegerFrExp(input_beta, &shift);
  shift += (31 - input_integer_bits);
  double input_beta_real_multiplier =
      DoubleFromFractionAndShift(fraction, shift);
  if (IntegerDoubleCompare(input_beta_real_multiplier, (1ll << 31) - 1.0) > 0) {
    input_beta_real_multiplier = (1ll << 31) - 1.0;
  }
#else   // TFLITE_EMULATE_FLOAT
  const double input_beta_real_multiplier = std::min(
      beta * input_scale * (1 << (31 - input_integer_bits)), (1ll << 31) - 1.0);
#endif  // TFLITE_EMULATE_FLOAT
  QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier,
                                   quantized_multiplier, left_shift);
}
- void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
- int input_integer_bits,
- int32_t* quantized_multiplier,
- int* left_shift,
- int32_t* reverse_scaling_divisor,
- int* reverse_scaling_left_shift) {
- PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
- quantized_multiplier, left_shift);
- // Also calculate what amounts to the inverse scaling factor for the input.
- const double real_reverse_scaling_divisor =
- (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
- tflite::QuantizeMultiplierSmallerThanOneExp(real_reverse_scaling_divisor,
- reverse_scaling_divisor,
- reverse_scaling_left_shift);
- }
// Computes the largest input magnitude ("radius") that, after the scaling
// implied by input_left_shift, still fits within total_signed_bits of signed
// precision with input_integer_bits of integer range.
int CalculateInputRadius(int input_integer_bits, int input_left_shift,
                         int total_signed_bits) {
#ifdef TFLITE_EMULATE_FLOAT
  // Integer-only path: build the maximum integer value, then rescale with
  // shifts instead of floating-point multiply/divide.
  int64_t result = (1 << input_integer_bits) - 1;
  result <<= (total_signed_bits - input_integer_bits);
  result >>= input_left_shift;
  return result;
#else   // TFLITE_EMULATE_FLOAT
  const double max_input_rescaled =
      1.0 * ((1 << input_integer_bits) - 1) *
      (1ll << (total_signed_bits - input_integer_bits)) /
      (1ll << input_left_shift);
  // Tighten bound using floor. Suppose that we could use the exact value.
  // After scaling the difference, the result would be at the maximum. Thus we
  // must ensure that our value has lower magnitude.
  return static_cast<int>(std::floor(max_input_rescaled));
#endif  // TFLITE_EMULATE_FLOAT
}
- void NudgeQuantizationRange(const float min, const float max,
- const int quant_min, const int quant_max,
- float* nudged_min, float* nudged_max,
- float* nudged_scale) {
- // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
- const float quant_min_float = static_cast<float>(quant_min);
- const float quant_max_float = static_cast<float>(quant_max);
- *nudged_scale = (max - min) / (quant_max_float - quant_min_float);
- const float zero_point_from_min = quant_min_float - min / *nudged_scale;
- uint16_t nudged_zero_point;
- if (zero_point_from_min < quant_min_float) {
- nudged_zero_point = static_cast<uint16_t>(quant_min);
- } else if (zero_point_from_min > quant_max_float) {
- nudged_zero_point = static_cast<uint16_t>(quant_max);
- } else {
- nudged_zero_point = static_cast<uint16_t>(TfLiteRound(zero_point_from_min));
- }
- *nudged_min = (quant_min_float - nudged_zero_point) * (*nudged_scale);
- *nudged_max = (quant_max_float - nudged_zero_point) * (*nudged_scale);
- }
- void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
- const float nudged_max, const float* input_data,
- float* output_data, const float size) {
- // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
- const float inv_nudged_scale = 1.0f / nudged_scale;
- for (int i = 0; i < size; i++) {
- const float src_val = input_data[i];
- const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
- const float clamped_shifted = clamped - nudged_min;
- const float dst_val =
- TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
- nudged_min;
- output_data[i] = dst_val;
- }
- }
- bool CheckedLog2(const float x, int* log2_result) {
- // Using TfLiteRound instead of std::round and std::log instead of
- // std::log2 to work around these functions being missing in a toolchain
- // used in some TensorFlow tests as of May 2018.
- const float x_log2 = std::log(x) * (1.0f / std::log(2.0f));
- const float x_log2_rounded = TfLiteRound(x_log2);
- const float x_log2_fracpart = x_log2 - x_log2_rounded;
- *log2_result = static_cast<int>(x_log2_rounded);
- return std::abs(x_log2_fracpart) < 1e-3f;
- }
- void QuantizeMultiplierArray(const double* effective_scales, size_t size,
- int32_t* effective_scale_significand,
- int* effective_shift) {
- for (size_t i = 0; i < size; ++i) {
- QuantizeMultiplier(effective_scales[i], &effective_scale_significand[i],
- &effective_shift[i]);
- }
- }
- } // namespace tflite
|