quantization_util.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
  13. #define TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
  14. #include <cmath>
  15. #include <cstdint>
  16. #include <limits>
  17. #include "tensorflow/lite/kernels/internal/compatibility.h"
  18. #include "tensorflow/lite/kernels/internal/cppmath.h"
  19. #include "tensorflow/lite/kernels/internal/types.h"
  20. namespace tflite {
  21. // Given the min and max values of a float array, return
  22. // reasonable quantization parameters to use for this array.
  23. template <typename T>
  24. QuantizationParams ChooseQuantizationParams(double rmin, double rmax,
  25. bool narrow_range) {
  26. const T qmin = std::numeric_limits<T>::min() + (narrow_range ? 1 : 0);
  27. const T qmax = std::numeric_limits<T>::max();
  28. const double qmin_double = qmin;
  29. const double qmax_double = qmax;
  30. // 0 should always be a representable value. Let's assume that the initial
  31. // min,max range contains 0.
  32. TFLITE_CHECK_LE(rmin, 0.);
  33. TFLITE_CHECK_GE(rmax, 0.);
  34. if (rmin == rmax) {
  35. // Special case where the min,max range is a point. Should be {0}.
  36. TFLITE_CHECK_EQ(rmin, 0.);
  37. TFLITE_CHECK_EQ(rmax, 0.);
  38. QuantizationParams quantization_params;
  39. quantization_params.zero_point = 0;
  40. quantization_params.scale = 0.;
  41. return quantization_params;
  42. }
  43. // General case.
  44. //
  45. // First determine the scale.
  46. const double scale = (rmax - rmin) / (qmax_double - qmin_double);
  47. // Zero-point computation.
  48. // First the initial floating-point computation. The zero-point can be
  49. // determined from solving an affine equation for any known pair
  50. // (real value, corresponding quantized value).
  51. // We know two such pairs: (rmin, qmin) and (rmax, qmax).
  52. // The arithmetic error on the zero point computed from either pair
  53. // will be roughly machine_epsilon * (sum of absolute values of terms)
  54. // so we want to use the variant that adds the smaller terms.
  55. const double zero_point_from_min = qmin_double - rmin / scale;
  56. const double zero_point_from_max = qmax_double - rmax / scale;
  57. const double zero_point_from_min_error =
  58. std::abs(qmin_double) + std::abs(rmin / scale);
  59. const double zero_point_from_max_error =
  60. std::abs(qmax_double) + std::abs(rmax / scale);
  61. const double zero_point_double =
  62. zero_point_from_min_error < zero_point_from_max_error
  63. ? zero_point_from_min
  64. : zero_point_from_max;
  65. // Now we need to nudge the zero point to be an integer
  66. // (our zero points are integer, and this is motivated by the requirement
  67. // to be able to represent the real value "0" exactly as a quantized value,
  68. // which is required in multiple places, for example in Im2col with SAME
  69. // padding).
  70. T nudged_zero_point = 0;
  71. if (zero_point_double < qmin_double) {
  72. nudged_zero_point = qmin;
  73. } else if (zero_point_double > qmax_double) {
  74. nudged_zero_point = qmax;
  75. } else {
  76. nudged_zero_point = static_cast<T>(round(zero_point_double));
  77. }
  78. // The zero point should always be in the range of quantized value,
  79. // [qmin, qmax].
  80. TFLITE_CHECK_GE(nudged_zero_point, qmin);
  81. TFLITE_CHECK_LE(nudged_zero_point, qmax);
  82. // Finally, store the result nudged quantization params.
  83. QuantizationParams quantization_params;
  84. quantization_params.zero_point = nudged_zero_point;
  85. quantization_params.scale = scale;
  86. return quantization_params;
  87. }
  88. template <typename T>
  89. QuantizationParams ChooseQuantizationParams(double rmin, double rmax) {
  90. return ChooseQuantizationParams<T>(rmin, rmax, false);
  91. }
  // Converts a floating-point number to an integer. For all inputs x where
  // static_cast<IntOut>(x) is legal according to the C++ standard, the result
  // is identical to that cast (i.e. the result is x with its fractional part
  // truncated whenever that is representable as IntOut).
  //
  // static_cast would cause undefined behavior for the following cases, which
  // have well-defined behavior for this function:
  //
  //  1. If x is NaN, the result is zero.
  //
  //  2. If the truncated form of x is above the representable range of IntOut,
  //     the result is std::numeric_limits<IntOut>::max().
  //
  //  3. If the truncated form of x is below the representable range of IntOut,
  //     the result is std::numeric_limits<IntOut>::min().
  //
  // Note that cases #2 and #3 cover infinities as well as finite numbers.
  //
  // The range of FloatIn must include the range of IntOut, otherwise
  // the results are undefined.
  // TODO(sfeuz): Replace by absl::SafeCast once available.
  template <class IntOut, class FloatIn>
  IntOut SafeCast(FloatIn x) {
    // Each message describes the condition that makes the assertion fail.
    static_assert(!std::numeric_limits<FloatIn>::is_integer,
                  "FloatIn is integer");
    static_assert(std::numeric_limits<IntOut>::is_integer,
                  "IntOut is not integer");
    static_assert(std::numeric_limits<IntOut>::radix == 2,
                  "IntOut is not base 2");

    // Special case NaN, for which the logic below doesn't work.
    if (std::isnan(x)) {
      return 0;
    }

    // Negative values all clip to zero for unsigned results.
    if (!std::numeric_limits<IntOut>::is_signed && x < 0) {
      return 0;
    }

    // Handle infinities.
    if (std::isinf(x)) {
      return x < 0 ? std::numeric_limits<IntOut>::min()
                   : std::numeric_limits<IntOut>::max();
    }

    // Set exp such that x == f * 2^exp for some f with |f| in [0.5, 1.0),
    // unless x is zero in which case exp == 0. Note that this implies that the
    // magnitude of x is strictly less than 2^exp.
    int exp = 0;
    std::frexp(x, &exp);

    // Let N be the number of non-sign bits in the representation of IntOut. If
    // the magnitude of x is strictly less than 2^N, the truncated version of x
    // is representable as IntOut. The only representable integer for which this
    // is not the case is kMin for signed types (i.e. -2^N), but that is covered
    // by the fall-through below.
    if (exp <= std::numeric_limits<IntOut>::digits) {
      // Explicit cast: the conversion is known to be in range here, and an
      // implicit float-to-integer conversion triggers narrowing warnings.
      return static_cast<IntOut>(x);
    }

    // Handle numbers with magnitude >= 2^N.
    return x < 0 ? std::numeric_limits<IntOut>::min()
                 : std::numeric_limits<IntOut>::max();
  }
  150. // Decomposes a double multiplier into a Q0.31 int32 representation of its
  151. // significand, and a shift that represents the NEGATIVE of its exponent ---
  152. // this is intended as a RIGHT-shift.
  153. //
  154. // Restricted to the case where the multiplier < 1 (and non-negative).
  //
  // double_multiplier: the multiplier to decompose.
  // quantized_multiplier: output; receives the Q0.31 int32 significand.
  // left_shift: output; receives the shift derived from the exponent.
  // NOTE(review): the text above describes a right shift while the parameter
  // is named left_shift -- presumably the stored value is a non-positive left
  // shift; confirm against the implementation.
  155. void QuantizeMultiplierSmallerThanOneExp(double double_multiplier,
  156. int32_t* quantized_multiplier,
  157. int* left_shift);
  158. // Decomposes a double multiplier into a Q0.31 int32 representation of its
  159. // significand, and a shift representation of its exponent.
  160. //
  161. // Restricted to the case where the multiplier > 1.
  //
  // double_multiplier: the multiplier to decompose.
  // quantized_multiplier: output; receives the Q0.31 int32 significand.
  // left_shift: output; receives the shift derived from the exponent.
  162. void QuantizeMultiplierGreaterThanOne(double double_multiplier,
  163. int32_t* quantized_multiplier,
  164. int* left_shift);
  165. // Decomposes a double multiplier into a Q0.31 int32 representation of its
  166. // significand, and a shift representation of its exponent.
  167. //
  168. // Handles an arbitrary positive multiplier. The 'shift' output-value is
  169. // basically the 'floating-point exponent' of the multiplier:
  170. // Negative for a right-shift (when the multiplier is <1), positive for a
  171. // left-shift (when the multiplier is >1).
  //
  // double_multiplier: the multiplier to decompose.
  // quantized_multiplier: output; receives the Q0.31 int32 significand.
  // shift: output; receives the signed shift described above.
  172. void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
  173. int* shift);
  174. // Splits a double input value into a returned fraction, and a shift value from
  175. // the exponent, using only bitwise and integer operations to support
  176. // microcontrollers and other environments without floating-point support.
  177. //
  178. // This is designed to be a replacement for how std::frexp() is used within the
  179. // QuantizeMultiplier() function, and so has a different signature than the
  180. // standard version, returning a 64-bit integer rather than a double. This
  181. // result has a maximum value of 1<<31, with the fraction expressed as a
  182. // proportion of that maximum.
  183. //
  184. // std::frexp() returns NaNs and infinities unmodified, but since we're
  185. // returning integers that can't represent those values, instead we return
  186. // a shift of std::numeric_limits<int>::max() for all bad numbers, with an int64
  187. // result of 0 for NaNs, std::numeric_limits<int64_t>::max() for +INFINITY, and
  188. // std::numeric_limits<int64_t>::min() for -INFINITY. Denormalized inputs will
  189. // result in return values that end up truncating some bits at the end,
  190. // reflecting the loss of precision inherent in denormalization.
  //
  // input: the double to split.
  // shift: output; receives the exponent-derived shift, or
  //   std::numeric_limits<int>::max() for NaN and infinite inputs.
  // Returns the fraction as a 64-bit integer, as described above.
  191. int64_t IntegerFrExp(double input, int* shift);
  192. // Converts an integer fraction in the format produced by IntegerFrExp (where
  193. // 0x40000000 is 1.0) and an exponent shift (between -1022 and +1022) into an
  194. // IEEE binary64 double format result. The implementation uses only integer and
  195. // bitwise operators, so no floating point hardware support or emulation is
  196. // needed. This is here so quantized operations can run non-time-critical
  197. // preparation calculations on microcontrollers and other platforms without
  198. // float support.
  //
  // fraction: integer fraction in the IntegerFrExp format.
  // shift: exponent shift, expected to lie between -1022 and +1022.
  // Returns the reconstructed double.
  // NOTE(review): "0x40000000 is 1.0" (1<<30) does not match the "maximum
  // value of 1<<31" stated in the IntegerFrExp comment; confirm which scale
  // the implementation actually uses.
  199. double DoubleFromFractionAndShift(int64_t fraction, int shift);
  200. // Performs a multiplication of two numbers in double format, using only integer
  201. // and bitwise instructions. This is aimed at supporting housekeeping functions
  202. // for quantized operations on microcontrollers without floating-point hardware.
  //
  // a, b: the two factors.
  // Returns the product as a double.
  203. double IntegerDoubleMultiply(double a, double b);
  204. // Returns -1 if a is less than b, 0 if a and b are equal, and +1 if a is
  205. // greater than b. It is implemented using only integer and logical instructions
  206. // so that it can be easily run on microcontrollers for quantized operations.
  //
  // a, b: the two doubles to compare.
  // NOTE(review): the result for NaN operands is not stated here; check the
  // implementation before relying on it.
  207. int IntegerDoubleCompare(double a, double b);
  208. // This first creates a multiplier in a double equivalent of
  209. // Q(input_integer_bits).(31-input_integer_bits) representation, with extra
  210. // precision in the double's fractional bits. It then splits the result into
  211. // significand and exponent.
  //
  // beta, input_scale: inputs used to form the multiplier (presumably the
  //   softmax beta and the input tensor's quantization scale -- confirm).
  // input_integer_bits: number of integer bits in the Q-format described above.
  // quantized_multiplier: output; presumably receives the significand.
  // left_shift: output; presumably receives the exponent as a left shift.
  212. void PreprocessSoftmaxScaling(double beta, double input_scale,
  213. int input_integer_bits,
  214. int32_t* quantized_multiplier, int* left_shift);
  215. // Like PreprocessSoftmaxScaling, but the inverse scaling factors are also
  //      calculated.
  //
  // beta, input_scale, input_integer_bits, quantized_multiplier, left_shift:
  //   same roles as in PreprocessSoftmaxScaling.
  // reverse_scaling_divisor, reverse_scaling_left_shift: outputs; presumably
  //   receive the inverse scaling factor as a significand/shift pair -- confirm
  //   against the implementation.
  216. void PreprocessLogSoftmaxScalingExp(double beta, double input_scale,
  217. int input_integer_bits,
  218. int32_t* quantized_multiplier,
  219. int* left_shift,
  220. int32_t* reverse_scaling_divisor,
  221. int* reverse_scaling_left_shift);
  222. // Calculates the largest input that will result in a within-bounds intermediate
  223. // result within MultiplyByQuantizedMultiplierGreaterThanOne. In other words,
  224. // it must not overflow before we reduce the value by multiplication by the
  225. // input multiplier. The negative radius is used as the minimum difference in
  226. // Softmax.
  //
  // input_integer_bits: number of integer bits of the input format.
  // input_left_shift: left shift applied to the input.
  // total_signed_bits: defaults to 31 (presumably the number of non-sign bits
  //   of the intermediate type -- confirm).
  // Returns the input radius as an int.
  227. int CalculateInputRadius(int input_integer_bits, int input_left_shift,
  228. int total_signed_bits = 31);
  229. // Nudges a min/max quantization range to ensure that real zero maps exactly
  230. // to a quantized value. The gymnastics with the nudged zero point ensure that
  231. // real zero maps to an integer, which is required for e.g. zero-padding in
  //      convolutional layers.
  232. // Outputs nudged_min, nudged_max, nudged_scale.
  //
  // min, max: the real-valued range to nudge.
  // quant_min, quant_max: the bounds of the quantized integer range.
  // nudged_min, nudged_max, nudged_scale: outputs; receive the adjusted range
  //   and its scale.
  233. void NudgeQuantizationRange(const float min, const float max,
  234. const int quant_min, const int quant_max,
  235. float* nudged_min, float* nudged_max,
  236. float* nudged_scale);
  237. // Fake quantizes (quantizes and dequantizes) input_data using the scale,
  238. // nudged_min, and nudged_max from NudgeQuantizationRange. This matches the code
  239. // in TensorFlow's FakeQuantizeWithMinMaxVarsFunctor.
  //
  // nudged_scale, nudged_min, nudged_max: values produced by
  //   NudgeQuantizationRange.
  // input_data: values to fake-quantize.
  // output_data: output buffer for the results.
  // size: presumably the number of elements in input_data and output_data.
  // NOTE(review): size is declared as float rather than an integer type;
  // confirm this is intentional before relying on large counts.
  240. void FakeQuantizeArray(const float nudged_scale, const float nudged_min,
  241. const float nudged_max, const float* input_data,
  242. float* output_data, const float size);
  243. // If x is approximately a power of two (with any positive or negative
  244. // exponent), stores that exponent (i.e. log2(x)) in *log2_result, otherwise
  245. // returns false.
  //
  // x: the value to test.
  // log2_result: output; receives the exponent.
  // NOTE(review): the success return value is not stated above -- presumably
  // true when x is approximately a power of two; what is written to
  // *log2_result on failure is also not specified here. Confirm both.
  246. bool CheckedLog2(const float x, int* log2_result);
  247. // Decomposes an array of double multipliers into a Q0.31 int32 representation
  248. // of its significand, and shift representation of its exponent.
  249. //
  250. // Handles an arbitrary multiplier. The 'shift' output-value is
  251. // basically the 'floating-point exponent' of the multiplier:
  252. // Negative for a right-shift (when the multiplier is <1), positive for a
  253. // left-shift (when the multiplier is >1).
  //
  // effective_scales: input array of multipliers.
  // size: presumably the number of elements in each of the three arrays.
  // effective_scale_significand: output array for the Q0.31 significands.
  // effective_shift: output array for the signed shifts.
  // NOTE(review): size_t is used unqualified and <cstddef> is not among the
  // visible includes -- presumably it arrives transitively; confirm.
  254. void QuantizeMultiplierArray(const double* effective_scales, size_t size,
  255. int32_t* effective_scale_significand,
  256. int* effective_shift);
  257. } // namespace tflite
  258. #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_