softmax.h
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/op_macros.h"

namespace tflite {
namespace reference_ops {

inline void Softmax(const SoftmaxParams& params,
                    const RuntimeShape& input_shape, const float* input_data,
                    const RuntimeShape& output_shape, float* output_data) {
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

  for (int i = 0; i < outer_size; ++i) {
    // Find max element value which we'll use to ensure numerical stability
    // taking advantage of the following equality:
    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
    float max = std::numeric_limits<float>::lowest();
    for (int c = 0; c < depth; ++c) {
      max = std::max(max, input_data[i * depth + c]);
    }
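
    // params.beta scales the logits before exponentiation: beta > 1 sharpens
    // the resulting distribution, beta < 1 flattens it (an inverse
    // "temperature"). With beta == 1 this is the plain softmax.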
    // Compute sum.
    float sum = 0.f;
    for (int c = 0; c < depth; ++c) {
      sum += std::exp((input_data[i * depth + c] - max) *
                      static_cast<float>(params.beta));
    }

    // Compute result.
    for (int c = 0; c < depth; ++c) {
      output_data[i * depth + c] = std::exp((input_data[i * depth + c] - max) *
                                            static_cast<float>(params.beta)) /
                                   sum;
    }
  }
}
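
// Illustrative usage of the float kernel above (a sketch only, not part of
// the library; it assumes RuntimeShape can be built from an initializer list
// and that the caller owns the output buffer):
//
//   const float input[6] = {1.f, 2.f, 3.f, 1.f, 1.f, 1.f};
//   float output[6];
//   SoftmaxParams params;
//   params.beta = 1.0;
//   RuntimeShape shape({2, 3});  // two rows of depth 3
//   Softmax(params, shape, input, shape, output);
//   // Each row of `output` now sums to (approximately) 1.0.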

// Quantized softmax with int8_t/uint8_t input and int8_t/uint8_t/int16_t
// output.
template <typename InputT, typename OutputT>
inline void Softmax(const SoftmaxParams& params,
                    const RuntimeShape& input_shape, const InputT* input_data,
                    const RuntimeShape& output_shape, OutputT* output_data) {
  const int32_t input_beta_multiplier = params.input_multiplier;
  const int32_t input_beta_left_shift = params.input_left_shift;
  const int diff_min = params.diff_min;
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large
  // as -32 before multiplying by input_beta_multiplier, and therefore as
  // large as -16 afterwards. Note that exp(-8) is definitely not
  // insignificant to accumulation, but exp(-16) definitely is.
  static const int kScaledDiffIntegerBits = 5;
  static const int kAccumulationIntegerBits = 12;
  using FixedPointScaledDiff =
      gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
  using FixedPointAccum =
      gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
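  // In gemmlowp::FixedPoint<int32_t, N>, N is the number of integer bits and
  // the remaining 31 bits are fractional. So FixedPointScaledDiff is Q5.26
  // (roughly [-32, 32)), FixedPointAccum is Q12.19 (headroom to accumulate
  // many exp() terms, each at most 1.0), and FixedPoint0 is Q0.31 ([-1, 1)).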

  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

  for (int i = 0; i < outer_size; ++i) {
    InputT max_in_row = std::numeric_limits<InputT>::min();
    for (int c = 0; c < depth; ++c) {
      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
    }
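
    // Accumulate exp(input - max) for every element whose (non-positive)
    // difference from the row max is at least diff_min; differences below
    // diff_min would contribute negligibly to the sum and are skipped.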
    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
    for (int c = 0; c < depth; ++c) {
      int32_t input_diff =
          static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
      if (input_diff >= diff_min) {
        const int32_t input_diff_rescaled =
            MultiplyByQuantizedMultiplierGreaterThanOne(
                input_diff, input_beta_multiplier, input_beta_left_shift);
        const FixedPointScaledDiff scaled_diff_f8 =
            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
        sum_of_exps =
            sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                              exp_on_negative_values(scaled_diff_f8));
      }
    }
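
    // shifted_scale holds 1 / sum_of_exps with the sum normalized into
    // [1.0, 2.0); num_bits_over_unit records how far sum_of_exps exceeded
    // 1.0 and is folded into the final right shift below.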
    int num_bits_over_unit;
    FixedPoint0 shifted_scale = FixedPoint0::FromRaw(GetReciprocal(
        sum_of_exps.raw(), kAccumulationIntegerBits, &num_bits_over_unit));

    for (int c = 0; c < depth; ++c) {
      int32_t input_diff =
          static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
      if (input_diff >= diff_min) {
        const int32_t input_diff_rescaled =
            MultiplyByQuantizedMultiplierGreaterThanOne(
                input_diff, input_beta_multiplier, input_beta_left_shift);
        const FixedPointScaledDiff scaled_diff_f8 =
            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
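        // shifted_scale * exp_in_0 is this element's probability in Q0.31,
        // still scaled up by 2^num_bits_over_unit; the rounding shift below
        // maps it onto [0, 2^(8 * sizeof(OutputT))), after which it is offset
        // to the output type's range and saturated.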
        int32_t unsat_output = gemmlowp::RoundingDivideByPOT(
            (shifted_scale * exp_in_0).raw(),
            num_bits_over_unit + 31 - (sizeof(OutputT) * 8));
        const int32_t shifted_output =
            unsat_output +
            static_cast<int32_t>(std::numeric_limits<OutputT>::min());
        output_data[i * depth + c] = static_cast<OutputT>(std::max(
            std::min(shifted_output,
                     static_cast<int32_t>(std::numeric_limits<OutputT>::max())),
            static_cast<int32_t>(std::numeric_limits<OutputT>::min())));
      } else {
        output_data[i * depth + c] = std::numeric_limits<OutputT>::min();
      }
    }
  }
}
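
// Illustrative parameter setup for the quantized kernel above (a sketch, not
// part of this file). PreprocessSoftmaxScaling() and CalculateInputRadius()
// are helpers from quantization_util.h; their exact signatures, and `beta`,
// `input_scale`, the shapes and buffers, are assumptions of the sketch. The
// value 5 must match kScaledDiffIntegerBits used inside the kernel:
//
//   SoftmaxParams op_params;
//   int32_t input_multiplier;
//   int input_left_shift;
//   PreprocessSoftmaxScaling(beta, input_scale, /*input_integer_bits=*/5,
//                            &input_multiplier, &input_left_shift);
//   op_params.input_multiplier = input_multiplier;
//   op_params.input_left_shift = input_left_shift;
//   op_params.diff_min = -CalculateInputRadius(/*input_integer_bits=*/5,
//                                              input_left_shift);
//   Softmax(op_params, shape, int8_input, shape, int8_output);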

// Quantized softmax with int16_t input and int16_t output.
inline void SoftmaxInt16(const SoftmaxParams& params,
                         const RuntimeShape& input_shape,
                         const int16_t* input_data,
                         const RuntimeShape& output_shape,
                         int16_t* output_data) {
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size =
      MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth =
      MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
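
  // Both activations in this kernel -- exp() and 1/(1 + x) -- are evaluated
  // with interpolated int16_t lookup tables (params.exp_lut and
  // params.one_over_one_plus_x_lut), which the op's prepare step is expected
  // to have filled in.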

  for (int i = 0; i < outer_size; ++i) {
    // Find the largest element.
    int16_t max_in_row = std::numeric_limits<int16_t>::min();
    for (int c = 0; c < depth; ++c) {
      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
    }

    // Compute exp(input - max_input).
    std::vector<int16_t> exp_result_Q015(depth);
    for (int c = 0; c < depth; ++c) {
      int32_t input_diff = input_data[i * depth + c] - max_in_row;
      // Scale input_diff so that [-65535, 0] corresponds to [-10.0, 0.0].
      int32_t scaled_diff = MultiplyByQuantizedMultiplier(
          input_diff, params.input_multiplier, params.input_left_shift);
      // Recenter to [-32768, 32767].
      int32_t sym_scaled_diff = scaled_diff + 32767;
      int16_t sat_sym_scaled_diff =
          std::min(std::max(sym_scaled_diff, static_cast<int32_t>(-32768)),
                   static_cast<int32_t>(32767));
      // Apply the exp() LUT activation function.
      exp_result_Q015[c] =
          generic_int16_table_lookup(sat_sym_scaled_diff, params.exp_lut);
    }

    // sum_of_exps is a Q16.15 fixed point format.
    int32_t sum_of_exps = 0;
    for (int c = 0; c < depth; ++c) {
      // Q16.15 + Q0.15
      sum_of_exps += exp_result_Q015[c];
    }

    // Compute the reciprocal 1/sum_of_exps.
    uint8_t headroom_plus_one =
        CountLeadingZeros(static_cast<uint32_t>(sum_of_exps));
    int32_t shifted_sum =
        ((static_cast<int64_t>(sum_of_exps) << (headroom_plus_one - 1)) +
         (1 << 13)) >>
        14;
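    // After this normalization shifted_sum lies in [2^16, 2^17), i.e. it
    // represents a value in [1.0, 2.0) with 16 fractional bits;
    // headroom_plus_one records the scaling so the final rescale loop can
    // undo it.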
    // Since the LUT computes 1/(1 + x), we first compute x = (sum - 1). The
    // LUT also expects a symmetrical input, so x is recentered from
    // [0, 65535] to [-32768, 32767].
    int32_t sym_shifted_sum = shifted_sum + (-((1 << 15) + (1 << 16)));
    int16_t sat_sym_shifted_sum = static_cast<int16_t>(
        std::min(std::max(sym_shifted_sum, static_cast<int32_t>(-32768)),
                 static_cast<int32_t>(32767)));
    // Apply the 1/(1 + x) LUT activation function.
    int16_t reciprocal_scale_Q015 = generic_int16_table_lookup(
        sat_sym_shifted_sum, params.one_over_one_plus_x_lut);

    // Rescale exp_result by the reciprocal.
    // The output range [0, 32767] corresponds to [0.0, 1.0].
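    // right_shift (31 - headroom_plus_one) undoes the normalization applied
    // when computing shifted_sum, so `result` lands back in Q0.15.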
    for (int c = 0; c < depth; ++c) {
      uint8_t right_shift = 31 - headroom_plus_one;
      int64_t round = 1 << (right_shift - 1);
      int32_t result = (static_cast<int64_t>(exp_result_Q015[c]) *
                            static_cast<int64_t>(reciprocal_scale_Q015) +
                        round) >>
                       right_shift;
      output_data[i * depth + c] = static_cast<int16_t>(
          std::min(std::max(result, static_cast<int32_t>(0)),
                   static_cast<int32_t>(32767)));
    }
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_SOFTMAX_H_