add.h 6.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
  2. Licensed under the Apache License, Version 2.0 (the "License");
  3. you may not use this file except in compliance with the License.
  4. You may obtain a copy of the License at
  5. http://www.apache.org/licenses/LICENSE-2.0
  6. Unless required by applicable law or agreed to in writing, software
  7. distributed under the License is distributed on an "AS IS" BASIS,
  8. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  9. See the License for the specific language governing permissions and
  10. limitations under the License.
  11. ==============================================================================*/
  12. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
  13. #define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
  14. #include <limits>
  15. #include "tensorflow/lite/kernels/internal/common.h"
  16. #include "tensorflow/lite/kernels/internal/types.h"
  17. namespace tflite {
  18. namespace reference_integer_ops {
  19. inline void CheckArithmeticParams(const ArithmeticParams& params) {
  20. TFLITE_DCHECK_LE(params.quantized_activation_min,
  21. params.quantized_activation_max);
  22. // Input offset is negative input zero point. Activation tensors are
  23. // asymmetric quantized so they span the full int8 range.
  24. TFLITE_DCHECK_GE(-params.input1_offset, std::numeric_limits<int8_t>::min());
  25. TFLITE_DCHECK_GE(-params.input2_offset, std::numeric_limits<int8_t>::min());
  26. TFLITE_DCHECK_LE(-params.input1_offset, std::numeric_limits<int8_t>::max());
  27. TFLITE_DCHECK_LE(-params.input2_offset, std::numeric_limits<int8_t>::max());
  28. }
  29. // Element-wise add that can often be used for inner loop of broadcast add as
  30. // well as the non-broadcast add.
  31. inline void AddElementwise(int size, const ArithmeticParams& params,
  32. const int8_t* input1_data, const int8_t* input2_data,
  33. int8_t* output_data) {
  34. CheckArithmeticParams(params);
  35. for (int i = 0; i < size; ++i) {
  36. const int32_t input1_val = params.input1_offset + input1_data[i];
  37. const int32_t input2_val = params.input2_offset + input2_data[i];
  38. const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
  39. const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
  40. const int32_t scaled_input1_val =
  41. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  42. shifted_input1_val, params.input1_multiplier, params.input1_shift);
  43. const int32_t scaled_input2_val =
  44. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  45. shifted_input2_val, params.input2_multiplier, params.input2_shift);
  46. const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
  47. const int32_t raw_output =
  48. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  49. raw_sum, params.output_multiplier, params.output_shift) +
  50. params.output_offset;
  51. const int32_t clamped_output =
  52. std::min(params.quantized_activation_max,
  53. std::max(params.quantized_activation_min, raw_output));
  54. output_data[i] = static_cast<int8_t>(clamped_output);
  55. }
  56. }
  57. inline void Add(const ArithmeticParams& params,
  58. const RuntimeShape& input1_shape, const int8_t* input1_data,
  59. const RuntimeShape& input2_shape, const int8_t* input2_data,
  60. const RuntimeShape& output_shape, int8_t* output_data) {
  61. CheckArithmeticParams(params);
  62. const int flat_size =
  63. MatchingElementsSize(input1_shape, input2_shape, output_shape);
  64. AddElementwise(flat_size, params, input1_data, input2_data, output_data);
  65. }
  66. inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
  67. const RuntimeShape& input1_shape,
  68. const int8_t* input1_data,
  69. const RuntimeShape& input2_shape,
  70. const int8_t* input2_data,
  71. const RuntimeShape& output_shape,
  72. int8_t* output_data) {
  73. NdArrayDesc<4> desc1;
  74. NdArrayDesc<4> desc2;
  75. NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
  76. &desc2);
  77. const RuntimeShape extended_output_shape =
  78. RuntimeShape::ExtendedShape(4, output_shape);
  79. // In Tensorflow, the dimensions are canonically named (batch_number, row,
  80. // col, channel), with extents (batches, height, width, depth), with the
  81. // trailing dimension changing most rapidly (channels has the smallest stride,
  82. // typically 1 element).
  83. //
  84. // In generated C code, we store arrays with the dimensions reversed. The
  85. // first dimension has smallest stride.
  86. //
  87. // We name our variables by their Tensorflow convention, but generate C code
  88. // nesting loops such that the innermost loop has the smallest stride for the
  89. // best cache behavior.
  90. for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
  91. for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
  92. for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
  93. for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
  94. const int32_t input1_val =
  95. params.input1_offset +
  96. input1_data[SubscriptToIndex(desc1, b, y, x, c)];
  97. const int32_t input2_val =
  98. params.input2_offset +
  99. input2_data[SubscriptToIndex(desc2, b, y, x, c)];
  100. const int32_t shifted_input1_val =
  101. input1_val * (1 << params.left_shift);
  102. const int32_t shifted_input2_val =
  103. input2_val * (1 << params.left_shift);
  104. const int32_t scaled_input1_val =
  105. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  106. shifted_input1_val, params.input1_multiplier,
  107. params.input1_shift);
  108. const int32_t scaled_input2_val =
  109. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  110. shifted_input2_val, params.input2_multiplier,
  111. params.input2_shift);
  112. const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
  113. const int32_t raw_output =
  114. MultiplyByQuantizedMultiplierSmallerThanOneExp(
  115. raw_sum, params.output_multiplier, params.output_shift) +
  116. params.output_offset;
  117. const int32_t clamped_output =
  118. std::min(params.quantized_activation_max,
  119. std::max(params.quantized_activation_min, raw_output));
  120. output_data[Offset(extended_output_shape, b, y, x, c)] =
  121. static_cast<int8_t>(clamped_output);
  122. }
  123. }
  124. }
  125. }
  126. }
  127. } // namespace reference_integer_ops
  128. } // namespace tflite
  129. #endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_