/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <type_traits>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/max.h"
#include "tensorflow/lite/kernels/internal/min.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through the input data and reduces elements along the
// dimensions given in 'axis'.
template <typename In, typename Out>
inline bool Reduce(const In* input_data, const int* input_dims,
                   const int* output_dims, const int input_num_dims,
                   const int output_num_dims, const int* axis,
                   const int num_axis, int* input_iter,
                   Out reducer(const Out current, const In in),
                   Out* output_data) {
  // Reset the input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data, folding each element into its corresponding
  // output cell via 'reducer'.
  do {
    size_t input_offset =
        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
                                               input_iter, num_axis, axis);
    output_data[output_offset] =
        reducer(output_data[output_offset], input_data[input_offset]);
  } while (NextIndex(input_num_dims, input_dims, input_iter));
  return true;
}
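
// A minimal usage sketch (illustrative only, not part of the API): summing a
// 2x3 matrix over axis 1 into a 2-element output. Note that 'output_data'
// must be initialized before the call, since the first 'reducer' invocation
// reads it.
//
//   const float in[] = {1, 2, 3, 4, 5, 6};
//   const int in_dims[] = {2, 3};
//   const int out_dims[] = {2};
//   const int axis[] = {1};
//   int iter[2];
//   float out[2] = {0.f, 0.f};
//   auto sum = [](const float cur, const float x) -> float { return cur + x; };
//   Reduce<float, float>(in, in_dims, out_dims, /*input_num_dims=*/2,
//                        /*output_num_dims=*/1, axis, /*num_axis=*/1, iter,
//                        sum, out);  // out == {6, 15}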

// This method parses the input 'axis' to remove duplicates and handle negative
// values, and returns a valid 'axis' in 'out_axis'.
inline bool ResolveAxis(const int num_dims, const int* axis,
                        const int64_t num_axis, int* out_axis,
                        int* out_num_axis) {
  *out_num_axis = 0;  // Just in case.
  // Short-circuit axis resolution for scalars; the axis will go unused.
  if (num_dims == 0) {
    return true;
  }
  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
  for (int64_t idx = 0; idx < num_axis; ++idx) {
    // Handle negative index. A positive index 'p_idx' can be represented as a
    // negative index 'n_idx' as: n_idx = p_idx - num_dims.
    // E.g., for num_dims = 3, [0, 1, 2] is the same as [-3, -2, -1].
    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
    TFLITE_DCHECK(current >= 0 && current < num_dims);
    bool is_dup = false;
    for (int j = 0; j < *out_num_axis; ++j) {
      if (out_axis[j] == current) {
        is_dup = true;
        break;
      }
    }
    if (!is_dup) {
      out_axis[*out_num_axis] = current;
      *out_num_axis += 1;
    }
  }
  return true;
}
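
// Example (illustrative): with num_dims = 3 and axis = {-1, 2, 0}, the
// negative index -1 resolves to 2, which duplicates the explicit 2, so
// out_axis becomes {2, 0} and *out_num_axis == 2.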

// This method expects that output_data has been initialized.
template <typename In, typename Out>
inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
                          const int* output_dims, const int input_num_dims,
                          const int output_num_dims, const int* axis,
                          const int num_axis, int* input_iter,
                          Out* output_data) {
  auto reducer = [](const Out current, const In in) -> Out {
    const Out actual_in = static_cast<Out>(in);
    return current + actual_in;
  };
  return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
                         output_num_dims, axis, num_axis, input_iter, reducer,
                         output_data);
}
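
// Note: 'Out' is typically a wider type than 'In' (e.g. In = uint8_t,
// Out = int32_t) so that long running sums do not overflow the element type.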

// Fills 'data', a tensor with dimensions 'dims', with 'init_value'. Returns
// false if the element count would overflow size_t.
template <typename T>
inline bool InitTensorDataForReduce(const int* dims, const int num_dims,
                                    const T init_value, T* data) {
  size_t num_elements = 1;
  for (int idx = 0; idx < num_dims; ++idx) {
    size_t current = static_cast<size_t>(dims[idx]);
    // Overflow prevention. A zero-sized dimension cannot overflow (the
    // product becomes zero), and skipping it also avoids a division by zero.
    if (current != 0 &&
        num_elements > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_elements *= current;
  }
  for (size_t idx = 0; idx < num_elements; ++idx) {
    data[idx] = init_value;
  }
  return true;
}
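
// Example (illustrative): on a 64-bit build, dims = {1 << 22, 1 << 22,
// 1 << 22} describes 2^66 elements. After two dimensions the running product
// is 2^44, which exceeds SIZE_MAX / 2^22 (about 2^42), so the function
// returns false instead of silently wrapping around.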

// Computes the generic value (i.e., sum/max/min/prod) of elements across
// dimensions given in 'axis'. The caller must supply 'init_value' and
// 'reducer'.
template <typename T>
inline bool ReduceGeneric(const T* input_data, const int* input_dims,
                          const int input_num_dims, T* output_data,
                          const int* output_dims, const int output_num_dims,
                          const int* axis, const int64_t num_axis_dimensions,
                          bool keep_dims, int* temp_index, int* resolved_axis,
                          T init_value,
                          T reducer(const T current, const T in)) {
  // Reset output data.
  if (!InitTensorDataForReduce(output_dims, output_num_dims, init_value,
                               output_data)) {
    return false;
  }
  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }
  return Reduce<T, T>(input_data, input_dims, output_dims, input_num_dims,
                      output_num_dims, resolved_axis, num_resolved_axis,
                      temp_index, reducer, output_data);
}
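
// A minimal usage sketch (illustrative): a max reduction over axis 0 of a 2x2
// matrix. 'scratch_index' and 'scratch_axis' are caller-provided scratch
// buffers.
//
//   const float in[] = {1, 5, 3, 2};  // {{1, 5}, {3, 2}}
//   const int in_dims[] = {2, 2};
//   const int out_dims[] = {2};
//   const int axis[] = {0};
//   int scratch_index[2];
//   int scratch_axis[1];
//   float out[2];
//   auto max_fn = [](const float cur, const float in) -> float {
//     return cur > in ? cur : in;
//   };
//   ReduceGeneric<float>(in, in_dims, /*input_num_dims=*/2, out, out_dims,
//                        /*output_num_dims=*/1, axis,
//                        /*num_axis_dimensions=*/1, /*keep_dims=*/false,
//                        scratch_index, scratch_axis,
//                        std::numeric_limits<float>::lowest(), max_fn);
//   // out == {3, 5}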

// Computes the mean of elements across dimensions given in 'axis'.
// It does so in two stages: first it computes the sum of elements along the
// axis, then it divides the sum by the number of elements in the axis.
template <typename T, typename U>
inline bool Mean(const T* input_data, const int* input_dims,
                 const int input_num_dims, T* output_data,
                 const int* output_dims, const int output_num_dims,
                 const int* axis, const int num_axis_dimensions, bool keep_dims,
                 int* temp_index, int* resolved_axis, U* temp_sum) {
  ruy::profiler::ScopeLabel label("Mean");
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention.
    if (current != 0 &&
        num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }
  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }
  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }
  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  U num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention. The explicit zero check also avoids a division by
    // zero when an earlier axis had size zero.
    if (num_elements_in_axis != 0 &&
        current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
      return false;
    }
    num_elements_in_axis *= current;
  }
  if (num_elements_in_axis > 0) {
    for (size_t idx = 0; idx < num_outputs; ++idx) {
      output_data[idx] =
          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
    }
  }
  return true;
}
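
// Example (illustrative) of why the accumulator type 'U' is separate from the
// element type 'T': for T = uint8_t and a reduction over four elements all
// equal to 250, temp_sum (U = int32_t) holds 1000, which would overflow
// uint8_t; the final division restores 1000 / 4 = 250 in the uint8 output.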

template <typename T>
inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const T* input_data,
                 const RuntimeShape& unextended_output_shape, T* output_data) {
  ruy::profiler::ScopeLabel label("Mean4D");

  // The current implementation only supports rank-4 inputs with simultaneous
  // reduction over the width and height dimensions.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);

  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);

  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      float value = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          value / (input_width * input_height);
    }
  }
}
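
// Example (illustrative): a 1x2x2x1 input holding {1, 2, 3, 4} with
// axis = {1, 2} produces a 1x1x1x1 output containing
// (1 + 2 + 3 + 4) / (2 * 2) = 2.5.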

inline void Mean(const tflite::MeanParams& op_params,
                 const RuntimeShape& unextended_input_shape,
                 const uint8_t* input_data, int32_t input_zero_point,
                 float input_scale, const RuntimeShape& unextended_output_shape,
                 uint8_t* output_data, int32_t output_zero_point,
                 float output_scale) {
  ruy::profiler::ScopeLabel label("Mean4D/Uint8");

  // The current implementation only supports rank-4 inputs with simultaneous
  // reduction over the width and height dimensions.
  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
  const RuntimeShape input_shape =
      RuntimeShape::ExtendedShape(4, unextended_input_shape);
  const RuntimeShape output_shape =
      RuntimeShape::ExtendedShape(4, unextended_output_shape);
  const int output_batch = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  const int output_depth = output_shape.Dims(3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const float num_elements_in_axis = input_width * input_height;

  TFLITE_CHECK_EQ(op_params.axis_count, 2);
  TFLITE_CHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
               (op_params.axis[0] == 2 && op_params.axis[1] == 1));
  TFLITE_CHECK_EQ(output_height, 1);
  TFLITE_CHECK_EQ(output_width, 1);

  constexpr int32_t kMinValue = std::numeric_limits<uint8_t>::min();
  constexpr int32_t kMaxValue = std::numeric_limits<uint8_t>::max();

  // Rescale the accumulated quantized sum directly into the output
  // quantization: 'real_scale' folds the division by the element count into
  // the requantization, and 'bias' accounts for the two zero points.
  int32_t bias =
      output_zero_point -
      static_cast<int32_t>(input_zero_point * input_scale / output_scale);
  double real_scale =
      static_cast<double>(input_scale / (num_elements_in_axis * output_scale));

  int32_t multiplier;
  int shift;
  QuantizeMultiplier(real_scale, &multiplier, &shift);
  for (int out_b = 0; out_b < output_batch; ++out_b) {
    for (int out_d = 0; out_d < output_depth; ++out_d) {
      int32_t acc = 0;
      for (int in_h = 0; in_h < input_height; ++in_h) {
        for (int in_w = 0; in_w < input_width; ++in_w) {
          acc += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += bias;
      acc = std::min(std::max(acc, kMinValue), kMaxValue);
      output_data[Offset(output_shape, out_b, 0, 0, out_d)] =
          static_cast<uint8_t>(acc);
    }
  }
}
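
// Worked example (illustrative): with input_scale = 0.5f, output_scale =
// 0.25f, both zero points 0, and a 2x2 spatial patch of quantized values
// {10, 10, 20, 20}: acc = 60, real_scale = 0.5 / (4 * 0.25) = 0.5, and
// bias = 0, so the output is approximately 60 * 0.5 = 30 -- i.e. the real
// mean 7.5 re-quantized at scale 0.25.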

// Computes the mean (or, when 'compute_sum' is true, the sum) of quantized
// elements across the dimensions given in 'axis'. It does so in two stages:
// first it computes the sum of elements along the axis, then it rescales the
// result, dividing by the number of elements in the axis for the mean case.
template <typename T, typename U>
inline bool QuantizedMeanOrSum(const T* input_data, int32_t input_zero_point,
                               float input_scale, const int* input_dims,
                               const int input_num_dims, T* output_data,
                               int32_t output_zero_point, float output_scale,
                               const int* output_dims,
                               const int output_num_dims, const int* axis,
                               const int num_axis_dimensions, bool keep_dims,
                               int* temp_index, int* resolved_axis, U* temp_sum,
                               bool compute_sum) {
  const bool uint8_case = std::is_same<T, uint8_t>::value;
  const bool int16_case = std::is_same<T, int16_t>::value;
  if (uint8_case) {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Uint8" : "Mean/Uint8");
  } else if (int16_case) {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int16" : "Mean/Int16");
  } else {
    ruy::profiler::ScopeLabel label(compute_sum ? "Sum/Int8" : "Mean/Int8");
  }
  // Reset output data.
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = static_cast<size_t>(output_dims[idx]);
    // Overflow prevention.
    if (current != 0 &&
        num_outputs > std::numeric_limits<size_t>::max() / current) {
      return false;
    }
    num_outputs *= current;
  }
  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = T();
    temp_sum[idx] = U();
  }
  // Resolve axis.
  int num_resolved_axis = 0;
  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
                   &num_resolved_axis)) {
    return false;
  }
  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
                           output_num_dims, resolved_axis, num_resolved_axis,
                           temp_index, temp_sum)) {
    return false;
  }
  // Calculate the mean by dividing output_data by the number of aggregated
  // elements.
  U num_elements_in_axis = 1;
  for (int idx = 0; idx < num_resolved_axis; ++idx) {
    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
    // Overflow prevention. The explicit zero check also avoids a division by
    // zero when an earlier axis had size zero.
    if (num_elements_in_axis != 0 &&
        current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
      return false;
    }
    num_elements_in_axis *= current;
  }
  if (num_elements_in_axis > 0) {
    const float scale = input_scale / output_scale;
    if (compute_sum) {
      // TODO(b/116341117): Eliminate float and do this completely in 8bit.
      const float bias =
          -input_zero_point * scale * num_elements_in_axis + 0.5f;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        const U value =
            static_cast<U>(TfLiteRound(temp_sum[idx] * scale + bias)) +
            output_zero_point;
        output_data[idx] = static_cast<T>(value);
      }
    } else {
      const float bias = -input_zero_point * scale + 0.5f;
      for (size_t idx = 0; idx < num_outputs; ++idx) {
        float float_mean = static_cast<float>(temp_sum[idx]) /
                           static_cast<float>(num_elements_in_axis);
        float result = TfLiteMin(
            TfLiteRound(float_mean * scale + bias) + output_zero_point,
            static_cast<float>(std::numeric_limits<T>::max()));
        result = TfLiteMax(result,
                           static_cast<float>(std::numeric_limits<T>::min()));
        output_data[idx] = static_cast<T>(result);
      }
    }
  }
  return true;
}
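
// Worked example (illustrative) for the mean path: with T = int8_t,
// U = int32_t, input_scale = 0.1f, output_scale = 0.2f, both zero points 0,
// and a reduction over 4 elements whose quantized sum is 101: scale = 0.5f,
// bias = 0.5f, float_mean = 101 / 4.0f = 25.25, and the output is
// round(25.25 * 0.5 + 0.5) = 13 -- the real mean 2.525 re-quantized at
// scale 0.2 (2.525 / 0.2 = 12.625, which rounds to 13).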

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_REDUCE_H_