/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_

#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_shape,
    const float* weights_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data) {
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
                                       output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[out_c];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}
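
// Illustrative usage sketch (not part of the library API; shapes and values
// here are hypothetical). For a [2, 3] input and a [4, 3] weights matrix,
// the routine above produces a [2, 4] output:
//
//   FullyConnectedParams params;
//   params.float_activation_min = 0.0f;  // e.g. ReLU lower bound
//   params.float_activation_max = std::numeric_limits<float>::max();
//   RuntimeShape input_shape({2, 3}), weights_shape({4, 3});
//   RuntimeShape bias_shape({4}), output_shape({2, 4});
//   FullyConnected(params, input_shape, input_data, weights_shape,
//                  weights_data, bias_shape, bias_data, output_shape,
//                  output_data);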

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    uint8_t* output_data) {
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32_t acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32_t input_val = input_data[b * accum_depth + d];
        int32_t filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[out_c];
      }
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
    }
  }
}
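
// Quantization note (informal): in TFLite's affine quantization scheme a real
// value is approximated as scale * (quantized_value - zero_point), and the
// offsets used above are the negated zero points. So (input_val +
// input_offset) recovers the zero-centered integer value. For example, with
// an input zero point of 128 (input_offset = -128), a stored byte of 130
// contributes 130 - 128 = 2 to the accumulator before rescaling.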

inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& filter_shape,
    const uint8_t* filter_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int16_t* output_data) {
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_EQ(output_offset, 0);
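  // Note: unlike the uint8_t variant above, this int16_t-output path requires
  // a non-null bias_data (the accumulator below is seeded directly from it)
  // as well as the zero output_offset checked above.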
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      // Internal accumulation.
      // Initialize accumulator with the bias value.
      int32_t accum = bias_data[out_c];
      // Accumulation loop.
      for (int d = 0; d < accum_depth; ++d) {
        int16_t input_val = input_data[b * accum_depth + d] + input_offset;
        int16_t filter_val =
            filter_data[out_c * accum_depth + d] + filter_offset;
        accum += filter_val * input_val;
      }
      // Down-scale the final int32_t accumulator to the scale used by our
      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
      // multiplier and shift here have been pre-computed offline
      // (e.g. by toco).
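      // Worked example (values hypothetical): the effective rescale factor is
      // output_multiplier * 2^(output_shift - 31). With output_multiplier =
      // 1 << 30 and output_shift = 0, that factor is 0.5, so an accumulator
      // of 4096 becomes roughly 2048 before saturation.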
      accum =
          MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
      // Saturate, cast to int16_t, and store to output array.
      accum = std::max(accum, output_activation_min - output_offset);
      accum = std::min(accum, output_activation_max - output_offset);
      accum += output_offset;
      output_data[out_c + output_depth * b] = accum;
    }
  }
}
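
// ShuffledFullyConnected below consumes weights that have been pre-shuffled
// (and sign-bit-flipped) into the 4x16 block order read by its inner loops.
// The caller also provides shuffled_input_workspace_data, which must hold at
// least batches * accum_depth bytes: the function writes the shuffled, xored
// copy of the input activations into it before computing.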
inline void ShuffledFullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8_t* input_data, const RuntimeShape& weights_shape,
    const uint8_t* shuffled_weights_data, const RuntimeShape& bias_shape,
    const int32_t* bias_data, const RuntimeShape& output_shape,
    int16_t* output_data, uint8_t* shuffled_input_workspace_data) {
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int weights_dim_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
  TFLITE_DCHECK((accum_depth % 16) == 0);
  TFLITE_DCHECK((output_depth % 4) == 0);
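  // Layout note (inferred from the consumption loops below): the shuffled
  // weights are expected in blocks covering 4 output channels x 16
  // accumulation depths. Within each block, the 16 weights of one channel are
  // contiguous, and blocks advance along the depth dimension before moving to
  // the next quad of output channels.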
  // Shuffling and xoring of input activations into the workspace buffer.
  uint8_t* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
  if (batches == 1) {
    for (int i = 0; i < accum_depth; i++) {
      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
    }
  } else if (batches == 4) {
    for (int c = 0; c < accum_depth; c += 16) {
      for (int b = 0; b < 4; b++) {
        const uint8_t* src_data_ptr = input_data + b * accum_depth + c;
        for (int j = 0; j < 16; j++) {
          uint8_t src_val = *src_data_ptr++;
          // Flip the sign bit, so that the kernel will only need to
          // reinterpret these uint8_t values as int8_t, getting for free the
          // subtraction of the zero_point value 128.
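          // For example (bit-level check): 0x00 (0) maps to 0x80, which as
          // int8_t is -128 = 0 - 128, and 0xFF (255) maps to 0x7F = 127 =
          // 255 - 128.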
          uint8_t dst_val = src_val ^ 0x80;
          *shuffled_input_workspace_ptr++ = dst_val;
        }
      }
    }
  } else {
    TFLITE_DCHECK(false);
    return;
  }
  // Actual computation.
  if (batches == 1) {
    int16_t* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8_t values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction
    // of the zero_point value 128.
    const int8_t* shuffled_weights_ptr =
        reinterpret_cast<const int8_t*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8_t* shuffled_input_data =
        reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      // Internal accumulation. The accumulators start at zero; the bias is
      // added after the accumulation loop below.
      int32_t accum[4] = {0};
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int j = 0; j < 16; j++) {
            int8_t input_val = shuffled_input_data[d + j];
            int8_t weights_val = *shuffled_weights_ptr++;
            accum[i] += weights_val * input_val;
          }
        }
      }
      for (int i = 0; i < 4; i++) {
        // Add bias value.
        int32_t acc = accum[i] + bias_data[c + i];
        // Down-scale the final int32_t accumulator to the scale used by our
        // (16-bit, typically 3 integer bits) fixed-point format. The
        // quantized multiplier and shift here have been pre-computed offline
        // (e.g. by toco).
        acc =
            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
        // Saturate, cast to int16_t, and store to output array.
        acc = std::max(acc, output_activation_min);
        acc = std::min(acc, output_activation_max);
        output_ptr[c + i] = acc;
      }
    }
  } else if (batches == 4) {
    int16_t* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8_t values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction
    // of the zero_point value 128.
    const int8_t* shuffled_weights_ptr =
        reinterpret_cast<const int8_t*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8_t* shuffled_input_data =
        reinterpret_cast<const int8_t*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      const int8_t* shuffled_input_ptr = shuffled_input_data;
      // Internal accumulation. The accumulators start at zero; the bias is
      // added after the accumulation loop below.
      int32_t accum[4][4];
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          accum[i][b] = 0;
        }
      }
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int b = 0; b < 4; b++) {
            for (int j = 0; j < 16; j++) {
              int8_t input_val = shuffled_input_ptr[16 * b + j];
              int8_t weights_val = shuffled_weights_ptr[16 * i + j];
              accum[i][b] += weights_val * input_val;
            }
          }
        }
        shuffled_input_ptr += 64;
        shuffled_weights_ptr += 64;
      }
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          // Add bias value.
          int32_t acc = accum[i][b] + bias_data[c + i];
          // Down-scale the final int32_t accumulator to the scale used by our
          // (16-bit, typically 3 integer bits) fixed-point format. The
          // quantized multiplier and shift here have been pre-computed
          // offline (e.g. by toco).
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                              output_shift);
          // Saturate, cast to int16_t, and store to output array.
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_ptr[b * output_depth + c + i] = acc;
        }
      }
    }
  } else {
    TFLITE_DCHECK(false);
    return;
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_