fully_connected.h

/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_

#include <algorithm>  // for std::max / std::min used below

#include "tflite/kernels/internal/common.h"
#include "tflite/kernels/internal/cppmath.h"
#include "tflite/kernels/internal/quantization_util.h"
#include "tflite/kernels/internal/types.h"

namespace tflite {
namespace reference_ops {
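
// Reference (unoptimized) float fully-connected layer. For each batch b and
// output channel out_c this computes
//   output[b, out_c] = Act(dot(input[b, :], weights[out_c, :]) + bias[out_c])
// where Act clamps to [float_activation_min, float_activation_max] and the
// bias is optional (bias_data may be null).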
inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& weights_shape,
    const float* weights_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data) {
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2,
                                       output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      float total = 0.f;
      for (int d = 0; d < accum_depth; ++d) {
        total += input_data[b * accum_depth + d] *
                 weights_data[out_c * accum_depth + d];
      }
      float bias_value = 0.0f;
      if (bias_data) {
        bias_value = bias_data[out_c];
      }
      output_data[out_c + output_depth * b] = ActivationFunctionWithMinMax(
          total + bias_value, output_activation_min, output_activation_max);
    }
  }
}
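
// Quantized fully-connected layer on asymmetric uint8 data. Input and filter
// values are shifted by their zero-point offsets, accumulated in int32 along
// with the optional int32 bias, rescaled to the output scale with
// MultiplyByQuantizedMultiplier, offset by the output zero point, clamped to
// the activation range and stored as uint8.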
inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& filter_shape,
    const uint8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    uint8* output_data) {
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_GE(filter_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      int32 acc = 0;
      for (int d = 0; d < accum_depth; ++d) {
        int32 input_val = input_data[b * accum_depth + d];
        int32 filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data) {
        acc += bias_data[out_c];
      }
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8>(acc);
    }
  }
}
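
// Variant with uint8 inputs and filter but int16 outputs in a 16-bit
// fixed-point format (typically 3 integer bits, see the comments below).
// The output zero point must be 0 (enforced by a TFLITE_DCHECK below) and
// bias_data must be non-null, as the accumulator is initialized from it.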
inline void FullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& filter_shape,
    const uint8* filter_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int16* output_data) {
  const int32 input_offset = params.input_offset;
  const int32 filter_offset = params.weights_offset;
  const int32 output_offset = params.output_offset;
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_EQ(output_offset, 0);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(filter_shape, filter_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b) {
    for (int out_c = 0; out_c < output_depth; ++out_c) {
      // Internal accumulation.
      // Initialize accumulator with the bias-value.
      int32 accum = bias_data[out_c];
      // Accumulation loop.
      for (int d = 0; d < accum_depth; ++d) {
        int16 input_val = input_data[b * accum_depth + d] + input_offset;
        int16 filter_val = filter_data[out_c * accum_depth + d] + filter_offset;
        accum += filter_val * input_val;
      }
      // Down-scale the final int32 accumulator to the scale used by our
      // (16-bit, typically 3 integer bits) fixed-point format. The quantized
      // multiplier and shift here have been pre-computed offline
      // (e.g. by toco).
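      // MultiplyByQuantizedMultiplier computes, roughly,
      //   accum * output_multiplier * 2^output_shift / 2^31, with rounding.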
      accum =
          MultiplyByQuantizedMultiplier(accum, output_multiplier, output_shift);
      // Saturate, cast to int16, and store to output array.
      accum = std::max(accum, output_activation_min - output_offset);
      accum = std::min(accum, output_activation_max - output_offset);
      accum += output_offset;
      output_data[out_c + output_depth * b] = accum;
    }
  }
}
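
// Fully-connected layer operating on weights that have been pre-shuffled into
// 4x16 blocks and pre-xored with 0x80, so that reinterpreting them as int8
// already accounts for the 128 zero point. Only batch sizes 1 and 4 are
// supported: the uint8 inputs are first shuffled/xored the same way into
// shuffled_input_workspace_data, and the outputs are int16 fixed-point values
// as in the uint8 -> int16 variant above.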
inline void ShuffledFullyConnected(
    const FullyConnectedParams& params, const RuntimeShape& input_shape,
    const uint8* input_data, const RuntimeShape& weights_shape,
    const uint8* shuffled_weights_data, const RuntimeShape& bias_shape,
    const int32* bias_data, const RuntimeShape& output_shape,
    int16* output_data, uint8* shuffled_input_workspace_data) {
  const int32 output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32 output_activation_min = params.quantized_activation_min;
  const int32 output_activation_max = params.quantized_activation_max;
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_GE(input_shape.DimensionsCount(), 1);
  TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
  TFLITE_DCHECK_GE(output_shape.DimensionsCount(), 1);
  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int weights_dim_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth = MatchingDim(weights_shape, weights_dim_count - 2,
                                       output_shape, output_dim_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dim_count - 1);
  TFLITE_DCHECK((accum_depth % 16) == 0);
  TFLITE_DCHECK((output_depth % 4) == 0);
  // Shuffling and xoring of input activations into the workspace buffer
  uint8* shuffled_input_workspace_ptr = shuffled_input_workspace_data;
  if (batches == 1) {
    for (int i = 0; i < accum_depth; i++) {
      shuffled_input_workspace_data[i] = input_data[i] ^ 0x80;
    }
  } else if (batches == 4) {
    for (int c = 0; c < accum_depth; c += 16) {
      for (int b = 0; b < 4; b++) {
        const uint8* src_data_ptr = input_data + b * accum_depth + c;
        for (int j = 0; j < 16; j++) {
          uint8 src_val = *src_data_ptr++;
          // Flip the sign bit, so that the kernel will only need to
          // reinterpret these uint8 values as int8, getting for free the
          // subtraction of the zero_point value 128.
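          // For example, src_val = 200 (0xC8) becomes 0x48, which read as an
          // int8 is 72 = 200 - 128.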
          uint8 dst_val = src_val ^ 0x80;
          *shuffled_input_workspace_ptr++ = dst_val;
        }
      }
    }
  } else {
    TFLITE_DCHECK(false);
    return;
  }
  // Actual computation
  if (batches == 1) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      // Internal accumulation.
      // Initialize accumulator with the bias-value.
      int32 accum[4] = {0};
      // Accumulation loop.
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int j = 0; j < 16; j++) {
            int8 input_val = shuffled_input_data[d + j];
            int8 weights_val = *shuffled_weights_ptr++;
            accum[i] += weights_val * input_val;
          }
        }
      }
      for (int i = 0; i < 4; i++) {
        // Add bias value
        int32 acc = accum[i] + bias_data[c + i];
        // Down-scale the final int32 accumulator to the scale used by our
        // (16-bit, typically 3 integer bits) fixed-point format. The quantized
        // multiplier and shift here have been pre-computed offline
        // (e.g. by toco).
        acc =
            MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
        // Saturate, cast to int16, and store to output array.
        acc = std::max(acc, output_activation_min);
        acc = std::min(acc, output_activation_max);
        output_ptr[c + i] = acc;
      }
    }
  } else if (batches == 4) {
    int16* output_ptr = output_data;
    // Shuffled weights have had their sign bit (0x80) pre-flipped (xor'd)
    // so that just reinterpreting them as int8 values is equivalent to
    // subtracting 128 from them, thus implementing for free the subtraction of
    // the zero_point value 128.
    const int8* shuffled_weights_ptr =
        reinterpret_cast<const int8*>(shuffled_weights_data);
    // Likewise, we preshuffled and pre-xored the input data above.
    const int8* shuffled_input_data =
        reinterpret_cast<const int8*>(shuffled_input_workspace_data);
    for (int c = 0; c < output_depth; c += 4) {
      const int8* shuffled_input_ptr = shuffled_input_data;
      // Accumulation loop.
      // Internal accumulation.
      // Initialize accumulator with the bias-value.
      int32 accum[4][4];
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          accum[i][b] = 0;
        }
      }
      for (int d = 0; d < accum_depth; d += 16) {
        for (int i = 0; i < 4; i++) {
          for (int b = 0; b < 4; b++) {
            for (int j = 0; j < 16; j++) {
              int8 input_val = shuffled_input_ptr[16 * b + j];
              int8 weights_val = shuffled_weights_ptr[16 * i + j];
              accum[i][b] += weights_val * input_val;
            }
          }
        }
        shuffled_input_ptr += 64;
        shuffled_weights_ptr += 64;
      }
      for (int i = 0; i < 4; i++) {
        for (int b = 0; b < 4; b++) {
          // Add bias value
          int32 acc = accum[i][b] + bias_data[c + i];
          // Down-scale the final int32 accumulator to the scale used by our
          // (16-bit, typically 3 integer bits) fixed-point format. The
          // quantized multiplier and shift here have been pre-computed offline
          // (e.g. by toco).
          acc = MultiplyByQuantizedMultiplier(acc, output_multiplier,
                                              output_shift);
          // Saturate, cast to int16, and store to output array.
          acc = std::max(acc, output_activation_min);
          acc = std::min(acc, output_activation_max);
          output_ptr[b * output_depth + c + i] = acc;
        }
      }
    }
  } else {
    TFLITE_DCHECK(false);
    return;
  }
}

}  // namespace reference_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_