// k210_ops.cpp
/* Copyright 2019-2020 Canaan Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <kernels/k210/k210_kernels.h>
#include <runtime/k210/k210_ops_body.h>
#include <runtime/kernel_registry.h>

#include <algorithm>
#include <cstdint>
#include <memory>

#if !NNCASE_TARGET_K210_SIMULATOR
#include <dmac.h>
#include <sysctl.h>
#endif
  22. using namespace nncase;
  23. using namespace nncase::runtime;
  24. using namespace nncase::runtime::k210;
  25. namespace
  26. {
  27. #if !NNCASE_TARGET_K210_SIMULATOR
// Pushes one layer descriptor into the KPU's layer-argument FIFO.
// NOTE: the hardware consumes these 12 registers in exactly this order;
// do not reorder or omit any of the writes.
void kpu_send_layer(const kpu_layer_argument_t &layer)
{
    kpu->layer_argument_fifo = layer.interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer.image_addr.reg;
    kpu->layer_argument_fifo = layer.image_channel_num.reg;
    kpu->layer_argument_fifo = layer.image_size.reg;
    kpu->layer_argument_fifo = layer.kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer.kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer.kernel_offset.reg;
    kpu->layer_argument_fifo = layer.kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer.write_back_cfg.reg;
    kpu->layer_argument_fifo = layer.conv_value.reg;
    kpu->layer_argument_fifo = layer.conv_value2.reg;
    kpu->layer_argument_fifo = layer.dma_parameter.reg;
}
// Starts a conv2d layer whose result stays in KPU RAM (no DMA to main memory).
// `callback` is invoked from the AI interrupt when the layer completes;
// `userdata` is forwarded to it unchanged.
void kpu_conv2d_normal(kpu_layer_argument_t &layer, plic_irq_callback_t callback, void *userdata)
{
    kpu->interrupt_clear.reg = 0b111; // acknowledge any stale interrupts first
    kpu->interrupt_mask.reg = 0b110;  // unmask only bit 0; keep the other two sources masked
    layer.dma_parameter.data.send_data_out = 0; // keep output in KPU memory
    layer.interrupt_enabe.data.int_en = 1;      // ask for a completion interrupt
    plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
    plic_irq_enable(IRQN_AI_INTERRUPT);
    kpu_send_layer(layer); // kick off the layer last, once everything is armed
}
// Completion flag: set to 1 by kpu_plic_thunk (interrupt context) and polled
// by the hardware path of kpu_conv2d; volatile so the busy-wait re-reads it.
static volatile int g_ai_done = 0;
// AI-interrupt handler: acknowledges and re-masks the KPU interrupt sources,
// then signals completion to the polling loop in kpu_conv2d. Always returns 0.
int kpu_plic_thunk(void *userdata)
{
    kpu->interrupt_clear.reg = 0b111; // acknowledge all three interrupt sources
    kpu->interrupt_mask.reg = 0b111;  // mask everything until the next layer is started
    g_ai_done = 1;
    return 0;
}
  61. int kpu_dma_thunk(void *userdata)
  62. {
  63. return 0;
  64. }
// Starts a conv2d layer whose output is streamed out of the KPU data FIFO
// into `dest` in main memory via DMA channel `dma_ch`.
// `callback` fires from the AI interrupt; `userdata` is forwarded to both the
// AI-interrupt and DMA callbacks.
void kpu_conv2d_output(kpu_layer_argument_t &layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
    kpu->interrupt_clear.reg = 0b111; // acknowledge stale interrupts
    kpu->interrupt_mask.reg = 0b110;  // unmask only bit 0
    layer.interrupt_enabe.data.int_en = 1;
    layer.dma_parameter.data.send_data_out = 1; // stream results out through the FIFO
    plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
    plic_irq_enable(IRQN_AI_INTERRUPT);
    // Route the AI RX DMA handshake onto the chosen DMAC channel.
    sysctl_dma_select((sysctl_dma_channel_t)dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, kpu_dma_thunk, userdata, 1);
    // Transfer in 64-bit beats; dma_total_byte presumably stores (bytes - 1),
    // hence the +1. NOTE(review): the extra +8 looks like it over-provisions
    // one beat when the size is already 8-aligned — confirm against the KPU
    // DMA behavior before changing.
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
        DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 1 + 8) / 8 * 1);
    kpu_send_layer(layer); // start the layer only after DMA is armed
}
// Copies `input_size` bytes from `src` to `dest` using the DMAC in 64-bit
// transfers, then blocks until the transfer finishes.
// NOTE(review): the beat count is input_size / 8, so input_size is presumably
// a multiple of 8 — confirm with callers; a remainder would be silently dropped.
void kpu_upload_dma(dmac_channel_number_t dma_ch, const uint8_t *src, uint8_t *dest, size_t input_size, plic_irq_callback_t callback, void *userdata)
{
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)src, (void *)dest, DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
        DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
    // Synchronous: wait here even though a completion callback was installed.
    dmac_wait_done(dma_ch);
}
  86. int kpu_dma_plic_thunk(void *userdata)
  87. {
  88. auto &ctx = *reinterpret_cast<k210_interpreter_context *>(userdata);
  89. (ctx.interpreter->*ctx.step)();
  90. return 0;
  91. }
  92. #endif
  93. }
  94. namespace nncase
  95. {
  96. namespace runtime
  97. {
  98. namespace k210
  99. {
  100. kernel_call_result kpu_upload(kpu_upload_options &options, interpreter_t &interpreter, interpreter_step_t step)
  101. {
  102. auto input = interpreter.memory_at<uint8_t>(options.input);
  103. auto output = interpreter.memory_at<uint8_t>(options.output);
  104. kernels::k210::kpu_upload(input.data(), output.data(), options.in_shape);
  105. return kcr_done;
  106. }
  107. kernel_call_result kpu_conv2d(kpu_conv2d_options &options, interpreter_t &interpreter, interpreter_step_t step)
  108. {
  109. auto in_h = static_cast<int32_t>(options.layer.image_size.data.i_col_high + 1);
  110. auto in_w = static_cast<int32_t>(options.layer.image_size.data.i_row_wid + 1);
  111. auto in_ch = static_cast<int32_t>(options.layer.image_channel_num.data.i_ch_num + 1);
  112. auto out_h = static_cast<int32_t>(options.layer.image_size.data.o_col_high + 1);
  113. auto out_w = static_cast<int32_t>(options.layer.image_size.data.o_row_wid + 1);
  114. auto out_ch = static_cast<int32_t>(options.layer.image_channel_num.data.o_ch_num + 1);
  115. auto is_depthwise = options.layer.interrupt_enabe.data.depth_wise_layer != 0;
  116. auto input = interpreter.memory_at<uint8_t>({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_src_addr * 64, 1 });
  117. auto kpu_out = interpreter.memory_at<uint8_t>({ mem_k210_kpu, dt_uint8, (uint32_t)options.layer.image_addr.data.image_dst_addr * 64, 1 });
  118. runtime_shape_t in_shape { options.batches, in_ch, in_h, in_w };
  119. runtime_shape_t out_shape { options.batches, out_ch, out_h, out_w };
  120. #if NNCASE_TARGET_K210_SIMULATOR
  121. auto in_fmap_size = kernels::details::compute_size(in_shape);
  122. runtime_shape_t conv_out_shape { options.batches, out_ch, in_h, in_w };
  123. auto conv_out_fmap_size = kernels::details::compute_size(conv_out_shape);
  124. auto out_fmap_size = kernels::details::compute_size(out_shape);
  125. auto input_tmp = std::make_unique<uint8_t[]>(in_fmap_size);
  126. auto workspace = std::make_unique<int64_t[]>(conv_out_fmap_size);
  127. auto conv_output_tmp = std::make_unique<uint8_t[]>(conv_out_fmap_size);
  128. auto output_tmp = std::make_unique<uint8_t[]>(out_fmap_size);
  129. auto batch = in_shape[0];
  130. auto in_size_per_batch = kernels::details::compute_size(in_shape) / batch;
  131. auto conv_output_tmp_size_per_batch = conv_out_fmap_size / batch;
  132. auto out_size_per_batch = kernels::details::compute_size(out_shape) / batch;
  133. auto p_input = input_tmp.get();
  134. auto p_workspace = workspace.get();
  135. auto p_conv_ouput_tmp = conv_output_tmp.get();
  136. auto p_output_tmp = output_tmp.get();
  137. kernels::k210::kpu_download(input.data(), input_tmp.get(), in_shape);
  138. auto filter_size = get_kpu_filter_size((kpu_filter_type_t)options.layer.kernel_pool_type_cfg.data.kernel_type);
  139. auto pad_value = (uint8_t)options.layer.kernel_pool_type_cfg.data.pad_value;
  140. auto arg_x = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_x);
  141. auto shift_x = (int32_t)options.layer.conv_value.data.shr_x;
  142. auto arg_w = (int32_t)kernels::details::to_signed<24>(options.layer.conv_value.data.arg_w);
  143. auto shift_w = (int32_t)options.layer.conv_value.data.shr_w;
  144. auto arg_add = kernels::details::to_signed<40>(options.layer.conv_value2.data.arg_add);
  145. auto batchnorm = std::make_unique<kpu_batchnorm_segment[]>(out_ch);
  146. for (size_t i = 0; i < out_ch; i++)
  147. {
  148. auto &src = options.batch_norm[i].batchnorm.data;
  149. auto &dest = batchnorm[i];
  150. dest.mul = (int32_t)kernels::details::to_signed<24>(src.norm_mul);
  151. dest.shift = (int32_t)src.norm_shift;
  152. dest.add = (int32_t)kernels::details::to_signed<32>(src.norm_add);
  153. }
  154. kpu_activation_table_t activation;
  155. for (size_t i = 0; i < 16; i++)
  156. {
  157. auto &src = options.activation->activate_para[i].data;
  158. auto &dest = activation[i];
  159. dest.start_x = kernels::details::to_signed<36>(src.x_start);
  160. dest.mul = (int32_t)kernels::details::to_signed<16>(src.y_mul);
  161. dest.shift = (int32_t)src.shift_number;
  162. if (i < 16)
  163. dest.add = options.activation->activate_para_bias0.data.result_bias[i];
  164. else
  165. dest.add = options.activation->activate_para_bias1.data.result_bias[i - 16];
  166. }
  167. #define KPU_CONV2D_IMPL(is_depthwise_val, filter_size_val) \
  168. if (is_depthwise == is_depthwise_val && filter_size == filter_size_val) \
  169. kernels::k210::kpu_conv2d<is_depthwise_val, filter_size_val>(p_input, p_workspace, p_conv_ouput_tmp, options.weights.data(), \
  170. in_h, in_w, in_ch, out_ch, pad_value, arg_x, shift_x, arg_w, shift_w, arg_add, batchnorm.get(), activation)
  171. for (size_t n = 0; n < batch; n++)
  172. {
  173. KPU_CONV2D_IMPL(true, 1);
  174. else KPU_CONV2D_IMPL(true, 3);
  175. else KPU_CONV2D_IMPL(false, 1);
  176. else KPU_CONV2D_IMPL(false, 3);
  177. kernels::k210::kpu_pool2d(p_conv_ouput_tmp, p_output_tmp, in_h, in_w, out_ch, (kpu_pool_type_t)options.layer.kernel_pool_type_cfg.data.pool_type);
  178. p_input += in_size_per_batch;
  179. p_workspace += conv_output_tmp_size_per_batch;
  180. p_conv_ouput_tmp += conv_output_tmp_size_per_batch;
  181. p_output_tmp += out_size_per_batch;
  182. }
  183. kernels::k210::kpu_upload(output_tmp.get(), kpu_out.data(), out_shape);
  184. if (options.main_mem_output.size)
  185. {
  186. auto main_output = interpreter.memory_at<uint8_t>(options.main_mem_output);
  187. std::copy(output_tmp.get(), output_tmp.get() + out_fmap_size, main_output.data());
  188. }
  189. return kcr_done;
  190. #else
  191. auto &ctx = interpreter.context();
  192. ctx.interpreter = &interpreter;
  193. ctx.step = step;
  194. auto batch = options.batches;
  195. auto in_per_batch = get_kpu_rows(in_w, in_h, in_ch);
  196. auto out_per_batch = get_kpu_rows(out_w, out_h, out_ch);
  197. for (size_t n = 0; n < batch; n++)
  198. {
  199. g_ai_done = 0;
  200. kpu_conv2d_normal(options.layer, kpu_plic_thunk, &ctx);
  201. while (!g_ai_done)
  202. ;
  203. options.layer.image_addr.data.image_src_addr += in_per_batch;
  204. options.layer.image_addr.data.image_dst_addr += out_per_batch;
  205. }
  206. if (options.main_mem_output.size)
  207. {
  208. auto main_output = interpreter.memory_at<uint8_t>(options.main_mem_output);
  209. kernels::k210::kpu_download(kpu_out.data(), main_output.data(), out_shape);
  210. }
  211. return kcr_done;
  212. #endif
  213. }
  214. }
  215. }
  216. }