/* kpu.c */

#include <assert.h>
#include <float.h>
#include <math.h>
#include <platform.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysctl.h>
#include "bsp.h"
#include "dmac.h"
#include "kpu.h"
#include "printf.h"
#include "nncase.h"
#include "utils.h"

#define LAYER_BURST_SIZE 12
#define KPU_DEBUG 0
#define USE_CACHED_AI_RAM 0

#define min(a, b) (((a) < (b)) ? (a) : (b))
#define max(a, b) (((a) > (b)) ? (a) : (b))
#define ALIGN_UP(x, align) ((x + (align - 1)) & (~(align - 1)))

static int ai_step(void *userdata);
static int kpu_kmodel_done(kpu_model_context_t *ctx);

volatile kpu_config_t *const kpu = (volatile kpu_config_t *)AI_BASE_ADDR;
static volatile uint32_t kpu_status;

typedef struct kpu_context
{
    kpu_task_t kpu_task;
    uint32_t kpu_status;
} kpu_context_t;

volatile kpu_context_t g_kpu_context;

static int kpu_run_all_done(void *_task)
{
    atomic_swap(&g_kpu_context.kpu_status, 0);
    kpu_task_t *task = (kpu_task_t *)_task;
    task->callback(task);
    return 0;
}

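/* AI interrupt handler for the legacy kpu_task_t flow: clears the interrupt and
 * feeds the next burst of layer arguments into the layer-argument FIFO. */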
int kpu_continue(void *_task)
{
    kpu_task_t *task = (kpu_task_t *)_task;
    int layer_burst_size = 1;
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    if(task->remain_layers_length == 0)
    {
        return 0;
    }
    if(task->remain_layers_length <= layer_burst_size)
    {
        for(uint32_t i = 0; i < task->remain_layers_length; i++)
        {
            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
        }
        task->remain_layers_length = 0;
    } else
    {
        for(uint32_t i = 0; i < layer_burst_size; i++)
        {
            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
        }
        task->remain_layers += layer_burst_size;
        task->remain_layers_length -= layer_burst_size;
    }
    return 0;
}

static int kpu_run_dma_output(uint32_t dma_ch, void *dst, uint32_t length, plic_irq_callback_t cb, void *_task)
{
    sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
    dmac_irq_register(dma_ch, kpu_run_all_done, _task, 1);
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), (void *)(dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (length + 7) / 8);
    return 0;
}

static int kpu_run_dma_input_done_push_layers(void *_task)
{
    kpu_task_t *task = (kpu_task_t *)_task;
    kpu->interrupt_clear.reg = 7;
    dmac->channel[task->dma_ch].intclear = 0xFFFFFFFF;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = task->eight_bit_mode};
    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
    kpu_run_dma_output(task->dma_ch, task->dst, last_layer->dma_parameter.data.dma_total_byte + 1, kpu_run_all_done, task);
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 0,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_continue(task);
    return 0;
}

static void kpu_run_dma_input(uint32_t dma_ch, const void *src, plic_irq_callback_t cb, void *_task)
{
    kpu_task_t *task = _task;
    kpu_layer_argument_t *first_layer = &task->layers[0];
    uint64_t input_size = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1);
    dmac_irq_register(dma_ch, cb, _task, 1);
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(AI_IO_BASE_ADDR), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}

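/* Start a legacy kpu_task_t: DMA the input image into KPU RAM, push layer arguments
 * from the DMA-done callback, and DMA the last layer's output back to memory. */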
int kpu_run(kpu_task_t *v_task, dmac_channel_number_t dma_ch, const void *src, void *dest, plic_irq_callback_t callback)
{
    if(atomic_cas(&g_kpu_context.kpu_status, 0, 1))
        return -1;
    memcpy((void *)&g_kpu_context.kpu_task, v_task, sizeof(kpu_task_t));
    kpu_task_t *task = (kpu_task_t *)&g_kpu_context.kpu_task;
    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
    uint64_t output_size = last_layer->dma_parameter.data.dma_total_byte + 1;
    last_layer->dma_parameter.data.send_data_out = 1;
    last_layer->interrupt_enabe.data.int_en = 1;
    task->dma_ch = dma_ch;
    task->dst = dest;
    task->dst_length = output_size;
    task->callback = callback;
    task->remain_layers_length = task->layers_length;
    task->remain_layers = task->layers;
    plic_set_priority(IRQN_AI_INTERRUPT, 1);
    plic_irq_register(IRQN_AI_INTERRUPT, kpu_continue, task);
    plic_irq_enable(IRQN_AI_INTERRUPT);
    kpu_run_dma_input(dma_ch, src, kpu_run_dma_input_done_push_layers, task);
    return 0;
}

uint8_t *kpu_get_output_buf(kpu_task_t *task)
{
    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
    size_t output_size = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8 * 8;
    return malloc(output_size);
}

void kpu_release_output_buf(uint8_t *output_buf)
{
    if(output_buf != NULL)
        free(output_buf);
}

static int kpu_done(void *ctx)
{
    atomic_swap(&kpu_status, 0);
    kpu_task_t *task = (kpu_task_t *)ctx;
    task->callback(task->ctx);
    return 0;
}

static int kpu_config_input(void *ctx)
{
    kpu_task_t *task = (kpu_task_t *)ctx;
    kpu->interrupt_clear.reg = 7;
    if(task->remain_layers_length <= LAYER_BURST_SIZE)
    {
        for(uint32_t i = 0; i < task->remain_layers_length; i++)
        {
            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
        }
        task->remain_layers_length = 0;
        kpu->interrupt_mask.reg = 7;
    } else
    {
        for(uint32_t i = 0; i < LAYER_BURST_SIZE; i++)
        {
            kpu->layer_argument_fifo = task->remain_layers[i].interrupt_enabe.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_addr.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_channel_num.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].image_size.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_pool_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_load_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_offset.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].kernel_calc_type_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].write_back_cfg.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].conv_value2.reg;
            kpu->layer_argument_fifo = task->remain_layers[i].dma_parameter.reg;
        }
        task->remain_layers += LAYER_BURST_SIZE;
        task->remain_layers_length -= LAYER_BURST_SIZE;
    }
    return 0;
}

static void kpu_data_output(kpu_task_t *task)
{
    sysctl_dma_select(task->dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
    dmac_irq_register(task->dma_ch, kpu_done, task, 1);
    dmac_set_single_mode(task->dma_ch, (void *)(&kpu->fifo_data_out), (void *)(task->dst), DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, task->dst_length);
}

static int kpu_data_ready(void *ctx)
{
    kpu_task_t *task = (kpu_task_t *)ctx;
    dmac->channel[task->dma_ch].intclear = 0xFFFFFFFF;
    kpu_data_output(task);
    kpu->eight_bit_mode.reg = task->eight_bit_mode;
    kpu->interrupt_mask.reg = 7;
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 12, .fifo_empty_threshold = 1};
    plic_set_priority(IRQN_AI_INTERRUPT, 2);
    plic_irq_register(IRQN_AI_INTERRUPT, kpu_config_input, task);
    plic_irq_enable(IRQN_AI_INTERRUPT);
    kpu_config_input(task);
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    return 0;
}

static void kpu_data_input(kpu_task_t *task)
{
    if(task->src == NULL)
    {
        kpu_data_ready(task);
        return;
    }
    dmac_irq_register(task->dma_ch, kpu_data_ready, task, 1);
    kpu_layer_argument_t *layer = &task->layers[0];
    dmac_set_single_mode(task->dma_ch, (void *)(uintptr_t)task->src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, task->src_length);
}

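/* Prepare a kpu_task_t for kpu_start(): enable output DMA and the interrupt on the
 * last layer, compute input/output sizes, and allocate the output buffer. */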
int kpu_single_task_init(kpu_task_t *task)
{
    sysctl_clock_enable(SYSCTL_CLOCK_AI);
    kpu_layer_argument_t *first_layer = &task->layers[0];
    kpu_layer_argument_t *last_layer = &task->layers[task->layers_length - 1];
    last_layer->dma_parameter.data.send_data_out = 1;
    last_layer->interrupt_enabe.data.int_en = 1;
    task->src_length = first_layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (first_layer->image_channel_num.data.i_ch_num + 1) / 8;
    task->dst_length = ((last_layer->dma_parameter.data.dma_total_byte + 1) + 7) / 8;
    task->dst = (uint64_t *)malloc(task->dst_length * 8);
    if(task->dst == NULL)
        return 1;
    memset(task->dst, 0, task->dst_length * 8);
    return 0;
}

int kpu_single_task_deinit(kpu_task_t *task)
{
    free(task->dst);
    return 0;
}

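/* Load a version-1 model laid out as header, per-layer metadata, then layer arguments;
 * weight/BN/activation offsets are rebased onto the buffer address. */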
int kpu_model_load_from_buffer(kpu_task_t *task, uint8_t *buffer, kpu_model_layer_metadata_t **meta)
{
    uintptr_t base_addr = (uintptr_t)buffer;
    kpu_model_header_t *header = (kpu_model_header_t *)buffer;
    kpu_model_layer_metadata_t *layer_meta = (kpu_model_layer_metadata_t *)(base_addr + sizeof(kpu_model_header_t));
    kpu_layer_argument_t *layers = (kpu_layer_argument_t *)(base_addr + header->layers_argument_start);
    if(header->version != 1)
        return -1;
    uint32_t layers_length = header->layers_length;
    task->layers_length = layers_length;
    task->eight_bit_mode = header->flags & 1;
    task->layers = layers;
    task->output_scale = layer_meta[layers_length - 1].output_scale;
    task->output_bias = layer_meta[layers_length - 1].output_bias;
    size_t i;
    for(i = 0; i < layers_length; i++)
    {
        layers[i].kernel_load_cfg.data.para_start_addr = (uint64_t)(base_addr + layer_meta[i].weigths_offset);
        layers[i].kernel_pool_type_cfg.data.bwsx_base_addr = (uint64_t)(base_addr + layer_meta[i].bn_offset);
        layers[i].kernel_calc_type_cfg.data.active_addr = (uint64_t)(base_addr + layer_meta[i].act_offset);
    }
    if(meta)
        *meta = layer_meta;
    return 0;
}

int kpu_start(kpu_task_t *task)
{
    if(atomic_cas(&kpu_status, 0, 1))
        return -1;
    task->remain_layers_length = task->layers_length;
    task->remain_layers = task->layers;
    kpu_data_input(task);
    return 0;
}

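/* Write one layer's twelve argument registers into the KPU layer-argument FIFO. */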
static void kpu_send_layer(const kpu_layer_argument_t *layer)
{
    kpu->layer_argument_fifo = layer->interrupt_enabe.reg;
    kpu->layer_argument_fifo = layer->image_addr.reg;
    kpu->layer_argument_fifo = layer->image_channel_num.reg;
    kpu->layer_argument_fifo = layer->image_size.reg;
    kpu->layer_argument_fifo = layer->kernel_pool_type_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_load_cfg.reg;
    kpu->layer_argument_fifo = layer->kernel_offset.reg;
    kpu->layer_argument_fifo = layer->kernel_calc_type_cfg.reg;
    kpu->layer_argument_fifo = layer->write_back_cfg.reg;
    kpu->layer_argument_fifo = layer->conv_value.reg;
    kpu->layer_argument_fifo = layer->conv_value2.reg;
    kpu->layer_argument_fifo = layer->dma_parameter.reg;
}

void kpu_init(int eight_bit_mode, plic_irq_callback_t callback, void *userdata)
{
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = eight_bit_mode};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    plic_set_priority(IRQN_AI_INTERRUPT, 1);
    plic_irq_register(IRQN_AI_INTERRUPT, callback, userdata);
    plic_irq_enable(IRQN_AI_INTERRUPT);
}

void kpu_input_dma(const kpu_layer_argument_t *layer, const uint8_t *src, dmac_channel_number_t dma_ch, plic_irq_callback_t callback, void *userdata)
{
    uint64_t input_size = layer->kernel_calc_type_cfg.data.channel_switch_addr * 64 * (layer->image_channel_num.data.i_ch_num + 1);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)src, (void *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64), DMAC_ADDR_INCREMENT, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_16, DMAC_TRANS_WIDTH_64, input_size / 8);
}

static void kpu_conv2d_core(kpu_layer_argument_t *layer)
{
    kpu_send_layer(layer);
}

void kpu_conv2d(kpu_layer_argument_t *layer)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};
    kpu_conv2d_core(layer);
}

void kpu_conv2d_output(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint8_t *dest, plic_irq_callback_t callback, void *userdata)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    layer->dma_parameter.data.send_data_out = 1;
    sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer->dma_parameter.data.dma_total_byte + 8) / 8);
    kpu_conv2d_core(layer);
}

void kpu_conv2d_output_full_add(kpu_layer_argument_t *layer, dmac_channel_number_t dma_ch, uint64_t *dest, plic_irq_callback_t callback, void *userdata)
{
    uint32_t channels = layer->image_channel_num.data.o_ch_num + 1;
    layer->interrupt_enabe.data.full_add = 1;
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    layer->dma_parameter.data.send_data_out = 1;
    sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
    dmac_set_irq(dma_ch, callback, userdata, 1);
    dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                         DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, channels);
    kpu_conv2d_core(layer);
}

void kpu_add(const uint8_t *src1, const quantize_param_t *src1_param, const uint8_t *src2, const quantize_param_t *src2_param, size_t count, uint8_t *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *src1_param, q2 = *src2_param, q3 = *dest_param;
    size_t i;
    for(i = 0; i < count; i++)
    {
        int value = ((*src1++ * q1.scale + q1.bias + *src2++ * q2.scale + q2.bias) - q3.bias) / q3.scale;
        if(value < 0)
            value = 0;
        if(value > 0xFF)
            value = 0xFF;
        *dest++ = value;
    }
}

void kpu_global_average_pool(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, uint8_t *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *src_param, q2 = *dest_param;
    size_t oc, y, x;
    if(((uintptr_t)dest) >= AI_IO_BASE_ADDR && ((uintptr_t)dest) < AI_IO_BASE_ADDR + 2 * 1024 * 1024)
    {
        uint32_t row_padding = 16;
        uint32_t row_group = 4;
        uint32_t row_length = 1;
        uint32_t height = 4;
        for(oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for(y = 0; y < 1; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for(x = 0; x < 1; x++)
                {
                    int64_t sum = 0;
                    size_t i;
                    for(i = 0; i < kernel_size; i++)
                        sum += *src++;
                    int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
                    if(value < 0)
                        value = 0;
                    if(value > 0xFF)
                        value = 0xFF;
                    y_origin[x] = value;
                }
            }
        }
    } else
    {
        for(oc = 0; oc < channels; oc++)
        {
            int64_t sum = 0;
            size_t i;
            for(i = 0; i < kernel_size; i++)
                sum += *src++;
            int value = ((sum * q1.scale + q1.bias) / kernel_size - q2.bias) / q2.scale;
            if(value < 0)
                value = 0;
            if(value > 0xFF)
                value = 0xFF;
            dest[oc] = value;
        }
    }
}

void kpu_global_average_pool_float(const uint8_t *src, const quantize_param_t *src_param, int kernel_size, int channels, float *dest)
{
    quantize_param_t q = *src_param;
    size_t oc;
    for(oc = 0; oc < channels; oc++)
    {
        int64_t sum = 0;
        size_t i;
        for(i = 0; i < kernel_size; i++)
            sum += *src++;
        float value = (sum * q.scale + q.bias) / kernel_size;
        dest[oc] = value;
    }
}

void kpu_matmul_end(const uint8_t *src, int channels, float *dest, const quantize_param_t *dest_param)
{
    quantize_param_t q1 = *dest_param;
    size_t i = 0;
    for(i = 0; i < channels; i++)
        *dest++ = src[i * 16] * q1.scale + q1.bias;
}

void kpu_fully_connected(const float *src, const float *weights, const float *biases, float *dest, int input_channels, int output_channels)
{
    int ic, oc;
    for(oc = 0; oc < output_channels; oc++)
    {
        const float *c_weights = weights + oc * input_channels;
        float sum = 0.0f;
        for(ic = 0; ic < input_channels; ic++)
            sum += src[ic] * c_weights[ic];
        dest[oc] = sum + biases[oc];
    }
}

void kpu_dequantize(const uint8_t *src, const quantize_param_t *src_param, size_t count, float *dest)
{
    quantize_param_t q1 = *src_param;
    size_t i = 0;
    for(i = 0; i < count; i++)
        *dest++ = src[i] * q1.scale + q1.bias;
}

void kpu_input_with_padding(kpu_layer_argument_t *layer, const uint8_t *src, int width, int height, int channels)
{
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + layer->image_addr.data.image_src_addr * 64);
    size_t oc, y, x;
    uint32_t row_padding;
    uint32_t row_group;
    uint32_t row_length;
    if(width <= 16)
    {
        row_padding = 16;
        row_group = 4;
        row_length = 1;
    } else if(width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    } else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (width + 63) / 64;
    }
    for(oc = 0; oc < channels; oc++)
    {
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
        for(y = 0; y < height; y++)
        {
            uint8_t *y_origin = channel_origin + y * row_length * 64;
            for(x = 0; x < width; x++)
                y_origin[x] = *src++;
        }
    }
}

#if USE_CACHED_AI_RAM
static void kpu_flush_cache(uint32_t addr, size_t lines)
{
    size_t line;
    for(line = 0; line < lines; line++)
    {
        const uint64_t *src = (const uint64_t *)(AI_RAM_BASE_ADDR + (addr + line) * 64);
        uint64_t *dest = (uint64_t *)(AI_IO_BASE_ADDR + (addr + line) * 64);
        size_t i;
        for(i = 0; i < 8; i++)
            dest[i] = src[i];
    }
}
#endif

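/* Right shift that rounds based on the last bit shifted out; used by the quantized
 * add path. */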
static int64_t kpu_carry_shift(int64_t value, uint32_t shift)
{
    if(shift > 0)
    {
        value >>= shift - 1;
        if(value & 0x1)
        {
            if(value < 0)
                value = (value >> 1) - 1;
            else
                value = (value >> 1) + 1;
        } else
        {
            value >>= 1;
        }
    }
    return value;
}

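/* Copy an image from CPU memory into KPU RAM using the hardware layout: rows are
 * padded to 64 bytes, and images no wider than 16 or 32 pixels pack 4 or 2 channels
 * per 64-byte row group. Uses 64-bit copies when source and width allow. */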
static void kpu_upload_core(size_t width, size_t height, size_t channels, const uint8_t *src, uint32_t kpu_addr)
{
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + kpu_addr * 64);
    size_t oc, y, x;
    uint32_t row_padding;
    uint32_t row_group;
    uint32_t row_length;
    if(width <= 16)
    {
        row_padding = 16;
        row_group = 4;
        row_length = 1;
    } else if(width <= 32)
    {
        row_padding = 32;
        row_group = 2;
        row_length = 1;
    } else
    {
        row_padding = 64;
        row_group = 1;
        row_length = (width + 63) / 64;
    }
    if((uintptr_t)src % 8 == 0 && width % 8 == 0)
    {
#define UPLOAD_BEGIN() \
    for(oc = 0; oc < channels; oc++) \
    { \
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding; \
        for(y = 0; y < height; y++) \
        { \
            uint64_t *y_origin = (uint64_t *)(channel_origin + y * row_length * 64);
#define UPLOAD_END() \
        } \
    }
        width /= 8;
        const uint64_t *u64_src = (const uint64_t *)src;
        if(width == 1)
        {
            UPLOAD_BEGIN()
            y_origin[0] = *u64_src++;
            UPLOAD_END()
        } else if(width == 2)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
            }
            UPLOAD_END()
        } else if(width == 4)
        {
            UPLOAD_BEGIN()
            {
                y_origin[0] = *u64_src++;
                y_origin[1] = *u64_src++;
                y_origin[2] = *u64_src++;
                y_origin[3] = *u64_src++;
            }
            UPLOAD_END()
        } else
        {
            UPLOAD_BEGIN()
            for(x = 0; x < width; x++)
                y_origin[x] = *u64_src++;
            UPLOAD_END()
        }
    } else
    {
        for(oc = 0; oc < channels; oc++)
        {
            uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
            for(y = 0; y < height; y++)
            {
                uint8_t *y_origin = channel_origin + y * row_length * 64;
                for(x = 0; x < width; x++)
                    y_origin[x] = *src++;
            }
        }
    }
}

static void kpu_kmodel_input_with_padding(const kpu_layer_argument_t *layer, const uint8_t *src)
{
    size_t width = layer->image_size.data.i_row_wid + 1;
    size_t height = layer->image_size.data.i_col_high + 1;
    size_t channels = layer->image_channel_num.data.i_ch_num + 1;
    kpu_upload_core(width, height, channels, src, layer->image_addr.data.image_src_addr);
}

static void kpu_kmodel_input_float(const float *src, float *dest, size_t count)
{
    memcpy(dest, src, count * sizeof(float));
}

static void kpu_float_activation(float *data, size_t count, kpu_model_activation_t act)
{
    size_t i;
    if(act == KLA_RELU)
    {
        for(i = 0; i < count; i++)
            data[i] = max(data[i], 0);
    } else if(act == KLA_RELU6)
    {
        for(i = 0; i < count; i++)
            data[i] = min(max(data[i], 0), 6);
    }
}

static void kpu_kmodel_add(const kpu_model_add_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src_a = (const float *)(ctx->main_buffer + arg->main_mem_in_a_address);
    const float *src_b = (const float *)(ctx->main_buffer + arg->main_mem_in_b_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t i, count = arg->count;
    for(i = 0; i < count; i++)
        dest[i] = src_a[i] + src_b[i];
}

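/* Elementwise add on quantized (uint8) tensors: offset and rescale both inputs, add,
 * then requantize to the output scale and clamp to [0, 255]. The body processes
 * 8 elements per iteration via the QADD_UNROLL_* macros; when the input shifts
 * differ, the inputs are shifted separately before the add. */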
static void kpu_quantized_add(const kpu_model_quant_add_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src_a = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_a_address);
    const uint8_t *src_b = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_b_address);
    size_t count = ALIGN_UP(arg->count, 8) / 8;
    int64_t off_a = arg->in_a_offset, mul_a = arg->in_a_mul, sh_a = arg->in_a_shift;
    int64_t off_b = arg->in_b_offset, mul_b = arg->in_b_mul, sh_b = arg->in_b_shift;
    int64_t off_o = arg->out_offset, mul_o = arg->out_mul, sh_o = arg->out_shift;
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t i;
    if(sh_a == sh_b)
    {
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;
#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;
#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;
#define QADD_UNROLL_4(x) \
    int64_t v##x = a##x + b##x;
#define QADD_UNROLL_5(x) \
    v##x >>= sh_a;
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);
#define QADD_UNROLL_8(x) \
    v##x += off_o;
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)
        for(i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    } else
    {
#undef QADD_UNROLL_1
#define QADD_UNROLL_1(x) \
    int64_t a##x = *src_a++; \
    int64_t b##x = *src_b++;
#undef QADD_UNROLL_2
#define QADD_UNROLL_2(x) \
    a##x += off_a; \
    b##x += off_b;
#undef QADD_UNROLL_3
#define QADD_UNROLL_3(x) \
    a##x *= mul_a; \
    b##x *= mul_b;
#undef QADD_UNROLL_4
#define QADD_UNROLL_4(x) \
    a##x >>= sh_a; \
    b##x >>= sh_b;
#undef QADD_UNROLL_5
#define QADD_UNROLL_5(x) \
    int64_t v##x = a##x + b##x;
#undef QADD_UNROLL_6
#define QADD_UNROLL_6(x) \
    v##x *= mul_o;
#undef QADD_UNROLL_7
#define QADD_UNROLL_7(x) \
    v##x = kpu_carry_shift(v##x, sh_o);
#undef QADD_UNROLL_8
#define QADD_UNROLL_8(x) \
    v##x += off_o;
#undef QADD_UNROLL_9
#define QADD_UNROLL_9(x) \
    v##x = min(0xFF, max(0, v##x));
#undef QADD_UNROLL_10
#define QADD_UNROLL_10(x) \
    *dest++ = v##x;
#undef QADD_UNROLL_S
#define QADD_UNROLL_S(x) \
    QADD_UNROLL_##x(0) \
    QADD_UNROLL_##x(1) \
    QADD_UNROLL_##x(2) \
    QADD_UNROLL_##x(3) \
    QADD_UNROLL_##x(4) \
    QADD_UNROLL_##x(5) \
    QADD_UNROLL_##x(6) \
    QADD_UNROLL_##x(7)
        for(i = 0; i < count; i++)
        {
            QADD_UNROLL_S(1);
            QADD_UNROLL_S(2);
            QADD_UNROLL_S(3);
            QADD_UNROLL_S(4);
            QADD_UNROLL_S(5);
            QADD_UNROLL_S(6);
            QADD_UNROLL_S(7);
            QADD_UNROLL_S(8);
            QADD_UNROLL_S(9);
            QADD_UNROLL_S(10);
        }
    }
}

static void kpu_global_average_pool2d(const kpu_model_gap2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, channels = arg->channels, kernel_size = arg->kernel_size;
    for(oc = 0; oc < channels; oc++)
    {
        float sum = 0.f;
        size_t i;
        for(i = 0; i < kernel_size; i++)
            sum += *src++;
        dest[oc] = sum / kernel_size;
    }
}

static void kpu_quantized_max_pool2d(const kpu_model_quant_max_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
    uint32_t out_y, out_x, oc;
    for(oc = 0; oc < out_shape.channels; oc++)
    {
        const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
        for(out_y = 0; out_y < out_shape.height; out_y++)
        {
            for(out_x = 0; out_x < out_shape.width; out_x++)
            {
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                uint8_t value = 0;
                int32_t kernel_y, kernel_x;
                for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value = max(value, channel_src[in_y * in_shape.width + in_x]);
                    }
                }
                *dest++ = value;
            }
        }
    }
}

static void kpu_average_pool2d(const kpu_model_ave_pool2d_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape, out_shape = arg->out_shape;
    uint32_t kernel_width = arg->kernel_width, kernel_height = arg->kernel_height;
    uint32_t stride_width = arg->stride_width, stride_height = arg->stride_height;
    uint32_t padding_width = arg->padding_width, padding_height = arg->padding_height;
    uint32_t out_y, out_x, oc;
    for(oc = 0; oc < out_shape.channels; oc++)
    {
        const float *channel_src = src + in_shape.width * in_shape.height * oc;
        for(out_y = 0; out_y < out_shape.height; out_y++)
        {
            for(out_x = 0; out_x < out_shape.width; out_x++)
            {
                int32_t in_x_origin = (int32_t)(out_x * stride_width) - padding_width;
                int32_t in_y_origin = (int32_t)(out_y * stride_height) - padding_height;
                int32_t kernel_x_start = max(0, -in_x_origin);
                int32_t kernel_x_end = min(kernel_width, in_shape.width - in_x_origin);
                int32_t kernel_y_start = max(0, -in_y_origin);
                int32_t kernel_y_end = min(kernel_height, in_shape.height - in_y_origin);
                float value = 0;
                float kernel_count = 0;
                int32_t kernel_y, kernel_x;
                for(kernel_y = kernel_y_start; kernel_y < kernel_y_end; kernel_y++)
                {
                    for(kernel_x = kernel_x_start; kernel_x < kernel_x_end; kernel_x++)
                    {
                        int32_t in_x = in_x_origin + kernel_x;
                        int32_t in_y = in_y_origin + kernel_y;
                        value += channel_src[in_y * in_shape.width + in_x];
                        kernel_count++;
                    }
                }
                *dest++ = value / kernel_count;
            }
        }
    }
}

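/* Quantize float data to uint8 with the layer's scale/bias, clamping to [0, 255]. */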
static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    size_t count = arg->count;
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    kpu_model_quant_param_t q = arg->quant_param;
    float scale = 1.f / q.scale;
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
    size_t i;
    for(i = 0; i < count; i++)
    {
        int value = roundf((*src++ - q.bias) * scale);
        if(value < 0)
            value = 0;
        if(value > 0xFF)
            value = 0xFF;
        *dest++ = (uint8_t)value;
    }
}

static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, count = arg->count;
    kpu_model_quant_param_t q = arg->quant_param;
    for(oc = 0; oc < count; oc++)
        dest[oc] = *src++ * q.scale + q.bias;
}

static void kpu_kmodel_channelwise_dequantize(const kpu_model_channelwise_dequant_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, i, channels = arg->channels, count = arg->channel_size;
    for(oc = 0; oc < channels; oc++)
    {
        const kpu_model_quant_param_t q = arg->quant_params[oc];
        for(i = 0; i < count; i++)
            *dest++ = *src++ * q.scale + q.bias;
    }
}

static void kpu_requantize(const kpu_model_requantize_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, count = arg->count;
    const uint8_t *table = arg->table;
    if(false && count % 8 == 0)
    {
        for(oc = 0; oc < count;)
        {
            dest[oc++] = table[*src++];
            dest[oc++] = table[*src++];
            dest[oc++] = table[*src++];
            dest[oc++] = table[*src++];
            dest[oc++] = table[*src++];
            dest[oc++] = table[*src++];
            dest[oc++] = table[*src++];
            dest[oc++] = table[*src++];
        }
    } else
    {
        for(oc = 0; oc < count; oc++)
            dest[oc] = table[src[oc]];
    }
}

static void kpu_l2_normalization(const kpu_model_l2_norm_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, channels = arg->channels;
    float sum = 0.f;
    const float epsilon = 1e-10f;
    for(oc = 0; oc < channels; oc++)
        sum += src[oc] * src[oc];
    if(sum < epsilon)
        sum = epsilon;
    sum = 1.f / sqrtf(sum);
    for(oc = 0; oc < channels; oc++)
        dest[oc] = src[oc] * sum;
}

static void kpu_softmax(const kpu_model_softmax_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, channels = arg->channels;
    float max = -FLT_MAX;
    for(oc = 0; oc < channels; oc++)
        max = fmaxf(max, src[oc]);
    float sum = 0.f;
    for(oc = 0; oc < channels; oc++)
    {
        float value = expf(src[oc] - max);
        sum += value;
        dest[oc] = value;
    }
    for(oc = 0; oc < channels; oc++)
        dest[oc] /= sum;
}

static void kpu_concat(const kpu_model_concat_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    uint32_t count = arg->input_count, i;
    for(i = 0; i < count; i++)
    {
        kpu_model_memory_range_t input = arg->inputs_mem[i];
        const uint8_t *src = (const uint8_t *)(ctx->main_buffer + input.start);
        memcpy(dest, src, input.size);
        dest += input.size;
    }
}

static void kpu_kmodel_fully_connected(const kpu_model_fully_connected_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    uint32_t in_channels = arg->in_channels, out_channels = arg->out_channels, ic, oc;
    float *weights = (float *)malloc(in_channels * out_channels * sizeof(float));
    float *bias = (float *)malloc(out_channels * sizeof(float));
    memcpy(weights, arg->weights, out_channels * in_channels * sizeof(float));
    memcpy(bias, arg->weights + in_channels * out_channels, out_channels * sizeof(float));
    if(in_channels % 8 == 0)
    {
#define FC_UNROLL_1(x) \
    float i##x = *c_src++; \
    float w##x = *c_weights++;
#define FC_UNROLL_2(x) \
    sum += i##x * w##x;
#define FC_UNROLL_S(x) \
    FC_UNROLL_##x(0) \
    FC_UNROLL_##x(1) \
    FC_UNROLL_##x(2) \
    FC_UNROLL_##x(3) \
    FC_UNROLL_##x(4) \
    FC_UNROLL_##x(5) \
    FC_UNROLL_##x(6) \
    FC_UNROLL_##x(7)
        for(oc = 0; oc < out_channels; oc++)
        {
            const float *c_src = src;
            const float *c_weights = weights + oc * in_channels;
            float sum = 0.0f;
            for(ic = 0; ic < in_channels / 8; ic++)
            {
                FC_UNROLL_S(1);
                FC_UNROLL_S(2);
            }
            dest[oc] = sum + bias[oc];
        }
    } else
    {
        for(oc = 0; oc < out_channels; oc++)
        {
            const float *c_weights = weights + oc * in_channels;
            float sum = 0.0f;
            for(ic = 0; ic < in_channels; ic++)
                sum += src[ic] * c_weights[ic];
            dest[oc] = sum + bias[oc];
        }
    }
    free(weights);
    free(bias);
    kpu_float_activation(dest, out_channels, arg->act);
}

static void kpu_tf_flatten(const kpu_model_tf_flatten_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->shape;
    uint32_t oc, oy, ox;
    for(oy = 0; oy < in_shape.height; oy++)
        for(ox = 0; ox < in_shape.width; ox++)
            for(oc = 0; oc < in_shape.channels; oc++)
                *dest++ = src[(oc * in_shape.height + oy) * in_shape.width + ox];
}

static void kpu_resize_nearest_neighbor(const kpu_model_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape;
    uint32_t out_width = arg->out_width, out_height = arg->out_height;
    uint32_t oc, oy, ox;
    float height_scale = (float)in_shape.height / out_height;
    float width_scale = (float)in_shape.width / out_width;
    for(oc = 0; oc < in_shape.channels; oc++)
    {
        const float *channel_src = src + in_shape.width * in_shape.height * oc;
        for(oy = 0; oy < out_height; oy++)
        {
            uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
            const float *y_origin = channel_src + in_y * in_shape.width;
            for(ox = 0; ox < out_width; ox++)
            {
                uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
                *dest++ = y_origin[in_x];
            }
        }
    }
}

static void kpu_quant_resize_nearest_neighbor(const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    kpu_model_shape_t in_shape = arg->in_shape;
    uint32_t out_width = arg->out_width, out_height = arg->out_height;
    uint32_t oc, oy, ox;
    float height_scale = (float)in_shape.height / out_height;
    float width_scale = (float)in_shape.width / out_width;
    for(oc = 0; oc < in_shape.channels; oc++)
    {
        const uint8_t *channel_src = src + in_shape.width * in_shape.height * oc;
        for(oy = 0; oy < out_height; oy++)
        {
            uint32_t in_y = (uint32_t)min(floorf(oy * height_scale), in_shape.height - 1);
            const uint8_t *y_origin = channel_src + in_y * in_shape.width;
            for(ox = 0; ox < out_width; ox++)
            {
                uint32_t in_x = (uint32_t)min(floorf(ox * width_scale), in_shape.width - 1);
                *dest++ = y_origin[in_x];
            }
        }
    }
}

static void kpu_logistic(const kpu_model_logistic_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
    float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
    size_t oc, channels = arg->channels;
    for(oc = 0; oc < channels; oc++)
        dest[oc] = 1.f / (1.f + expf(-src[oc]));
}

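/* Run one K210 convolution layer on the KPU: patch the weight/BN/activation addresses,
 * set up output DMA when the result goes back to main memory, then push the layer
 * arguments to the hardware. */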
static void kpu_conv(const kpu_model_conv_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    volatile kpu_layer_argument_t layer = *(const volatile kpu_layer_argument_t *)(ctx->model_buffer + arg->layer_offset);
    layer.kernel_load_cfg.data.para_start_addr = (uintptr_t)(ctx->model_buffer + arg->weights_offset) - IOMEM;
    layer.kernel_pool_type_cfg.data.bwsx_base_addr = (uintptr_t)(ctx->model_buffer + arg->bn_offset) - IOMEM;
    layer.kernel_calc_type_cfg.data.active_addr = (uintptr_t)(ctx->model_buffer + arg->act_offset) - IOMEM;
    if(arg->flags & KLF_MAIN_MEM_OUT)
    {
        dmac_channel_number_t dma_ch = ctx->dma_ch;
        uint8_t *dest = ctx->main_buffer + arg->main_mem_out_address;
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.dma_parameter.data.send_data_out = 1;
        sysctl_dma_select(dma_ch, SYSCTL_DMA_SELECT_AI_RX_REQ);
        if(ctx->current_layer != ctx->layers_length)
            dmac_set_irq(dma_ch, ai_step, ctx, 1);
        else
            dmac_set_irq(dma_ch, (plic_irq_callback_t)kpu_kmodel_done, ctx, 1);
        dmac_set_single_mode(dma_ch, (void *)(&kpu->fifo_data_out), dest, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                             DMAC_MSIZE_8, DMAC_TRANS_WIDTH_64, (layer.dma_parameter.data.dma_total_byte + 8) / 8);
    } else
    {
        kpu->interrupt_clear.data = (kpu_config_interrupt_t){
            .calc_done_int = 1,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        kpu->interrupt_mask.data = (kpu_config_interrupt_t){
            .calc_done_int = 0,
            .layer_cfg_almost_empty_int = 1,
            .layer_cfg_almost_full_int = 1};
        layer.interrupt_enabe.data.int_en = 1;
    }
    kpu_send_layer((const kpu_layer_argument_t *)&layer);
}

static void kpu_add_padding(const kpu_model_add_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
#if USE_CACHED_AI_RAM
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_RAM_BASE_ADDR + arg->kpu_mem_out_address * 64);
#else
    uint8_t *dest = (uint8_t *)(uintptr_t)(AI_IO_BASE_ADDR + arg->kpu_mem_out_address * 64);
#endif
    uint32_t row_padding = 16;
    uint32_t row_group = 4;
    uint32_t row_length = 1;
    uint32_t height = 4;
    uint32_t oc, x, y, channels = arg->channels;
    for(oc = 0; oc < channels; oc++)
    {
        uint8_t *channel_origin = dest + oc / row_group * row_length * height * 64 + oc % row_group * row_padding;
        for(y = 0; y < 1; y++)
        {
            uint8_t *y_origin = channel_origin + y * row_length * 64;
            for(x = 0; x < 1; x++)
                y_origin[x] = *src++;
        }
    }
#if USE_CACHED_AI_RAM
    uint32_t lines = row_length * height * channels / row_group;
    kpu_flush_cache(arg->kpu_mem_out_address, lines);
#endif
}

static void kpu_remove_padding(const kpu_model_remove_padding_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
    uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->main_mem_out_address);
    uint32_t oc, channels = arg->channels;
    for(oc = 0; oc < channels; oc++)
        *dest++ = src[oc * 16];
}

static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_context_t *ctx)
{
    size_t width = arg->width;
    size_t height = arg->height;
    size_t channels = arg->channels;
    kpu_upload_core(width, height, channels, ctx->main_buffer + arg->main_mem_in_address, arg->kpu_mem_out_address);
}

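/* Load a kmodel: version-3 models are parsed here, 'KMDL' models are handed to nncase. */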
int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
{
#if FIX_CACHE
    configASSERT(is_memory_cache((uintptr_t)buffer));
#endif
    uintptr_t base_addr = (uintptr_t)buffer;
    const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
    if(header->version == 3 && header->arch == 0)
    {
        ctx->is_nncase = 0;
        ctx->model_buffer = buffer;
        ctx->output_count = header->output_count;
        ctx->outputs = (const kpu_model_output_t *)(base_addr + sizeof(kpu_kmodel_header_t));
        ctx->layer_headers = (const kpu_model_layer_header_t *)((uintptr_t)ctx->outputs + sizeof(kpu_model_output_t) * ctx->output_count);
        ctx->layers_length = header->layers_length;
        ctx->body_start = (const uint8_t *)((uintptr_t)ctx->layer_headers + sizeof(kpu_model_layer_header_t) * header->layers_length);
        ctx->main_buffer = (uint8_t *)malloc(header->main_mem_usage);
        if(!ctx->main_buffer)
            return -1;
        uint32_t body_size = 0;
        for(int i = 0; i < ctx->layers_length; i++)
        {
            const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + i;
            body_size += cnt_layer_header->body_size;
        }
        uint8_t *body_start_iomem = (uint8_t *)((uintptr_t)ctx->body_start - IOMEM);
        const uint8_t *body_start_cache = ctx->body_start;
        memcpy(body_start_iomem, body_start_cache, body_size);
        for(int i = 0; i < body_size; i++)
        {
            configASSERT(body_start_iomem[i] == body_start_cache[i]);
        }
    } else if(header->version == 'KMDL')
    {
        return nncase_load_kmodel(ctx, buffer);
    } else
    {
        return -1;
    }
    return 0;
}

int kpu_get_output(kpu_model_context_t *ctx, uint32_t index, uint8_t **data, size_t *size)
{
    if(ctx->is_nncase)
        return nncase_get_output(ctx, index, data, size);

    if(index >= ctx->output_count)
        return -1;
    const kpu_model_output_t *output = ctx->outputs + index;
    *data = ctx->main_buffer + output->address;
    *size = output->size;
    return 0;
}
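/* Release the scratch memory owned by the context (nncase models are freed by the nncase runtime). */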
void kpu_model_free(kpu_model_context_t *ctx)
{
    if(ctx->is_nncase)
    {
        nncase_model_free(ctx);
        return;
    }
    free(ctx->main_buffer);
    ctx->main_buffer = NULL;
}
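/* Per-layer timing instrumentation, compiled in only when KPU_DEBUG is enabled. */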
#if KPU_DEBUG
static uint64_t last_time;
static uint64_t total_time;
static uint64_t kpu_time;
static uint32_t last_layer_type;

static const char *str_layer_type(uint32_t type)
{
    switch(type)
    {
        case KL_ADD:
            return "Add";
        case KL_QUANTIZED_ADD:
            return "QuantAdd";
        case KL_GLOBAL_AVERAGE_POOL2D:
            return "GAP";
        case KL_QUANTIZED_MAX_POOL2D:
            return "QuantMaxPool2d";
        case KL_AVERAGE_POOL2D:
            return "AveragePool2d";
        case KL_QUANTIZE:
            return "Quantize";
        case KL_DEQUANTIZE:
            return "Dequantize";
        case KL_REQUANTIZE:
            return "Requantize";
        case KL_L2_NORMALIZATION:
            return "L2Norm";
        case KL_SOFTMAX:
            return "Softmax";
        case KL_CONCAT:
            return "Concat";
        case KL_QUANTIZED_CONCAT:
            return "QuantConcat";
        case KL_FULLY_CONNECTED:
            return "FullyConnected";
        case KL_TENSORFLOW_FLATTEN:
            return "TFFlatten";
        case KL_RESIZE_NEAREST_NEIGHBOR:
            return "ResizeNearestNeighbor";
        case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
            return "QuantResizeNearestNeighbor";
        case KL_CHANNELWISE_DEQUANTIZE:
            return "ChannelwiseDequantize";
        case KL_LOGISTIC:
            return "Logistic";
        case KL_K210_CONV:
            return "K210Conv";
        case KL_K210_ADD_PADDING:
            return "K210AddPad";
        case KL_K210_REMOVE_PADDING:
            return "K210RemovePad";
        case KL_K210_UPLOAD:
            return "K210Upload";
        default:
            return "Unknown";
    }
}
#endif
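/* Final step of a kmodel run: clear and mask the KPU interrupts, print the timing
 * summary when KPU_DEBUG is set, and invoke the user's completion callback. */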
static int kpu_kmodel_done(kpu_model_context_t *ctx)
{
    kpu->interrupt_clear.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 1,
        .layer_cfg_almost_full_int = 1};
#if KPU_DEBUG
    uint32_t cnt_layer_id = ctx->current_layer - 1;
    uint64_t time = sysctl_get_time_us();
    if(last_time != 0)
    {
        uint64_t layer_time = time - last_time;
        printf("layer %d [%s]: %f ms\n", cnt_layer_id, str_layer_type(last_layer_type), layer_time / 1000.0);
        total_time += layer_time;
        if(last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }

    printf("KPU: %f ms\n", kpu_time / 1000.0);
    printf("CPU: %f ms\n", (total_time - kpu_time) / 1000.0);
    printf("Model: %f ms\n", total_time / 1000.0);
#endif
    ctx->done_callback(ctx->userdata);
    return 0;
}
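/*
 * Execute one layer and advance to the next.
 *
 * Software layers are dispatched to their CPU kernels and the walk recurses into
 * the next layer immediately. KL_K210_CONV programs the KPU and returns; the AI
 * interrupt (this same function, registered as the IRQ handler) resumes the walk
 * once the hardware finishes. The last layer ends in kpu_kmodel_done().
 */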
static int ai_step(void *userdata)
{
    kpu_model_context_t *ctx = (kpu_model_context_t *)userdata;

    uint32_t cnt_layer_id = ctx->current_layer++;
    const uint8_t *layer_body = ctx->current_body;
    const kpu_model_layer_header_t *cnt_layer_header = ctx->layer_headers + cnt_layer_id;
    ctx->current_body += cnt_layer_header->body_size;

#if KPU_DEBUG
    uint64_t time = sysctl_get_time_us();
    if(last_time != 0)
    {
        uint64_t layer_time = time - last_time;
        printf("layer %d [%s]: %f ms\n", cnt_layer_id - 1, str_layer_type(last_layer_type), layer_time / 1000.0);
        total_time += layer_time;
        if(last_layer_type == KL_K210_CONV)
            kpu_time += layer_time;
    }
    last_layer_type = cnt_layer_header->type;
    last_time = sysctl_get_time_us();
#endif

    switch(cnt_layer_header->type)
    {
        case KL_ADD:
            kpu_kmodel_add((const kpu_model_add_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_ADD:
            kpu_quantized_add((const kpu_model_quant_add_layer_argument_t *)layer_body, ctx);
            break;
        case KL_GLOBAL_AVERAGE_POOL2D:
            kpu_global_average_pool2d((const kpu_model_gap2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_MAX_POOL2D:
            kpu_quantized_max_pool2d((const kpu_model_quant_max_pool2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_AVERAGE_POOL2D:
            kpu_average_pool2d((const kpu_model_ave_pool2d_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZE:
            kpu_quantize((const kpu_model_quantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_DEQUANTIZE:
            kpu_kmodel_dequantize((const kpu_model_dequantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_REQUANTIZE:
            kpu_requantize((const kpu_model_requantize_layer_argument_t *)layer_body, ctx);
            break;
        case KL_L2_NORMALIZATION:
            kpu_l2_normalization((const kpu_model_l2_norm_layer_argument_t *)layer_body, ctx);
            break;
        case KL_SOFTMAX:
            kpu_softmax((const kpu_model_softmax_layer_argument_t *)layer_body, ctx);
            break;
        case KL_CONCAT:
        case KL_QUANTIZED_CONCAT:
            kpu_concat((const kpu_model_concat_layer_argument_t *)layer_body, ctx);
            break;
        case KL_FULLY_CONNECTED:
            kpu_kmodel_fully_connected((const kpu_model_fully_connected_layer_argument_t *)layer_body, ctx);
            break;
        case KL_TENSORFLOW_FLATTEN:
            kpu_tf_flatten((const kpu_model_tf_flatten_layer_argument_t *)layer_body, ctx);
            break;
        case KL_RESIZE_NEAREST_NEIGHBOR:
            kpu_resize_nearest_neighbor((const kpu_model_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
            break;
        case KL_QUANTIZED_RESIZE_NEAREST_NEIGHBOR:
            kpu_quant_resize_nearest_neighbor((const kpu_model_quant_resize_nearest_neighbor_layer_argument_t *)layer_body, ctx);
            break;
        case KL_CHANNELWISE_DEQUANTIZE:
            kpu_kmodel_channelwise_dequantize((const kpu_model_channelwise_dequant_argument_t *)layer_body, ctx);
            break;
        case KL_LOGISTIC:
            kpu_logistic((const kpu_model_logistic_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_CONV:
            kpu_conv((const kpu_model_conv_layer_argument_t *)layer_body, ctx);
            /* Hardware layer: the KPU interrupt re-enters ai_step when the convolution finishes. */
            return 0;
        case KL_K210_ADD_PADDING:
            kpu_add_padding((const kpu_model_add_padding_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_REMOVE_PADDING:
            kpu_remove_padding((const kpu_model_remove_padding_layer_argument_t *)layer_body, ctx);
            break;
        case KL_K210_UPLOAD:
            kpu_upload((const kpu_model_upload_layer_argument_t *)layer_body, ctx);
            break;
        default:
            assert(!"Layer is not supported.");
    }

    if(cnt_layer_id != (ctx->layers_length - 1))
        ai_step(userdata);
    else
        kpu_kmodel_done(ctx);
    return 0;
}
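/* Drive the layer walk from thread context with interrupts masked, so the AI ISR
 * cannot re-enter ai_step while it runs. */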
static void ai_step_not_isr(void *userdata)
{
    sysctl_disable_irq();
    ai_step(userdata);
    sysctl_enable_irq();
}
int kpu_run_kmodel(kpu_model_context_t *ctx, const uint8_t *src, dmac_channel_number_t dma_ch, kpu_done_callback_t done_callback, void *userdata)
{
    if(ctx->is_nncase)
        return nncase_run_kmodel(ctx, src, dma_ch, done_callback, userdata);

    ctx->dma_ch = dma_ch;
    ctx->done_callback = done_callback;
    ctx->userdata = userdata;
    ctx->current_layer = 0;
    ctx->current_body = ctx->body_start;
#if KPU_DEBUG
    last_time = 0;
    total_time = 0;
    kpu_time = 0;
#endif

    kpu_kmodel_header_t *header = (kpu_kmodel_header_t *)ctx->model_buffer;
    kpu->interrupt_clear.reg = 7;
    kpu->fifo_threshold.data = (kpu_config_fifo_threshold_t){
        .fifo_full_threshold = 10, .fifo_empty_threshold = 1};
    kpu->eight_bit_mode.data = (kpu_config_eight_bit_mode_t){
        .eight_bit_mode = header->flags & 1};
    kpu->interrupt_mask.data = (kpu_config_interrupt_t){
        .calc_done_int = 1,
        .layer_cfg_almost_empty_int = 0,
        .layer_cfg_almost_full_int = 1};

    plic_set_priority(IRQN_AI_INTERRUPT, 1);
    plic_irq_register(IRQN_AI_INTERRUPT, ai_step, ctx);
    plic_irq_enable(IRQN_AI_INTERRUPT);

    const kpu_model_layer_header_t *first_layer_header = ctx->layer_headers;
    switch(first_layer_header->type)
    {
        case KL_K210_CONV:
        {
            const kpu_model_conv_layer_argument_t *first_layer = (const kpu_model_conv_layer_argument_t *)ctx->body_start;
            kpu_layer_argument_t layer_arg = *(volatile kpu_layer_argument_t *)(ctx->model_buffer + first_layer->layer_offset);

            if((layer_arg.image_size.data.i_row_wid + 1) % 64 != 0)
            {
                /* Row width is not a multiple of 64: copy the input with padding on the CPU, then start stepping. */
                kpu_kmodel_input_with_padding(&layer_arg, src);
                ai_step_not_isr(ctx);
            } else
            {
                /* Row width fits the KPU layout: DMA the input and let the completion callback start stepping. */
                kpu_input_dma(&layer_arg, src, ctx->dma_ch, ai_step, ctx);
            }
        }
        break;
        case KL_FULLY_CONNECTED:
        {
            const kpu_model_fully_connected_layer_argument_t *first_layer = (const kpu_model_fully_connected_layer_argument_t *)ctx->body_start;
            kpu_kmodel_input_float((const float *)src, (float *)(ctx->main_buffer + first_layer->main_mem_in_address), first_layer->in_channels);
            ai_step_not_isr(ctx);
        }
        break;
        default:
            return -1;
    }

    return 0;
}
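/*
 * Typical call sequence (a minimal sketch, assuming a kmodel blob in `model_data`,
 * an input image in `input`, and DMA channel 5 being free; names other than the
 * kpu_* API are illustrative only):
 *
 *     static kpu_model_context_t task;
 *     static volatile int done;
 *     static void on_done(void *userdata) { done = 1; }
 *
 *     if(kpu_load_kmodel(&task, model_data) != 0)
 *         return;                                  // bad or unsupported kmodel
 *     done = 0;
 *     kpu_run_kmodel(&task, input, DMAC_CHANNEL5, on_done, NULL);
 *     while(!done)
 *         ;                                        // or do other work until the callback fires
 *     float *output; size_t size;
 *     kpu_get_output(&task, 0, (uint8_t **)&output, &size);
 *     // ... consume the output before the next run ...
 *     kpu_model_free(&task);
 */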