xiangbingj před 6 roky
rodič
revize
4f73860ea2

+ 10 - 0
lib/bsp/syscalls.c

@@ -97,6 +97,8 @@ static const char *TAG = "SYSCALL";
 extern char _heap_start[];
 extern char _heap_end[];
 char *_heap_cur = &_heap_start[0];
+char *_heap_line = &_heap_start[0]; /* highest break address sys_brk() has accepted so far */
+char *_ioheap_line = &_heap_end[0]-0x40000000; /* starts at heap end minus 0x40000000; moved down by iomem_malloc() and back up by k_free() in iomem.c. NOTE(review): 0x40000000 looks like the distance between two aliases of the same RAM - confirm */
 
 sys_putchar_t sys_putchar;
 sys_getchar_t sys_getchar;
@@ -184,6 +186,14 @@ static size_t sys_brk(size_t pos)
             res = -ENOMEM;
         } else
         {
+            if((uintptr_t)pos > (uintptr_t)_heap_line)
+            {
+                _heap_line = (char *)(uintptr_t)pos;
+                if((uintptr_t)_heap_line-0x40000000 > (uintptr_t)_ioheap_line)
+                {
+                    LOGE(TAG, "WARNING: cache heap line > iomem heap line!\r\n");
+                }
+            }
             /* Adjust brk pointer. */
             _heap_cur = (char *)(uintptr_t)pos;
             /* Return current address. */

+ 80 - 0
lib/drivers/dmac.c

@@ -15,18 +15,27 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 #include "dmac.h"
 #include "fpioa.h"
 #include "plic.h"
 #include "stdlib.h"
 #include "sysctl.h"
 #include "utils.h"
+#include "iomem.h"
 
 volatile dmac_t *const dmac = (dmac_t *)DMAC_BASE_ADDR;
 
 typedef struct _dmac_context
 {
     dmac_channel_number_t dmac_channel;
+#if FIX_CACHE /* bounce-buffer state, filled in by dmac_set_channel_param() when src/dest pass is_memory_cache() */
+    uint8_t *dest_buffer; /* caller's original destination; non-NULL means a copy-back is pending */
+    uint8_t *src_malloc;  /* iomem_malloc() copy of the source that the DMA reads from */
+    uint8_t *dest_malloc; /* iomem_malloc() buffer the DMA writes into */
+    size_t buf_len;       /* number of bytes to copy from dest_malloc to dest_buffer */
+#endif
+
     plic_irq_callback_t callback;
     void *ctx;
 } dmac_context_t;
@@ -353,6 +362,40 @@ int dmac_set_channel_param(dmac_channel_number_t channel_num,
     dmac_ch_ctl_u_t ctl;
     dmac_ch_cfg_u_t cfg_u;
 
+#if FIX_CACHE
+    uint8_t *src_io = (uint8_t *)src;
+    uint8_t *dest_io = (uint8_t *)dest;
+    if(is_memory_cache((uintptr_t)src))
+    {
+        if(src_inc == DMAC_ADDR_NOCHANGE)
+        {
+            src_io = (uint8_t *)iomem_malloc(1<<dmac_trans_width);
+            memcpy(src_io, src, 1<<dmac_trans_width);
+        }
+        else
+        {
+            src_io = (uint8_t *)iomem_malloc(blockSize * (1<<dmac_trans_width));
+            memcpy(src_io, src, blockSize * (1<<dmac_trans_width));
+        }
+        dmac_context[channel_num].src_malloc = src_io;
+    }
+    if(is_memory_cache((uintptr_t)dest))
+    {
+        if(dest_inc == DMAC_ADDR_NOCHANGE)
+        {
+            dest_io = (uint8_t *)iomem_malloc(1<<dmac_trans_width);
+            dmac_context[channel_num].buf_len = 1<<dmac_trans_width;
+        }
+        else
+        {
+            dest_io = (uint8_t *)iomem_malloc(blockSize * (1<<dmac_trans_width));
+            dmac_context[channel_num].buf_len = blockSize * (1<<dmac_trans_width);
+        }
+        dmac_context[channel_num].dest_malloc = dest_io;
+        dmac_context[channel_num].dest_buffer = dest;
+    }
+#endif
+
     int mem_type_src = is_memory((uintptr_t)src), mem_type_dest = is_memory((uintptr_t)dest);
     dmac_transfer_flow_t flow_control;
     if(mem_type_src == 0 && mem_type_dest == 0)
@@ -381,8 +424,13 @@ int dmac_set_channel_param(dmac_channel_number_t channel_num,
 
     writeq(cfg_u.data, &dmac->channel[channel_num].cfg);
 
+#if FIX_CACHE
+    dmac->channel[channel_num].sar = (uint64_t)src_io;
+    dmac->channel[channel_num].dar = (uint64_t)dest_io;
+#else
     dmac->channel[channel_num].sar = (uint64_t)src;
     dmac->channel[channel_num].dar = (uint64_t)dest;
+#endif
 
     ctl.data = readq(&dmac->channel[channel_num].ctl);
     ctl.ch_ctl.sms = DMAC_MASTER1;
@@ -736,6 +784,22 @@ int dmac_is_done(dmac_channel_number_t channel_num)
 void dmac_wait_done(dmac_channel_number_t channel_num)
 {
     dmac_wait_idle(channel_num);
+#if FIX_CACHE /* once the channel is idle, settle the bounce buffers recorded for it in dmac_context[] */
+    if(dmac_context[channel_num].dest_buffer) /* a destination bounce buffer is pending */
+    {
+        memcpy(dmac_context[channel_num].dest_buffer, dmac_context[channel_num].dest_malloc, dmac_context[channel_num].buf_len); /* hand the received bytes to the caller's buffer */
+
+        iomem_free(dmac_context[channel_num].dest_malloc);
+        dmac_context[channel_num].dest_malloc = NULL;
+        dmac_context[channel_num].dest_buffer = NULL; /* cleared so the copy-back is not repeated */
+        dmac_context[channel_num].buf_len = 0;
+    }
+    if(dmac_context[channel_num].src_malloc) /* the source copy is no longer needed */
+    {
+        iomem_free(dmac_context[channel_num].src_malloc);
+        dmac_context[channel_num].src_malloc = NULL;
+    }
+#endif
 }
 
 int dmac_is_idle(dmac_channel_number_t channel_num)
@@ -771,6 +835,22 @@ static int dmac_irq_callback(void *ctx)
     dmac_context_t *v_dmac_context = (dmac_context_t *)(ctx);
     dmac_channel_number_t v_dmac_channel = v_dmac_context->dmac_channel;
     dmac_channel_interrupt_clear(v_dmac_channel);
+#if FIX_CACHE
+    if(v_dmac_context->dest_buffer)
+    {
+        memcpy(v_dmac_context->dest_buffer, v_dmac_context->dest_malloc, v_dmac_context->buf_len);
+        iomem_free(v_dmac_context->dest_malloc);
+        v_dmac_context->dest_malloc = NULL;
+        v_dmac_context->dest_buffer = NULL;
+        v_dmac_context->buf_len = 0;
+    }
+    if(v_dmac_context->src_malloc)
+    {
+        iomem_free(v_dmac_context->src_malloc);
+        v_dmac_context->src_malloc = NULL;
+    }
+#endif
+
     if(v_dmac_context->callback != NULL)
         v_dmac_context->callback(v_dmac_context->ctx);
 

+ 8 - 0
lib/drivers/dvp.c

@@ -201,6 +201,11 @@ void dvp_set_image_size(uint32_t width, uint32_t height)
 
 void dvp_set_ai_addr(uint32_t r_addr, uint32_t g_addr, uint32_t b_addr)
 {
+#if FIX_CACHE
+    configASSERT(!is_memory_cache((uintptr_t)r_addr));
+    configASSERT(!is_memory_cache((uintptr_t)g_addr));
+    configASSERT(!is_memory_cache((uintptr_t)b_addr));
+#endif
     dvp->r_addr = r_addr;
     dvp->g_addr = g_addr;
     dvp->b_addr = b_addr;
@@ -208,6 +213,9 @@ void dvp_set_ai_addr(uint32_t r_addr, uint32_t g_addr, uint32_t b_addr)
 
 void dvp_set_display_addr(uint32_t addr)
 {
+#if FIX_CACHE /* reject addresses inside the range is_memory_cache() reports */
+    configASSERT(!is_memory_cache((uintptr_t)addr));
+#endif
     dvp->rgb_addr = addr;
 }
 

+ 18 - 3
lib/drivers/i2c.c

@@ -21,6 +21,7 @@
 #include "string.h"
 #include "sysctl.h"
 #include "utils.h"
+#include "iomem.h"
 
 typedef struct _i2c_slave_instance
 {
@@ -167,7 +168,11 @@ void i2c_send_data_dma(dmac_channel_number_t dma_channel_num, i2c_device_number_
     configASSERT(i2c_num < I2C_MAX_NUM);
     volatile i2c_t *i2c_adapter = i2c[i2c_num];
     i2c_adapter->clr_tx_abrt = i2c_adapter->clr_tx_abrt;
+#if FIX_CACHE
+    uint32_t *buf = iomem_malloc(send_buf_len * sizeof(uint32_t));
+#else
     uint32_t *buf = malloc(send_buf_len * sizeof(uint32_t));
+#endif
     int i;
     for(i = 0; i < send_buf_len; i++)
     {
@@ -179,7 +184,11 @@ void i2c_send_data_dma(dmac_channel_number_t dma_channel_num, i2c_device_number_
                          DMAC_MSIZE_4, DMAC_TRANS_WIDTH_32, send_buf_len);
 
     dmac_wait_done(dma_channel_num);
+#if FIX_CACHE
+    iomem_free((void *)buf);
+#else
     free((void *)buf);
+#endif
 
     while((i2c_adapter->status & I2C_STATUS_ACTIVITY) || !(i2c_adapter->status & I2C_STATUS_TFE))
     {
@@ -233,8 +242,11 @@ void i2c_recv_data_dma(dmac_channel_number_t dma_send_channel_num, dmac_channel_
     configASSERT(i2c_num < I2C_MAX_NUM);
 
     volatile i2c_t *i2c_adapter = i2c[i2c_num];
-
+#if FIX_CACHE
+    uint32_t *write_cmd = iomem_malloc(sizeof(uint32_t) * (send_buf_len + receive_buf_len));
+#else
     uint32_t *write_cmd = malloc(sizeof(uint32_t) * (send_buf_len + receive_buf_len));
+#endif
     size_t i;
     for(i = 0; i < send_buf_len; i++)
         write_cmd[i] = *send_buf++;
@@ -257,8 +269,11 @@ void i2c_recv_data_dma(dmac_channel_number_t dma_send_channel_num, dmac_channel_
     {
         receive_buf[i] = (uint8_t)write_cmd[i];
     }
-
-    free(write_cmd);
+#if FIX_CACHE
+        iomem_free(write_cmd);
+#else
+        free(write_cmd);
+#endif
 }
 
 static int i2c_dma_irq(void *ctx)

+ 8 - 0
lib/drivers/include/iomem.h

@@ -0,0 +1,8 @@
+#ifndef _IOMEM_MALLOC_H
+#define _IOMEM_MALLOC_H
+
+/* uint32_t is used in the prototypes below, so the header must not rely on its includer for it. */
+#include <stdint.h>
+
+/* Return a block obtained from iomem_malloc() to the pool. A NULL pointer is ignored. */
+void iomem_free(void *paddr);
+
+/*
+ * Reserve room for `size` bytes in the I/O-memory pool (rounded up to whole allocator blocks).
+ * Returns NULL when size is 0 or when no sufficiently large free run exists.
+ */
+void *iomem_malloc(uint32_t size);
+
+/* Byte distance between the regular heap's high-water mark and the lowest I/O allocation. */
+uint32_t iomem_unused(void);
+
+#endif

+ 3 - 0
lib/drivers/include/utils.h

@@ -33,6 +33,8 @@ extern "C" {
 #define KENDRYTE_MIN(a, b) ((a) > (b) ? (b) : (a))
 #define KENDRYTE_MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#define FIX_CACHE 1
+
 #ifdef __ASSEMBLY__
 #define KENDRYTE_CAST(type, ptr) ptr
 #else /* __ASSEMBLY__ */
@@ -340,6 +342,7 @@ uint32_t get_bit(volatile uint32_t *bits, uint32_t mask, size_t offset);
  */
 uint32_t get_gpio_bit(volatile uint32_t *bits, size_t offset);
 
+uint32_t is_memory_cache(uintptr_t address);
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */

+ 169 - 0
lib/drivers/iomem.c

@@ -0,0 +1,169 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "iomem.h"
+#include "printf.h"
+#include "atomic.h"
+
+#define IOMEM_BLOCK_SIZE 256
+
+typedef struct _iomem_malloc_t /* bookkeeping for the I/O-memory pool */
+{
+    void (*init)();       /* setup hook; k_malloc() and k_free() call it while memrdy is 0 */
+    uint32_t (*unused)(); /* free-space query; iomem_unused() forwards to it */
+    uint8_t *membase;     /* lowest address of the pool */
+    uint32_t memsize;     /* pool size in bytes */
+    uint32_t memtblsize;  /* number of IOMEM_BLOCK_SIZE blocks, i.e. entries in memmap */
+    uint16_t *memmap;     /* per-block table: the first and last block of an allocation hold its block count, all other entries are 0 */
+    uint8_t  memrdy;      /* set to 1 at the end of init */
+} iomem_malloc_t;
+
+static void iomem_init();
+static uint32_t k_unused();
+extern char *_ioheap_line;
+extern char *_heap_line;
+extern char _heap_start[];
+extern char *_heap_cur;
+
+/* The allocator instance. Only the two hooks are known up front; the pool geometry is filled in by init. */
+iomem_malloc_t malloc_cortol =
+{
+    .init = iomem_init,
+    .unused = k_unused,
+    .membase = NULL,
+    .memsize = 0,
+    .memtblsize = 0,
+    .memmap = NULL,
+    .memrdy = 0,
+};
+
+/* Store the byte value c into each of the first num bytes starting at s, one byte at a time. */
+static void iomem_set(void *s, uint8_t c, uint32_t num)
+{
+    uint8_t *bytes = s;
+
+    for(uint32_t i = 0; i < num; i++)
+    {
+        bytes[i] = c;
+    }
+}
+
+static void iomem_init() /* one-time setup: derive the pool geometry, allocate and clear the block table, clear the pool */
+{
+    malloc_cortol.membase = (uint8_t *)((uintptr_t)_heap_line-0x40000000); /* pool begins at the heap high-water mark shifted down by 0x40000000 */
+    malloc_cortol.memsize = (uint32_t)_ioheap_line - (uint32_t)malloc_cortol.membase; /* and extends up to _ioheap_line */
+
+    malloc_cortol.memtblsize = malloc_cortol.memsize / IOMEM_BLOCK_SIZE;
+    malloc_cortol.memmap = (uint16_t *)malloc(malloc_cortol.memtblsize * 2); /* one uint16_t per block. NOTE(review): the result is not checked for NULL before it is cleared below */
+    mb(); /* NOTE(review): the three assignments that follow repeat the ones above, presumably because the malloc() call can raise _heap_line - confirm */
+
+    malloc_cortol.membase = (uint8_t *)((uintptr_t)_heap_line-0x40000000);
+    malloc_cortol.memsize = (uint32_t)_ioheap_line - (uint32_t)malloc_cortol.membase;
+    malloc_cortol.memtblsize = malloc_cortol.memsize / IOMEM_BLOCK_SIZE;
+
+    iomem_set(malloc_cortol.memmap, 0, malloc_cortol.memtblsize * 2); /* every block starts out free */
+    iomem_set(malloc_cortol.membase, 0, malloc_cortol.memsize);       /* zero the pool itself */
+    malloc_cortol.memrdy = 1;
+}
+
+/*
+ * Free-space figure: the lowest I/O allocation address, shifted back up by 0x40000000,
+ * minus the regular heap's high-water mark.
+ */
+static uint32_t k_unused()
+{
+    uintptr_t io_floor = (uintptr_t)_ioheap_line + 0x40000000;
+
+    return (uint32_t)(io_floor - (uintptr_t)_heap_line);
+}
+
+/*
+ * Find room for `size` bytes in the pool and mark it as used.
+ *
+ * The block table is scanned from its last entry downwards, so allocations are placed as high
+ * in the pool as possible. The first and the last table entry of an allocation store
+ * its length in blocks; a non-zero entry met during the downward scan is therefore the last block
+ * of an existing allocation, and the scan jumps to that allocation's first block.
+ *
+ * Returns the byte offset of the new allocation from membase, or 0XFFFFFFFF when size is 0, when
+ * the request needs more blocks than a table entry can record, or when no free run is long enough.
+ */
+static uint32_t k_malloc(uint32_t size)
+{
+    signed long offset = 0;
+    uint32_t xmemb;     /* blocks required for this request */
+    uint32_t kmemb = 0; /* length of the free run found so far */
+
+    if(!malloc_cortol.memrdy)
+        malloc_cortol.init();
+    if(size==0)
+        return 0XFFFFFFFF;
+    xmemb=size / IOMEM_BLOCK_SIZE;
+    if(size % IOMEM_BLOCK_SIZE)
+        xmemb++;
+    /* Table entries are uint16_t: a longer run could not be recorded without truncation. */
+    if(xmemb > 0xFFFF)
+        return 0XFFFFFFFF;
+    /*
+     * Convert to signed before subtracting. memtblsize is unsigned, so "memtblsize-1" wraps to
+     * 0xFFFFFFFF for an empty table; where long is wider than 32 bits that value stays positive
+     * and the scan would start far outside the table.
+     */
+    for(offset=(signed long)malloc_cortol.memtblsize-1; offset>=0; offset--)
+    {
+        if(!malloc_cortol.memmap[offset])
+        {
+            kmemb++;
+        }
+        else
+        {
+            /* Skip the whole allocation whose last block this is; the loop's offset-- then steps below it. */
+            offset = offset - malloc_cortol.memmap[offset] + 1;
+            kmemb=0;
+        }
+        if(kmemb==xmemb)
+        {
+            malloc_cortol.memmap[offset] = xmemb;
+            malloc_cortol.memmap[offset+xmemb-1] = xmemb;
+            return (offset * IOMEM_BLOCK_SIZE);
+        }
+    }
+    return 0XFFFFFFFF;
+}
+
+/*
+ * Release the allocation that starts `offset` bytes above membase.
+ *
+ * Returns:
+ *   0  the allocation was released;
+ *   1  the allocator was not initialised yet (it is initialised now, nothing is released);
+ *   2  offset does not fall on a block of the pool;
+ *   3  offset is not the first block of a live allocation, e.g. the block was already freed.
+ */
+static uint8_t k_free(uint32_t offset)
+{
+    if(!malloc_cortol.memrdy)
+    {
+        malloc_cortol.init();
+        return 1;
+    }
+    if(offset >= malloc_cortol.memsize)
+        return 2;
+
+    uint32_t index = offset / IOMEM_BLOCK_SIZE;
+    /* memsize need not be a multiple of the block size; the trailing partial block has no table entry. */
+    if(index >= malloc_cortol.memtblsize)
+        return 2;
+
+    uint32_t nmemb = malloc_cortol.memmap[index];
+    /*
+     * A live allocation has the same non-zero count in its first and last entry. Without this
+     * check a repeated free (count 0) would clear memmap[index-1], wiping the end marker of the
+     * neighbouring allocation or writing in front of the table when index is 0.
+     */
+    if(nmemb == 0 || nmemb > malloc_cortol.memtblsize - index ||
+       malloc_cortol.memmap[index+nmemb-1] != nmemb)
+        return 3;
+
+    malloc_cortol.memmap[index] = 0;
+    malloc_cortol.memmap[index+nmemb-1] = 0;
+
+    /* If this was the allocation sitting at _ioheap_line, move the line up past it. */
+    if((uintptr_t)_ioheap_line == (uintptr_t)malloc_cortol.membase + offset)
+    {
+        _ioheap_line = (char *)((uintptr_t)_ioheap_line + nmemb * IOMEM_BLOCK_SIZE);
+    }
+    return 0;
+}
+
+/* Translate the pointer into an offset from the pool base and pass it to k_free(). NULL is a no-op. */
+void iomem_free(void *paddr)
+{
+    if(paddr != NULL)
+    {
+        uint32_t offset = (uintptr_t)paddr - (uintptr_t)malloc_cortol.membase;
+        k_free(offset);
+    }
+}
+
+/*
+ * Allocate from the I/O-memory pool. On success the returned address is membase plus the offset
+ * chosen by k_malloc(), and _ioheap_line is lowered to it when the new block lies below the line.
+ * On failure a message is printed and NULL is returned.
+ */
+void *iomem_malloc(uint32_t size)
+{
+    uint32_t offset = k_malloc(size);
+
+    if(offset == 0XFFFFFFFF)
+    {
+        printk("IOMEM malloc OUT of MEMORY!\r\n");
+        return NULL;
+    }
+
+    uintptr_t addr = (uintptr_t)malloc_cortol.membase + offset;
+
+    if(addr < (uintptr_t)_ioheap_line)
+    {
+        _ioheap_line = (char *)addr;
+        /* The pool has grown down into the region already claimed by the regular heap. */
+        if((uintptr_t)_ioheap_line < (uintptr_t)_heap_line-0x40000000)
+        {
+            printk("WARNING: iomem heap line < cache heap line!\r\n");
+        }
+    }
+
+    return (void *)addr;
+}
+
+/* Public wrapper: ask the allocator instance's `unused` hook for the free byte count. */
+uint32_t iomem_unused()
+{
+    uint32_t free_bytes = malloc_cortol.unused();
+
+    return free_bytes;
+}
+

+ 17 - 4
lib/drivers/kpu.c

@@ -11,6 +11,7 @@
 #include "kpu.h"
 #include "printf.h"
 #include "nncase.h"
+#include "utils.h"
 
 #define LAYER_BURST_SIZE 12
 
@@ -1004,8 +1005,13 @@ static void kpu_quantize(const kpu_model_quantize_layer_argument_t *arg, kpu_mod
 {
     size_t count = arg->count;
     const float *src = (const float *)(ctx->main_buffer + arg->main_mem_in_address);
-    ;
-    const kpu_model_quant_param_t q = arg->quant_param;
+
+    kpu_model_quant_param_t q;
+#if FIX_CACHE
+    memcpy(&q, &arg->quant_param, sizeof(kpu_model_quant_param_t));
+#else
+    q = arg->quant_param;
+#endif
     float scale = 1.f / q.scale;
 
     uint8_t *dest = (uint8_t *)(ctx->main_buffer + arg->mem_out_address);
@@ -1026,8 +1032,12 @@ static void kpu_kmodel_dequantize(const kpu_model_dequantize_layer_argument_t *a
     const uint8_t *src = (const uint8_t *)(ctx->main_buffer + arg->main_mem_in_address);
     float *dest = (float *)(ctx->main_buffer + arg->main_mem_out_address);
     size_t oc, count = arg->count;
-    const kpu_model_quant_param_t q = arg->quant_param;
-
+    kpu_model_quant_param_t q;
+#if FIX_CACHE
+    memcpy(&q, &arg->quant_param, sizeof(kpu_model_quant_param_t));
+#else
+    q = arg->quant_param;
+#endif
     for(oc = 0; oc < count; oc++)
         dest[oc] = *src++ * q.scale + q.bias;
 }
@@ -1357,6 +1367,9 @@ static void kpu_upload(const kpu_model_upload_layer_argument_t *arg, kpu_model_c
 
 int kpu_load_kmodel(kpu_model_context_t *ctx, const uint8_t *buffer)
 {
+#if FIX_CACHE
+    configASSERT(!is_memory_cache((uintptr_t)buffer));
+#endif
     uintptr_t base_addr = (uintptr_t)buffer;
     const kpu_kmodel_header_t *header = (const kpu_kmodel_header_t *)buffer;
 

+ 75 - 4
lib/drivers/spi.c

@@ -22,6 +22,7 @@
 #include "spi.h"
 #include "sysctl.h"
 #include "utils.h"
+#include "iomem.h"
 
 volatile spi_t *const spi[4] =
     {
@@ -411,7 +412,11 @@ void spi_send_data_normal_dma(dmac_channel_number_t channel_num, spi_device_num_
     switch(spi_transfer_width)
     {
         case SPI_TRANS_SHORT:
-            buf = malloc((tx_len) * sizeof(uint32_t));
+#if FIX_CACHE
+            buf = (uint32_t *)iomem_malloc((tx_len) * sizeof(uint32_t));
+#else
+            buf = (uint32_t *)malloc((tx_len) * sizeof(uint32_t));
+#endif
             for(i = 0; i < tx_len; i++)
                 buf[i] = ((uint16_t *)tx_buff)[i];
             break;
@@ -420,7 +425,12 @@ void spi_send_data_normal_dma(dmac_channel_number_t channel_num, spi_device_num_
             break;
         case SPI_TRANS_CHAR:
         default:
-            buf = malloc((tx_len) * sizeof(uint32_t));
+#if FIX_CACHE
+            buf = (uint32_t *)iomem_malloc((tx_len) * sizeof(uint32_t));
+#else
+            buf = (uint32_t *)malloc((tx_len) * sizeof(uint32_t));
+#endif
+
             for(i = 0; i < tx_len; i++)
                 buf[i] = ((uint8_t *)tx_buff)[i];
             break;
@@ -429,13 +439,19 @@ void spi_send_data_normal_dma(dmac_channel_number_t channel_num, spi_device_num_
     spi_handle->ssienr = 0x01;
 
     sysctl_dma_select((sysctl_dma_channel_t)channel_num, SYSCTL_DMA_SELECT_SSI0_TX_REQ + spi_num * 2);
+
     dmac_set_single_mode(channel_num, buf, (void *)(&spi_handle->dr[0]), DMAC_ADDR_INCREMENT, DMAC_ADDR_NOCHANGE,
                          DMAC_MSIZE_4, DMAC_TRANS_WIDTH_32, tx_len);
     spi_handle->ser = 1U << chip_select;
     dmac_wait_done(channel_num);
     if(spi_transfer_width != SPI_TRANS_INT)
+    {
+#if FIX_CACHE
+        iomem_free((void *)buf);
+#else
         free((void *)buf);
-
+#endif
+    }
     while((spi_handle->sr & 0x05) != 0x04)
         ;
     spi_handle->ser = 0x00;
@@ -472,8 +488,13 @@ void spi_dup_send_receive_data_dma(dmac_channel_number_t dma_send_channel_num,
 
     size_t v_max_len = v_tx_len > v_rx_len ? v_tx_len : v_rx_len;
 
+#if FIX_CACHE
+    uint32_t *v_tx_buf = iomem_malloc(v_max_len * 4);
+    uint32_t *v_rx_buf = iomem_malloc(v_max_len * 4);
+#else
     uint32_t *v_tx_buf = malloc(v_max_len * 4);
     uint32_t *v_rx_buf = malloc(v_max_len * 4);
+#endif
     uint32_t i = 0;
     switch(frame_width)
     {
@@ -552,8 +573,13 @@ void spi_dup_send_receive_data_dma(dmac_channel_number_t dma_send_channel_num,
                 rx_buf[i] = v_rx_buf[i];
             break;
     }
+#if FIX_CACHE
+    iomem_free(v_tx_buf);
+    iomem_free(v_rx_buf);
+#else
     free(v_tx_buf);
     free(v_rx_buf);
+#endif
 }
 
 void spi_receive_data_standard(spi_device_num_t spi_num, spi_chip_select_t chip_select, const uint8_t *cmd_buff,
@@ -721,7 +747,11 @@ void spi_receive_data_standard_dma(dmac_channel_number_t dma_send_channel_num,
     switch(frame_width)
     {
         case SPI_TRANS_INT:
+#if FIX_CACHE
+            write_cmd = iomem_malloc(cmd_len + rx_len);
+#else
             write_cmd = malloc(cmd_len + rx_len);
+#endif
             for(i = 0; i < cmd_len / 4; i++)
                 write_cmd[i] = ((uint32_t *)cmd_buff)[i];
             read_buf = &write_cmd[i];
@@ -729,7 +759,11 @@ void spi_receive_data_standard_dma(dmac_channel_number_t dma_send_channel_num,
             v_cmd_len = cmd_len / 4;
             break;
         case SPI_TRANS_SHORT:
+#if FIX_CACHE
+            write_cmd = iomem_malloc((cmd_len + rx_len) / 2 * sizeof(uint32_t));
+#else
             write_cmd = malloc((cmd_len + rx_len) / 2 * sizeof(uint32_t));
+#endif
             for(i = 0; i < cmd_len / 2; i++)
                 write_cmd[i] = ((uint16_t *)cmd_buff)[i];
             read_buf = &write_cmd[i];
@@ -737,7 +771,11 @@ void spi_receive_data_standard_dma(dmac_channel_number_t dma_send_channel_num,
             v_cmd_len = cmd_len / 2;
             break;
         default:
+#if FIX_CACHE
+            write_cmd = iomem_malloc((cmd_len + rx_len) * sizeof(uint32_t));
+#else
             write_cmd = malloc((cmd_len + rx_len) * sizeof(uint32_t));
+#endif
             for(i = 0; i < cmd_len; i++)
                 write_cmd[i] = cmd_buff[i];
             read_buf = &write_cmd[i];
@@ -763,8 +801,11 @@ void spi_receive_data_standard_dma(dmac_channel_number_t dma_send_channel_num,
                 rx_buff[i] = read_buf[i];
             break;
     }
-
+#if FIX_CACHE
+    iomem_free(write_cmd);
+#else
     free(write_cmd);
+#endif
 }
 
 void spi_receive_data_multiple(spi_device_num_t spi_num, spi_chip_select_t chip_select, const uint32_t *cmd_buff,
@@ -887,14 +928,22 @@ void spi_receive_data_multiple_dma(dmac_channel_number_t dma_send_channel_num,
             v_recv_len = rx_len / 4;
             break;
         case SPI_TRANS_SHORT:
+#if FIX_CACHE
+            write_cmd = iomem_malloc(cmd_len + rx_len / 2 * sizeof(uint32_t));
+#else
             write_cmd = malloc(cmd_len + rx_len / 2 * sizeof(uint32_t));
+#endif
             for(i = 0; i < cmd_len; i++)
                 write_cmd[i] = cmd_buff[i];
             read_buf = &write_cmd[i];
             v_recv_len = rx_len / 2;
             break;
         default:
+#if FIX_CACHE
+            write_cmd = iomem_malloc(cmd_len + rx_len * sizeof(uint32_t));
+#else
             write_cmd = malloc(cmd_len + rx_len * sizeof(uint32_t));
+#endif
             for(i = 0; i < cmd_len; i++)
                 write_cmd[i] = cmd_buff[i];
             read_buf = &write_cmd[i];
@@ -921,7 +970,13 @@ void spi_receive_data_multiple_dma(dmac_channel_number_t dma_send_channel_num,
     }
 
     if(frame_width != SPI_TRANS_INT)
+    {
+#if FIX_CACHE
+        iomem_free(write_cmd);
+#else
         free(write_cmd);
+#endif
+    }
 }
 
 void spi_send_data_multiple(spi_device_num_t spi_num, spi_chip_select_t chip_select, const uint32_t *cmd_buff,
@@ -979,7 +1034,11 @@ void spi_send_data_multiple_dma(dmac_channel_number_t channel_num, spi_device_nu
     switch(frame_width)
     {
         case SPI_TRANS_INT:
+#if FIX_CACHE
+            buf = iomem_malloc(cmd_len * sizeof(uint32_t) + tx_len);
+#else
             buf = malloc(cmd_len * sizeof(uint32_t) + tx_len);
+#endif
             for(i = 0; i < cmd_len; i++)
                 buf[i] = cmd_buff[i];
             for(i = 0; i < tx_len / 4; i++)
@@ -987,7 +1046,11 @@ void spi_send_data_multiple_dma(dmac_channel_number_t channel_num, spi_device_nu
             v_send_len = cmd_len + tx_len / 4;
             break;
         case SPI_TRANS_SHORT:
+#if FIX_CACHE
+            buf = iomem_malloc(cmd_len * sizeof(uint32_t) + tx_len / 2 * sizeof(uint32_t));
+#else
             buf = malloc(cmd_len * sizeof(uint32_t) + tx_len / 2 * sizeof(uint32_t));
+#endif
             for(i = 0; i < cmd_len; i++)
                 buf[i] = cmd_buff[i];
             for(i = 0; i < tx_len / 2; i++)
@@ -995,7 +1058,11 @@ void spi_send_data_multiple_dma(dmac_channel_number_t channel_num, spi_device_nu
             v_send_len = cmd_len + tx_len / 2;
             break;
         default:
+#if FIX_CACHE
+            buf = iomem_malloc((cmd_len + tx_len) * sizeof(uint32_t));
+#else
             buf = malloc((cmd_len + tx_len) * sizeof(uint32_t));
+#endif
             for(i = 0; i < cmd_len; i++)
                 buf[i] = cmd_buff[i];
             for(i = 0; i < tx_len; i++)
@@ -1006,7 +1073,11 @@ void spi_send_data_multiple_dma(dmac_channel_number_t channel_num, spi_device_nu
 
     spi_send_data_normal_dma(channel_num, spi_num, chip_select, buf, v_send_len, SPI_TRANS_INT);
 
+#if FIX_CACHE
+    iomem_free((void *)buf);
+#else
     free((void *)buf);
+#endif
 }
 
 void spi_fill_data_dma(dmac_channel_number_t channel_num, spi_device_num_t spi_num, spi_chip_select_t chip_select,

+ 41 - 4
lib/drivers/uart.c

@@ -20,6 +20,7 @@
 #include "sysctl.h"
 #include "uart.h"
 #include "utils.h"
+#include "iomem.h"
 
 #define __UART_BRATE_CONST 16
 
@@ -158,12 +159,18 @@ static int uart_dma_callback(void *ctx)
         size_t v_buf_len = v_uart_dma_instance->buf_len;
         uint8_t *v_buffer = v_uart_dma_instance->buffer;
         uint32_t *v_recv_buffer = v_uart_dma_instance->malloc_buffer;
+
         for(size_t i = 0; i < v_buf_len; i++)
         {
             v_buffer[i] = v_recv_buffer[i];
         }
     }
+#if FIX_CACHE
+    iomem_free(v_uart_dma_instance->malloc_buffer);
+#else
     free(v_uart_dma_instance->malloc_buffer);
+#endif
+	v_uart_dma_instance->malloc_buffer = NULL;
     if(v_uart_dma_instance->uart_int_instance.callback)
         v_uart_dma_instance->uart_int_instance.callback(v_uart_dma_instance->uart_int_instance.ctx);
     return 0;
@@ -184,27 +191,42 @@ int uart_receive_data(uart_device_number_t channel, char *buffer, size_t buf_len
 
 void uart_receive_data_dma(uart_device_number_t uart_channel, dmac_channel_number_t dmac_channel, uint8_t *buffer, size_t buf_len)
 {
-    uint32_t *v_recv_buf = malloc(buf_len * sizeof(uint32_t));
+#if FIX_CACHE /* the DMA target holds one 32-bit word per received byte and comes from the iomem pool instead of the heap */
+    uint32_t *v_recv_buf = (uint32_t *)iomem_malloc(buf_len * sizeof(uint32_t));
+#else
+    uint32_t *v_recv_buf = (uint32_t *)malloc(buf_len * sizeof(uint32_t));
+#endif
     configASSERT(v_recv_buf != NULL);
 
     sysctl_dma_select((sysctl_dma_channel_t)dmac_channel, SYSCTL_DMA_SELECT_UART1_RX_REQ + uart_channel * 2);
+
     dmac_set_single_mode(dmac_channel, (void *)(&uart[uart_channel]->RBR), v_recv_buf, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
                          DMAC_MSIZE_1, DMAC_TRANS_WIDTH_32, buf_len);
+
     dmac_wait_done(dmac_channel);
     for(uint32_t i = 0; i < buf_len; i++)
     {
         buffer[i] = (uint8_t)(v_recv_buf[i] & 0xff);
     }
+#if FIX_CACHE /* release with the allocator that produced the buffer */
+    iomem_free(v_recv_buf);
+#else
     free(v_recv_buf);
+#endif
 }
 
 void uart_receive_data_dma_irq(uart_device_number_t uart_channel, dmac_channel_number_t dmac_channel,
                                uint8_t *buffer, size_t buf_len, plic_irq_callback_t uart_callback,
                                void *ctx, uint32_t priority)
 {
-    uint32_t *v_recv_buf = malloc(buf_len * sizeof(uint32_t));
+#if FIX_CACHE
+    uint32_t *v_recv_buf = (uint32_t *)iomem_malloc(buf_len * sizeof(uint32_t));
+#else
+    uint32_t *v_recv_buf = (uint32_t *)malloc(buf_len * sizeof(uint32_t));
+#endif
     configASSERT(v_recv_buf != NULL);
 
+
     uart_recv_dma_instance[uart_channel].dmac_channel = dmac_channel;
     uart_recv_dma_instance[uart_channel].uart_num = uart_channel;
     uart_recv_dma_instance[uart_channel].malloc_buffer = v_recv_buf;
@@ -217,7 +239,7 @@ void uart_receive_data_dma_irq(uart_device_number_t uart_channel, dmac_channel_n
     dmac_irq_register(dmac_channel, uart_dma_callback, &uart_recv_dma_instance[uart_channel], priority);
     sysctl_dma_select((sysctl_dma_channel_t)dmac_channel, SYSCTL_DMA_SELECT_UART1_RX_REQ + uart_channel * 2);
     dmac_set_single_mode(dmac_channel, (void *)(&uart[uart_channel]->RBR), v_recv_buf, DMAC_ADDR_NOCHANGE, DMAC_ADDR_INCREMENT,
-                         DMAC_MSIZE_1, DMAC_TRANS_WIDTH_32, buf_len);
+                        DMAC_MSIZE_1, DMAC_TRANS_WIDTH_32, buf_len);
 }
 
 int uart_send_data(uart_device_number_t channel, const char *buffer, size_t buf_len)
@@ -233,22 +255,37 @@ int uart_send_data(uart_device_number_t channel, const char *buffer, size_t buf_
 
 void uart_send_data_dma(uart_device_number_t uart_channel, dmac_channel_number_t dmac_channel, const uint8_t *buffer, size_t buf_len)
 {
+#if FIX_CACHE /* the DMA source holds one 32-bit word per byte to send and comes from the iomem pool instead of the heap */
+    uint32_t *v_send_buf = iomem_malloc(buf_len * sizeof(uint32_t));
+#else
     uint32_t *v_send_buf = malloc(buf_len * sizeof(uint32_t));
+#endif
     configASSERT(v_send_buf != NULL);
+
     for(uint32_t i = 0; i < buf_len; i++)
         v_send_buf[i] = buffer[i];
+
     sysctl_dma_select((sysctl_dma_channel_t)dmac_channel, SYSCTL_DMA_SELECT_UART1_TX_REQ + uart_channel * 2);
     dmac_set_single_mode(dmac_channel, v_send_buf, (void *)(&uart[uart_channel]->THR), DMAC_ADDR_INCREMENT, DMAC_ADDR_NOCHANGE,
-                         DMAC_MSIZE_1, DMAC_TRANS_WIDTH_32, buf_len);
+                             DMAC_MSIZE_1, DMAC_TRANS_WIDTH_32, buf_len);
+
     dmac_wait_done(dmac_channel);
+#if FIX_CACHE /* release with the allocator that produced the buffer */
+    iomem_free((void *)v_send_buf);
+#else
     free((void *)v_send_buf);
+#endif
 }
 
 void uart_send_data_dma_irq(uart_device_number_t uart_channel, dmac_channel_number_t dmac_channel,
                             const uint8_t *buffer, size_t buf_len, plic_irq_callback_t uart_callback,
                             void *ctx, uint32_t priority)
 {
+#if FIX_CACHE
+    uint32_t *v_send_buf = iomem_malloc(buf_len * sizeof(uint32_t));
+#else
     uint32_t *v_send_buf = malloc(buf_len * sizeof(uint32_t));
+#endif
     configASSERT(v_send_buf != NULL);
 
     uart_send_dma_instance[uart_channel] = (uart_dma_instance_t){

+ 8 - 0
lib/drivers/utils.c

@@ -41,3 +41,11 @@ uint32_t get_gpio_bit(volatile uint32_t *bits, size_t offset)
 {
     return get_bit(bits, 1, offset);
 }
+
+/* Returns 1 when address lies in [0x80000000, 0x80000000 + MEM_CACHE_LEN), otherwise 0. */
+uint32_t is_memory_cache(uintptr_t address)
+{
+    #define MEM_CACHE_LEN (6 * 1024 * 1024)
+
+    if(address < 0x80000000)
+        return 0;
+
+    return address < 0x80000000 + MEM_CACHE_LEN;
+}
+

+ 13 - 1
lib/nncase/include/kernels/neutral/neutral_kernels.h

@@ -17,6 +17,8 @@
 #include <cmath>
 #include <runtime/runtime_op_utility.h>
 #include <xtl/xspan.hpp>
+#include <cstring>
+#include <utils.h>
 
 namespace nncase
 {
@@ -139,11 +141,18 @@ namespace kernels
 
         inline void matmul(const float *input_a, const float *input_b, float *output, const float *bias, int32_t a_rows, int32_t a_cols, int32_t b_cols, const value_range<float> &fused_activation)
         {
+#if FIX_CACHE
+            float *cache_mem = new float[b_cols];
+            memcpy(cache_mem, bias, b_cols*sizeof(float));
+#else
+            const float *cache_mem =bias;
+#endif
             for (size_t oy = 0; oy < a_rows; oy++)
             {
                 for (size_t ox = 0; ox < b_cols; ox++)
                 {
-                    float value = bias[ox];
+                    float value = cache_mem[ox];
+
                     for (size_t i = 0; i < a_cols; i++)
                     {
                         const auto a = input_a[oy * a_cols + i];
@@ -154,6 +163,9 @@ namespace kernels
                     output[oy * b_cols + ox] = details::apply_activation(value, fused_activation);
                 }
             }
+#if FIX_CACHE
+            delete []cache_mem;
+#endif
         }
 
         template <class T>