[feat] add tinymaix component and demo

2025-07-06 12:58:45 +00:00 · 2022-11-18 16:05:54 +08:00 · 2022-11-18 16:05:54 +08:00 · 57bda6c48b
commit 57bda6c48b
parent b46115f175
314 changed files with 309080 additions and 0 deletions
--- a/examples/tinymaix/CMakeLists.txt
+++ b/examples/tinymaix/CMakeLists.txt
@ -0,0 +1,11 @@
+cmake_minimum_required(VERSION 3.15)
+
+include(proj.conf)  # pulls in the CONFIG_* switches set in proj.conf
+
+find_package(bouffalo_sdk REQUIRED HINTS $ENV{BL_SDK_BASE})  # SDK is searched for under the BL_SDK_BASE environment variable
+
+sdk_add_include_directories(.)  # SDK helper (not defined here); presumably adds this directory to the include path
+
+sdk_set_main_file(main.c)  # SDK helper (not defined here); names main.c as the example's main source
+
+project(tinymaix)
--- a/examples/tinymaix/Makefile
+++ b/examples/tinymaix/Makefile
@ -0,0 +1,13 @@
+SDK_DEMO_PATH ?= .
+BL_SDK_BASE ?= $(SDK_DEMO_PATH)/../..
+# Exported because CMakeLists.txt reads BL_SDK_BASE from the environment ($ENV{BL_SDK_BASE}).
+export BL_SDK_BASE
+# Defaults below use ?=, so each can be overridden from the environment or the make command line.
+CHIP ?= bl616
+BOARD ?= bl616dk
+CROSS_COMPILE ?= riscv64-unknown-elf-
+
+# Extra CMake definitions may be appended to cmake_definition, for example:
+#cmake_definition+=-Dxxx=sss
+
+include $(BL_SDK_BASE)/project.build
--- a/examples/tinymaix/main.c
+++ b/examples/tinymaix/main.c
@ -0,0 +1,34 @@
+#include "bflb_mtimer.h"
+#include "board.h"
+
+#define MODEL_MNIST 1   /* ids accepted by CONFIG_MODEL */
+#define MODEL_CIFAR10 2
+#define MODEL_VWW 3
+#define MODEL_MBNET 4
+
+#define CONFIG_MODEL MODEL_MNIST /* model built into this demo: one of the MODEL_* ids above */
+
+#define main benchmark_main /* main() in the included model source is compiled as benchmark_main() */
+#if (CONFIG_MODEL == MODEL_MNIST)
+#include "mnist/main.c"
+#elif (CONFIG_MODEL == MODEL_CIFAR10)
+#include "cifar10/main.c"
+#elif (CONFIG_MODEL == MODEL_VWW)
+#include "vww/main.c"
+#elif (CONFIG_MODEL == MODEL_MBNET)
+#include "mbnet/label.c"
+#include "mbnet/main.c"
+#else
+#error "CONFIG_MODEL must be MODEL_MNIST, MODEL_CIFAR10, MODEL_VWW or MODEL_MBNET"
+#endif
+#undef main
+
+int main(void) /* entry point: init the board, run the selected benchmark once, then idle */
+{
+    board_init();
+    printf("tinymaix test ...\n");
+    benchmark_main(0, NULL); /* called as main(argc = 0, argv = NULL) */
+    while (1) {              /* nothing left to do: sleep in 1 s steps forever */
+        bflb_mtimer_delay_ms(1000);
+    }
+}
--- a/examples/tinymaix/proj.conf
+++ b/examples/tinymaix/proj.conf
@ -0,0 +1,3 @@
+set(CONFIG_VLIBC 0)     # presumably disables the SDK's vlibc component -- confirm in the SDK docs
+set(CONFIG_BFLOG 0)     # presumably disables the SDK's bflog component -- confirm in the SDK docs
+set(CONFIG_TINYMAIX 1)  # presumably enables the tinymaix component this example uses
--- a/examples/tinymaix/tinymaix.h
+++ b/examples/tinymaix/tinymaix.h
@ -0,0 +1,356 @@
+/* Copyright 2022 Sipeed Technology Co., Ltd. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef __TINYMAIX_H
+#define __TINYMAIX_H
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define  TM_MDL_INT8    0   //model data type ids; tm_port.h sets TM_MDL_TYPE to one of these
+#define  TM_MDL_INT16   1
+#define  TM_MDL_FP32    2
+#define  TM_MDL_FP16    3
+#define  TM_MDL_FP8_143 4 //experimental
+#define  TM_MDL_FP8_152 5 //experimental
+#include "tm_port.h"
+
+/******************************* MACRO ************************************/
+#define TM_MDL_MAGIC 0x5849414D //mdl magic sign; the value GCC/Clang give 'XIAM', i.e. bytes "MAIX" in little-endian memory
+#define TM_ALIGN_SIZE   (8)     //8 byte align
+#define TM_ALIGN(addr)  ((((size_t)(addr))+(TM_ALIGN_SIZE-1))/TM_ALIGN_SIZE*TM_ALIGN_SIZE)  //round addr up to a multiple of TM_ALIGN_SIZE
+#define TM_MATP(mat,y,x,ch) ((mat)->data + ((y)*(mat)->w + (x))*(mat)->c + (ch))             //pointer to element (y,x,ch) of mat
+                                //HWC
+#if   TM_MDL_TYPE == TM_MDL_INT8      //8-bit data and weights, 32-bit bias/accumulator/zero point
+    typedef int8_t  mtype_t;    //mat data type
+    typedef int8_t  wtype_t;    //weight data type
+    typedef int32_t btype_t;    //bias data type
+    typedef int32_t sumtype_t;  //accumulator (sum) data type
+    typedef int32_t zptype_t;   //zero-point data type
+    #define UINT2INT_SHIFT (0)  //defined for the integer types only; presumably the shift used by TMPP_UINT2INT -- confirm
+#elif TM_MDL_TYPE == TM_MDL_INT16     //16-bit data and weights, 32-bit bias/accumulator/zero point
+    typedef int16_t mtype_t;    //mat data type
+    typedef int16_t wtype_t;    //weight data type
+    typedef int32_t btype_t;    //bias data type
+    typedef int32_t sumtype_t;  //accumulator (sum) data type
+    typedef int32_t zptype_t;   //zero-point data type
+    #define UINT2INT_SHIFT (8)
+#elif TM_MDL_TYPE == TM_MDL_FP32      //everything is float
+    typedef float   mtype_t;    //mat data type
+    typedef float   wtype_t;    //weight data type
+    typedef float   btype_t;    //bias data type
+    typedef float   sumtype_t;  //accumulator (sum) data type
+    typedef float   zptype_t;   //zero-point data type
+#elif TM_MDL_TYPE == TM_MDL_FP16      //everything is float16_t; rejected unless TM_ARCH is TM_ARCH_RV64V
+    #if TM_ARCH != TM_ARCH_RV64V
+        #error "only support RV64V's float16!"
+    #endif
+    #include <riscv_vector.h>
+    typedef float16_t mtype_t;    //mat data type
+    typedef float16_t wtype_t;    //weight data type
+    typedef float16_t btype_t;    //bias data type
+    typedef float16_t sumtype_t;  //accumulator (sum) data type
+    typedef float16_t zptype_t;   //zero-point data type
+#elif (TM_MDL_TYPE == TM_MDL_FP8_143) || (TM_MDL_TYPE == TM_MDL_FP8_152)  //fp8 kept in uint8_t, accumulated in float
+    #if TM_ARCH != TM_ARCH_CPU
+        #error "only support CPU simulation now!"
+    #endif
+    typedef uint8_t mtype_t;    //mat data type
+    typedef uint8_t wtype_t;    //weight data type
+    typedef uint8_t btype_t;    //bias data type
+    typedef float sumtype_t;    //accumulator (sum) data type
+    typedef float zptype_t;     //zero-point data type
+#else
+    #error "Not support this MDL_TYPE!"
+#endif
+
+#if TM_MDL_TYPE == TM_MDL_FP8_143  //names suggest sign/exponent/mantissa bit counts (1+4+3) and exponent bias -- confirm in tm_fp32to8/tm_fp8to32
+    #define TM_FP8_SCNT (1)
+    #define TM_FP8_ECNT (4)
+    #define TM_FP8_MCNT (3)
+    #define TM_FP8_BIAS (9)
+#elif TM_MDL_TYPE == TM_MDL_FP8_152  //same parameters for the 1+5+2 variant
+    #define TM_FP8_SCNT (1)
+    #define TM_FP8_ECNT (5)
+    #define TM_FP8_MCNT (2)
+    #define TM_FP8_BIAS (15)
+#endif
+
+typedef float sctype_t;             //scale type used by the in_s/out_s/ws fields and arguments below
+#define TM_FASTSCALE_SHIFT (8)      //presumably the fixed-point shift used when TM_FASTSCALE is enabled -- confirm
+
+/******************************* ENUM ************************************/
+typedef enum{               //status codes; meanings below are read off the names -- confirm against the implementation
+    TM_OK = 0,              //success
+    TM_ERR= 1,              //generic failure
+    TM_ERR_MAGIC     = 2,   //presumably: magic mismatch (see TM_MDL_MAGIC)
+    TM_ERR_UNSUPPORT = 3,   //presumably: unsupported feature
+    TM_ERR_OOM       = 4,   //presumably: out of memory
+    TM_ERR_LAYERTYPE = 5,   //presumably: unknown layer type
+    TM_ERR_DIMS      = 6,   //presumably: bad dimensions
+    TM_ERR_TODO      = 7,   //presumably: not implemented yet
+    TM_ERR_MDLTYPE   = 8,   //presumably: model type differs from TM_MDL_TYPE
+    TM_ERR_KSIZE     = 9,   //presumably: kernel size too large (see TM_MAX_KSIZE)
+}tm_err_t;
+
+typedef enum{               //layer type ids (tml_head_t.type); matching layer structs noted per entry
+    TML_CONV2D    = 0,      //tml_conv2d_dw_t
+    TML_GAP       = 1,      //tml_gap_t
+    TML_FC        = 2,      //tml_fc_t
+    TML_SOFTMAX   = 3,      //tml_softmax_t
+    TML_RESHAPE   = 4,      //tml_reshape_t
+    TML_DWCONV2D  = 5,      //tml_conv2d_dw_t / tml_dwconv2d_t
+    TML_ADD       = 6,      //tml_add_t
+    TML_MAXCNT    ,         //number of layer types (one past the last id)
+}tm_layer_type_t;
+
+typedef enum{               //padding modes; not referenced elsewhere in this header
+    TM_PAD_VALID  = 0,
+    TM_PAD_SAME   = 1,
+}tm_pad_type_t;
+
+typedef enum{               //activation ids; same numbering as the 'act' field comment in the conv layer structs
+    TM_ACT_NONE   = 0,
+    TM_ACT_RELU   = 1,
+    TM_ACT_RELU1  = 2,
+    TM_ACT_RELU6  = 3,
+    TM_ACT_TANH   = 4,
+    TM_ACT_SIGNBIT= 5,
+    TM_ACT_MAXCNT ,         //number of activation types (one past the last id)
+}tm_act_type_t;
+
+
+typedef enum {              //preprocess modes accepted by tm_preprocess()
+    TMPP_NONE      = 0,
+    TMPP_FP2INT    = 1,  //user-owned fp buf -> int input buf
+    TMPP_UINT2INT  = 2,  //int8: converted in place; int16: cannot be converted in place
+    TMPP_UINT2FP01 = 3,  // u8/255.0
+    TMPP_UINT2FPN11= 4,  // (u8-128)/128
+    TMPP_UINT2DTYPE= 5,  //uint8 to fp16,fp8
+    TMPP_MAXCNT,         //number of preprocess modes (one past the last id)
+}tm_pp_t;
+
+/******************************* STRUCT ************************************/
+//mdlbin in flash: the fixed fields below add up to 64 bytes, layer records follow
+typedef struct{
+    uint32_t magic;         //"MAIX"
+    uint8_t  mdl_type;      //a TM_MDL_* id: 0 int8, 1 int16, 2 fp32, 3 fp16, 4/5 fp8
+    uint8_t  out_deq;       //0 don't dequant out; 1 dequant out
+    uint16_t input_cnt;     //only 1 is supported so far
+    uint16_t output_cnt;    //only 1 is supported so far
+    uint16_t layer_cnt;     //number of layers
+    uint32_t buf_size;      //main buf size for middle result = pingpong+keep
+    uint32_t sub_size;      //pingpong buf size;
+    uint16_t in_dims[4];    //0:dims; 1:dim0; 2:dim1; 3:dim2
+    uint16_t out_dims[4];   //same encoding as in_dims
+    uint8_t  reserve[28];   //reserved for future use
+    uint8_t  layers_body[0];//offset 64 here (zero-length array: GNU extension)
+}tm_mdlbin_t;
+
+//mdl meta data in ram (filled in by tm_load -- presumably; confirm in the implementation)
+typedef struct{
+    tm_mdlbin_t* b;         //bin
+    void*    cb;            //layer callback; presumably a tm_cb_t stored as void*
+    uint8_t* buf;           //main buf addr
+    uint8_t* subbuf;        //sub buf addr
+    uint16_t main_alloc;    //whether main buf was allocated or is static
+    uint16_t layer_i;       //current layer index
+    uint8_t* layer_body;    //current layer body addr
+}tm_mdl_t;
+
+//dims==3, hwc
+//dims==2, 1wc
+//dims==1, 11c
+typedef struct{
+    uint16_t dims;          //number of dimensions in use (see above)
+    uint16_t h;
+    uint16_t w;
+    uint16_t c;
+    union {                 //one pointer, two views of the same buffer
+        mtype_t* data;      //as model data type
+        float*   dataf;     //as float
+    };
+}tm_mat_t;
+
+/******************************* LAYER STRUCT ************************************/
+typedef struct{             //48 bytes when zptype_t is 4 bytes wide; first member of every layer struct
+    uint16_t type;          //layer type
+    uint16_t is_out;        //is output
+    uint32_t size;          //8 byte aligned size of this layer
+    uint32_t in_oft;        //input  offset in main buf
+    uint32_t out_oft;       //output offset in main buf
+    uint16_t in_dims[4];    //0:dims; 1:dim0; 2:dim1; 3:dim2
+    uint16_t out_dims[4];   //same encoding as in_dims
+                            //following fields are not used in fp32 mode
+    sctype_t in_s;          //input scale
+    zptype_t in_zp;         //input zeropoint
+    sctype_t out_s;         //output scale
+    zptype_t out_zp;        //output zeropoint
+    //note: real = scale*(q-zeropoint)
+}tml_head_t;
+
+typedef struct{
+    tml_head_t h;           //common layer header
+
+    uint8_t  kernel_w;
+    uint8_t  kernel_h;
+    uint8_t  stride_w;
+    uint8_t  stride_h;
+    
+    uint8_t  dilation_w;
+    uint8_t  dilation_h;
+    uint16_t  act;          //0 none, 1 relu, 2 relu1, 3 relu6, 4 tanh, 5 sign_bit
+    
+    uint8_t  pad[4];        //top,bottom,left,right
+
+    uint32_t depth_mul;     //depth_multiplier: if conv2d,=0; else: >=1
+    uint32_t reserve;       //for 8byte align
+    
+    uint32_t ws_oft;        //weight scale offset from this layer start
+                            //bias scale is not stored: bias_scale = weight_scale*in_scale
+    uint32_t w_oft;         //weight offset from this layer start
+    uint32_t b_oft;         //bias offset from this layer start
+    //note: bias[c] = bias[c] + (-out_zp)*sum(w[c*chi*maxk:(c+1)*chi*maxk])
+    //      fused in advance (when the model is converted)
+}tml_conv2d_dw_t;  //compatible with conv2d and dwconv2d
+
+typedef struct{             //TML_GAP layer: header only, no extra parameters
+    tml_head_t h;
+}tml_gap_t;
+
+typedef struct{             //TML_FC layer
+    tml_head_t h;           //common layer header
+
+    uint32_t ws_oft;        //weight scale offset from this layer start
+    uint32_t w_oft;         //weight offset from this layer start
+    uint32_t b_oft;         //bias offset from this layer start
+    uint32_t reserve;       //for 8byte align
+}tml_fc_t;
+
+typedef struct{             //TML_SOFTMAX layer: header only, no extra parameters
+    tml_head_t h;
+}tml_softmax_t;
+
+typedef struct{             //TML_RESHAPE layer: header only, no extra parameters
+    tml_head_t h;
+}tml_reshape_t;
+
+typedef struct{             //same fields as tml_conv2d_dw_t minus depth_mul/reserve
+    tml_head_t h;           //common layer header
+
+    uint8_t  kernel_w;
+    uint8_t  kernel_h;
+    uint8_t  stride_w;
+    uint8_t  stride_h;
+    
+    uint8_t  dilation_w;
+    uint8_t  dilation_h;
+    uint16_t  act;          //0 none, 1 relu, 2 relu1, 3 relu6, 4 tanh, 5 sign_bit
+    
+    uint8_t  pad[4];        //top,bottom,left,right
+
+
+    
+    uint32_t ws_oft;        //weight scale offset from this layer start
+                            //bias scale is not stored: bias_scale = weight_scale*in_scale
+    uint32_t w_oft;         //weight offset from this layer start
+    uint32_t b_oft;         //bias offset from this layer start
+    //note: bias[c] = bias[c] + (-out_zp)*sum(w[c*chi*maxk:(c+1)*chi*maxk])
+    //      fused in advance (when the model is converted)
+}tml_dwconv2d_t;
+
+typedef struct{             //TML_ADD layer: the header describes one input, the fields below the other
+    tml_head_t h;
+    uint32_t in_oft1;        //second input offset; presumably in main buf like h.in_oft
+    sctype_t in_s1;          //second input scale
+    zptype_t in_zp1;         //second input zeropoint
+    uint32_t reserve;        //align8
+}tml_add_t;
+
+
+/******************************* TYPE ************************************/
+typedef tm_err_t (*tml_stat_t)(tml_head_t* layer, tm_mat_t* in, tm_mat_t* out);  //per-layer function type; not used elsewhere in this header
+typedef tm_err_t (*tm_cb_t)(tm_mdl_t* mdl, tml_head_t* lh);                      //layer callback type passed to tm_load()
+
+
+/******************************* GLOBAL VARIABLE ************************************/
+
+
+/******************************* MODEL FUNCTION ************************************/
+tm_err_t tm_load  (tm_mdl_t* mdl, const uint8_t* bin, uint8_t*buf, tm_cb_t cb, tm_mat_t* in);   //load model from bin; buf is presumably the main working buffer -- confirm in the implementation
+void     tm_unload(tm_mdl_t* mdl);                                      //remove model
+tm_err_t tm_preprocess(tm_mdl_t* mdl, tm_pp_t pp_type, tm_mat_t* in, tm_mat_t* out);            //preprocess input data according to pp_type
+tm_err_t tm_run   (tm_mdl_t* mdl, tm_mat_t* in, tm_mat_t* out);         //run model
+
+
+/******************************* LAYER FUNCTION ************************************/
+tm_err_t tml_conv2d_dwconv2d(tm_mat_t* in, tm_mat_t* out, wtype_t* w, btype_t* b, \
+    int kw, int kh, int sx, int sy, int dx, int dy, int act, \
+    int pad_top, int pad_bottom, int pad_left, int pad_right, int dmul, \
+    sctype_t* ws, sctype_t in_s, zptype_t in_zp, sctype_t out_s, zptype_t out_zp);  //arguments appear to mirror the tml_conv2d_dw_t fields (kernel, stride, dilation, act, pad, depth_mul)
+tm_err_t tml_gap(tm_mat_t* in, tm_mat_t* out, sctype_t in_s, zptype_t in_zp, sctype_t out_s, zptype_t out_zp);
+tm_err_t tml_fc(tm_mat_t* in, tm_mat_t* out,  wtype_t* w, btype_t* b, \
+    sctype_t* ws, sctype_t in_s, zptype_t in_zp, sctype_t out_s, zptype_t out_zp);  //w/b/ws: weights, bias, weight scales
+tm_err_t tml_softmax(tm_mat_t* in, tm_mat_t* out, sctype_t in_s, zptype_t in_zp, sctype_t out_s, zptype_t out_zp);
+tm_err_t tml_reshape(tm_mat_t* in, tm_mat_t* out, sctype_t in_s, zptype_t in_zp, sctype_t out_s, zptype_t out_zp);
+tm_err_t tml_add(tm_mat_t* in0, tm_mat_t* in1, tm_mat_t* out, \
+    sctype_t in_s0, zptype_t in_zp0, sctype_t in_s1, zptype_t in_zp1, sctype_t out_s, zptype_t out_zp);  //two inputs, each with its own scale/zeropoint
+
+/******************************* STAT FUNCTION ************************************/
+#if TM_ENABLE_STAT
+tm_err_t tm_stat(tm_mdlbin_t* mdl);                    //stat model; declared only when TM_ENABLE_STAT (tm_port.h) is non-zero
+#endif
+
+/******************************* UTILS FUNCTION ************************************/
+uint8_t TM_WEAK tm_fp32to8(float fp32);    //declared weak (TM_WEAK), so a port can supply its own definition
+float TM_WEAK tm_fp8to32(uint8_t fp8);     //used by TML_DEQUANT in the FP8 model types
+
+
+/******************************* UTILS  ************************************/
+
+#define TML_GET_INPUT(mdl,lh)   ((mtype_t*)((mdl)->buf + (lh)->in_oft))    //layer input  = main buf + in_oft
+#define TML_GET_OUTPUT(mdl,lh)  ((mtype_t*)((mdl)->buf + (lh)->out_oft))   //layer output = main buf + out_oft
+#if (TM_MDL_TYPE == TM_MDL_INT8)||(TM_MDL_TYPE == TM_MDL_INT16)
+    #define TML_DEQUANT(lh, x)       (((sumtype_t)(x)-((lh)->out_zp))*((lh)->out_s))  //real = out_s*(q-out_zp)
+    #define TM_DEQUANT(i8,s,zp) (((sumtype_t)(i8)-(zp))*(s))    //real = s*(q-zp)
+    #define TM_QUANT(fp32,s,zp) ((mtype_t)((fp32)/(s)+(zp)))    //q = real/s+zp; zp is parenthesized so expression arguments expand correctly
+#elif (TM_MDL_TYPE == TM_MDL_FP8_143) || (TM_MDL_TYPE == TM_MDL_FP8_152)
+    #define TML_DEQUANT(lh, x)  (tm_fp8to32(x))     //only TML_DEQUANT is provided for the FP8 types
+#else   //FP32,FP16
+    #define TML_DEQUANT(lh, x)  ((float)(x))
+    #define TM_DEQUANT(x,s,zp)  (x)                 //no quantization: values pass through
+    #define TM_QUANT(x,s,zp)    (x)
+#endif
+
+/******************************* LOCAL MATH FUNCTION  ************************************/
+#if TM_LOCAL_MATH
+//http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html
+//Fast approximate exp(x): builds the IEEE-754 bits of 2^p directly, where p = x*log2(e).
+//p is clamped at -126 (result bottoms out near FLT_MIN); large positive x beyond float range is not guarded.
+static inline float _exp(float x) {
+        float p = 1.442695040f * x;                 //log2(e)*x, so exp(x) == 2^p
+        if (p < -126.0f) p = -126.0f;               //keeps the value converted to uint32_t below non-negative
+        float offset = (p < 0.0f) ? 1.0f : 0.0f;    //(int) truncates toward zero; the offset puts z in [0,1] for p<0
+        int w = (int) p;
+        float z = p - (float) w + offset;
+        union { uint32_t i; float f; } v =          //compute the bit pattern, then read it back as a float
+            {.i = (uint32_t) ((1 << 23) * (p + 121.2740838f + 27.7280233f / (4.84252568f - z) - 1.49012907f * z))};
+        return v.f;
+    }
+    #define tm_exp _exp   //maybe some arch have exp acceleration, use macro in arch_xxx.h to reload it
+#else
+    #define tm_exp exp
+#endif
+
+#endif 
--- a/examples/tinymaix/tm_port.h
+++ b/examples/tinymaix/tm_port.h
@ -0,0 +1,93 @@
+/* Copyright 2022 Sipeed Technology Co., Ltd. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef __TM_PORT_H
+#define __TM_PORT_H
+
+#define TM_ARCH_CPU         (0) //default, pure cpu compute
+#define TM_ARCH_ARM_SIMD    (1) //ARM Cortex M4/M7, etc.
+#define TM_ARCH_ARM_NEON    (2) //ARM Cortex A7, etc.
+#define TM_ARCH_ARM_MVEI    (3) //ARMv8.1: M55, etc.
+#define TM_ARCH_RV32P       (4) //T-head E907, etc.
+#define TM_ARCH_RV64V       (5) //T-head C906,C910, etc.
+#define TM_ARCH_CSKYV2      (6) //cskyv2 with dsp core
+#define TM_ARCH_X86_SSE2    (7) //x86 sse2
+
+#define TM_OPT0             (0) //default, least code and buf
+#define TM_OPT1             (1) //optimized for speed, needs more code and buf
+#define TM_OPT2             (2) //TODO
+
+/******************************* PORT CONFIG  ************************************/
+#define TM_ARCH         TM_ARCH_RV32P   //one of the TM_ARCH_* ids above
+#define TM_OPT_LEVEL    TM_OPT1         //one of the TM_OPT* ids above
+#define TM_MDL_TYPE     TM_MDL_INT8     //one of the TM_MDL_* ids from tinymaix.h
+#define TM_FASTSCALE    (0)         //enable if your chip doesn't have an FPU; may speed up by 1/3, but decreases accuracy
+#define TM_LOCAL_MATH   (0)         //use local math func (like exp()) to avoid libm
+#define TM_ENABLE_STAT  (1)         //enable mdl stat functions
+#define TM_MAX_CSIZE    (1000)      //max channel num //used if INT8 mdl  //cost TM_MAX_CSIZE*4 Byte
+#define TM_MAX_KSIZE    (5*5)       //max kernel_size   //cost TM_MAX_KSIZE*4 Byte
+#define TM_MAX_KCSIZE   (3*3*256)   //max kernel_size*channels //cost TM_MAX_KSIZE*sizeof(mtype_t) Byte (NOTE(review): presumably meant TM_MAX_KCSIZE -- confirm)
+
+#define TM_INLINE       __attribute__((always_inline)) static inline
+#define TM_WEAK         __attribute__((weak))
+
+#define tm_malloc(x)    malloc(x)
+#define tm_free(x)      free(x)
+//TM_DBG is wrapped in do{}while(0) so both prints stay together under an unbraced if.
+//Its trailing ';' is kept so that any call site written without one still compiles.
+#define TM_PRINTF(...) printf(__VA_ARGS__)
+#define TM_DBG(...)    do{TM_PRINTF("###L%d: ",__LINE__);TM_PRINTF(__VA_ARGS__);}while(0);
+#define TM_DBGL()      TM_PRINTF("###L%d\n",__LINE__);
+
+/******************************* DBG TIME CONFIG  ************************************/
+#include "bflb_mtimer.h"
+#define  TM_GET_US()       bflb_mtimer_get_time_us()   //timestamp source (microseconds, going by the name)
+
+#define TM_DBGT_INIT()     uint32_t _start,_finish;float _time;_start=TM_GET_US();    //declares the timing locals in the caller's scope and starts timing
+#define TM_DBGT_START()    _start=TM_GET_US();                                          //restarts timing; needs TM_DBGT_INIT() earlier in the same scope
+#define TM_DBGT(x)         {_finish=TM_GET_US();\
+                            _time = (float)(_finish-_start)/1000.0;\
+                            TM_PRINTF("===%s use %.3f ms\n", (x), _time);\
+                            _start=TM_GET_US();}    //prints the time since the last start under label x, then restarts
+
+/******************************* DBG PERFORMANCE CONFIG  ************************************/
+//needs a clock tick counter to produce accurate statistics
+#define TM_EN_PERF 0
+
+#if TM_EN_PERF
+    #define  TM_GET_TICK(x)     __ASM volatile("csrr %0, mcycle" : "=r"(x)); //adapt to your platform
+
+    #define  TM_TICK_PERUS    (380) //ticks per microsecond //sysconf(_SC_CLK_TCK)/1000000)
+    #define  TM_PERF_REG(x)    uint64_t x=0;                    //define counter x
+    #define  TM_PERF_EXTREG(x) extern uint64_t x;               //declare counter x defined elsewhere
+    #define  TM_PERF_INIT(x)   uint64_t _##x##_t0, _##x##_t1;   //declare the two tick stamps for x
+    #define  TM_PERF_START(x)  TM_GET_TICK(_##x##_t0);
+    #define  TM_PERF_ADD(x)   {TM_GET_TICK(_##x##_t1);(x)+=(_##x##_t1-_##x##_t0);TM_GET_TICK(_##x##_t0);};
+    #define  TM_PERF_PRINT(x) TM_PRINTF("PERF "#x": %lu us\r\n", (unsigned long)((x)/TM_TICK_PERUS))  //cast to match %lu: x is uint64_t, which is not long on 32-bit targets
+#else   //perf disabled: every macro expands to nothing
+    #define  TM_GET_TICK(x)
+    #define  TM_TICK_PERUS
+    #define  TM_PERF_REG(x)
+    #define  TM_PERF_EXTREG(x)
+    #define  TM_PERF_INIT(x)
+    #define  TM_PERF_START(x)
+    #define  TM_PERF_ADD(x)
+    #define  TM_PERF_PRINT(x)
+#endif
+
+
+/******************************* OPS CONFIG  ************************************/
+
+
+
+
+#endif