1. Model and Algorithm

This project uses MIT's open-source TSM (Temporal Shift Module) algorithm. The paper's authors perform temporal modeling by shifting part of the feature channels along the time axis, which adds no extra parameters and no extra compute. They benchmarked ports on a range of phones, got the code running in real time on a Jetson Nano without using CUDA, and even ported it to an FPGA, so the work has a thoroughly production-oriented pedigree.
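The shift itself is the whole trick, so a small sketch helps. Below is a minimal NumPy version of the online (uni-directional) shift TSM uses for streaming inference; shift_div = 8 follows the paper's default, and the function name and shapes here are illustrative, not the author's code:

import numpy as np

# Online TSM shift: the first 1/shift_div of the channels are replaced by the
# same channel slice cached from the previous frame; the rest pass through.
def online_temporal_shift(feat, cache, shift_div=8):
    c = feat.shape[-1] // shift_div       # number of channels to shift
    new_cache = feat[..., :c].copy()      # carry these to the next frame
    out = feat.copy()
    out[..., :c] = cache                  # borrow channels from the past
    return out, new_cache

# usage: the cache persists across frames and starts as zeros
feat = np.random.randn(56, 56, 24).astype(np.float32)
cache = np.zeros((56, 56, 3), dtype=np.float32)
shifted, cache = online_temporal_shift(feat, cache)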
2. Results

The backbone is MobileNetV2, yet the first runs reached only 18 FPS, which seemed too low. A livestream explained it: the first-generation AIPU is not friendly to grouped convolutions, so be it. I also tried running on the R329's CPU and got only 3 FPS. For comparison, on a laptop i7 CPU the TF pb model reaches about 65 FPS and the ONNX version about 120 FPS.

3. Porting to the AIPU

3.1 Model Quantization

To guard against unknown errors, I rewrote the author's PyTorch code in TensorFlow (to be open-sourced later). Because of the temporal modeling, the model takes 11 inputs: 1 image tensor plus 10 shift buffers. It likewise produces 11 outputs: 1 classification tensor plus 10 updated shift buffers, which are fed back in on the next frame, as sketched below.
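Those buffers make inference stateful: each run consumes the 10 buffers produced by the previous run and emits updated ones, starting from zeros on the first frame. A minimal sketch of the loop; the buffer shapes are taken from the cfg below, and run_model is a hypothetical stand-in for the real graph execution:

import numpy as np

# shift-buffer shapes, matching the cfg input_shape list (image input omitted)
BUF_SHAPES = [(1, 56, 56, 3), (1, 28, 28, 4), (1, 28, 28, 4),
              (1, 14, 14, 8), (1, 14, 14, 8), (1, 14, 14, 8),
              (1, 14, 14, 12), (1, 14, 14, 12),
              (1, 7, 7, 20), (1, 7, 7, 20)]

def run_model(image, buffers):
    # hypothetical stand-in for the real runtime call (TF session / AIPU job):
    # it must return 27 class scores plus the 10 updated shift buffers
    scores = np.random.randn(27).astype(np.float32)
    new_buffers = [np.random.randint(-128, 128, s, dtype=np.int8) for s in BUF_SHAPES]
    return scores, new_buffers

buffers = [np.zeros(s, dtype=np.int8) for s in BUF_SHAPES]  # first frame: zeros
for _ in range(100):                              # one iteration per camera frame
    image = np.zeros((1, 224, 224, 3), dtype=np.int8)
    scores, buffers = run_model(image, buffers)   # outputs feed the next run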
The cfg used for the build is as follows:

[Common]
mode = build
use_aqt = True

[Parser]
model_name = mobilenet_v2
model_type = tensorflow
detection_postprocess =
model_domain = image_classification
output = fully_connected/BiasAdd,Slice,Slice_2,Slice_4,Slice_6,Slice_8,Slice_10,Slice_12,Slice_14,Slice_16,Slice_18
input_model = ./input/model3.pb
input = x_input,x_input0,x_input1,x_input2,x_input3,x_input4,x_input5,x_input6,x_input7,x_input8,x_input9
input_shape = [1, 224, 224, 3],[1, 56, 56, 3],[1, 28, 28, 4],[1, 28, 28, 4],[1, 14, 14, 8],[1, 14, 14, 8],[1, 14, 14, 8],[1, 14, 14, 12],[1, 14, 14, 12],[1, 7, 7, 20],[1, 7, 7, 20]
output_dir = ./

[AutoQuantizationTool]
quantize_method = SYMMETRIC
quant_precision = int8
ops_per_channel = DepthwiseConv
reverse_rgb = False
label_id_offset =
dataset_name =
detection_postprocess =
anchor_generator =
ts_max_file = ./input/max_dict2.npy
ts_min_file = ./input/min_dict2.npy

[GBuilder]
outputs = aipu_mobilenet_v2Z2_1104.bin
target = Z1_0701

Because the model has multiple inputs and outputs, the toolchain's automatic quantization over a training set cannot be used directly. Instead, the minimum and maximum of every node must be computed by hand and saved as min/max dict files in the format specified in the technical manual.
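In practice that means running the float TensorFlow graph over representative calibration frames and recording each node's extrema. A sketch of how this can be done; sess, node_names, and feed_batches are placeholders, and the dict-of-scalars layout saved with np.save is an assumption to be checked against the quantization manual:

import numpy as np

# Collect per-node extrema over calibration data. `sess` is a tf.Session on
# the frozen graph, `node_names` the nodes to calibrate, and `feed_batches`
# an iterable of feed_dicts (all placeholders here, not the author's code).
def collect_minmax(sess, node_names, feed_batches):
    max_d = {n: -np.inf for n in node_names}
    min_d = {n: np.inf for n in node_names}
    tensors = [sess.graph.get_tensor_by_name(n + ':0') for n in node_names]
    for feed in feed_batches:
        vals = sess.run(tensors, feed_dict=feed)
        for n, v in zip(node_names, vals):
            max_d[n] = max(max_d[n], float(v.max()))
            min_d[n] = min(min_d[n], float(v.min()))
    return min_d, max_d

# min_d, max_d = collect_minmax(sess, node_names, feed_batches)
# np.save('./input/min_dict2.npy', min_d)   # the files the cfg points at
# np.save('./input/max_dict2.npy', max_d)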
3.2 Model Inference

The raw outputs need some post-processing before they are usable; the inference and post-processing code follows.

/**
 * @file main.cpp
 * @brief
 *
 * Copyright (c) 2021 Sipeed team
 * ********************************
 * Modify by @zhai
 * *******************************
 */
extern "C" {
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "fbviewer.h"
#include "label.h"
#include <signal.h>
#include <sys/time.h>
}
#include "standard_api.h"
#include <iostream>
#include <list>
#include <vector>
#include <string>
#include "opencv2/opencv.hpp"
using namespace cv;

#define DBG_LINE() printf("###L%d\r\n", __LINE__)

int label_oft = 0;

typedef struct {
    int index;
    int8_t val;
} int8_data_t;

typedef struct {
    int index;
    uint8_t val;
} uint8_data_t;

std::list<int8_t *> listbuffs;   // sliding window of recent score vectors
int8_t flag_frame_num = 0;
std::vector<int> history;        // recent label indices, used for debouncing

// Find the largest element; the array is int so that summing several int8
// score vectors cannot overflow.
int FindMax(const int a[], int n, int *pMaxPos)
{
    int max = a[0];
    *pMaxPos = 0;
    for (int i = 1; i < n; i++) {
        if (a[i] > max) {
            max = a[i];
            *pMaxPos = i;
        }
    }
    return max;
}

cv::VideoCapture capture(0);

int init_cam(void)
{
    int x, y;
    getCurrentRes(&x, &y);
    printf("LCD width is %d, height is %d\n", x, y);
    if (!capture.isOpened()) {
        std::cout << "video not open." << std::endl;
        return 1;
    }
    // get default video fps, then request 30 fps
    double rate = capture.get(CAP_PROP_FPS);
    printf("rate is %lf\n", rate);
    capture.set(CAP_PROP_FPS, 30);
    rate = capture.get(CAP_PROP_FPS);
    printf("rate is %lf\n", rate);
    // get default video frame info
    double frame_width = capture.get(CAP_PROP_FRAME_WIDTH);
    double frame_height = capture.get(CAP_PROP_FRAME_HEIGHT);
    printf("frame_width is %lf, frame_height is %lf\n", frame_width, frame_height);
    // set video frame size to QVGA (then we crop to 224x224)
    frame_width = 320;
    frame_height = 240;
    if (!capture.set(CAP_PROP_FRAME_WIDTH, frame_width)) {
        printf("set width failed\n");
        return 2;
    }
    if (!capture.set(CAP_PROP_FRAME_HEIGHT, frame_height)) {
        printf("set height failed\n");
        return 3;
    }
    return 0;
}

int init_graph(char *file_model, aipu_ctx_handle_t **ctx,
               aipu_graph_desc_t *gdesc, aipu_buffer_alloc_info_t *info)
{
    const char *status_msg = NULL;
    aipu_status_t status = AIPU_STATUS_SUCCESS;
    int ret = 0;
    // Step1: init ctx handle
    status = AIPU_init_ctx(ctx);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[DEMO ERROR] AIPU_init_ctx: %s\n", status_msg);
        ret = -1;
    }
    // Step2: load graph
    status = AIPU_load_graph_helper(*ctx, file_model, gdesc);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[DEMO ERROR] AIPU_load_graph_helper: %s\n", status_msg);
        ret = -2;
    }
    printf("[DEMO INFO] AIPU load graph successfully.\n");
    // Step3: alloc tensor buffers
    status = AIPU_alloc_tensor_buffers(*ctx, gdesc, info);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[DEMO ERROR] AIPU_alloc_tensor_buffers: %s\n", status_msg);
        ret = -3;
    }
    return ret;
}

int cap_img(Mat *lcd_frame, Mat *ai_frame)
{
    Rect roi(40, 0, 240 / 4 * 3, 240);   // 16:9 -> 4:3 center crop
    Rect input_roi(8, 8, 224, 224);
    Size dsize = Size(240, 240);
    if (!capture.read(*lcd_frame)) {
        printf("no video frame\r\n");
        return -1;
    }
    *lcd_frame = (*lcd_frame)(roi).clone();
    rotate(*lcd_frame, *lcd_frame, ROTATE_180);
    resize(*lcd_frame, *lcd_frame, dsize);
    cvtColor(*lcd_frame, *lcd_frame, COLOR_BGR2RGB);
    *ai_frame = (*lcd_frame)(input_roi).clone();
    // shift pixel values from [0,255] into the int8 input range
    ai_frame->convertTo(*ai_frame, CV_16SC3);
    add(Scalar(-127, -127, -127), *ai_frame, *ai_frame);
    ai_frame->convertTo(*ai_frame, CV_8SC3);
    return 0;
}

int infer_img(Mat *ai_frame, Mat *x0, Mat *x1, Mat *x2, Mat *x3, Mat *x4,
              Mat *x5, Mat *x6, Mat *x7, Mat *x8, Mat *x9,
              aipu_ctx_handle_t **ctx, aipu_graph_desc_t *gdesc,
              aipu_buffer_alloc_info_t *info, int signed_flag,
              int *label_idx, int *label_prob)
{
    uint32_t job_id = 0;
    const char *status_msg = NULL;
    int32_t time_out = -1;
    bool finish_job_successfully = true;
    aipu_status_t status = AIPU_STATUS_SUCCESS;
    int ret = 0;

    // input 0 is the camera frame; inputs 1..10 are the shift buffers
    // produced by the previous inference
    memcpy(info->inputs.tensors[0].va, ai_frame->data, info->inputs.tensors[0].size);
    memcpy(info->inputs.tensors[1].va, x0->data, info->inputs.tensors[1].size);
    memcpy(info->inputs.tensors[2].va, x1->data, info->inputs.tensors[2].size);
    memcpy(info->inputs.tensors[3].va, x2->data, info->inputs.tensors[3].size);
    memcpy(info->inputs.tensors[4].va, x3->data, info->inputs.tensors[4].size);
    memcpy(info->inputs.tensors[5].va, x4->data, info->inputs.tensors[5].size);
    memcpy(info->inputs.tensors[6].va, x5->data, info->inputs.tensors[6].size);
    memcpy(info->inputs.tensors[7].va, x6->data, info->inputs.tensors[7].size);
    memcpy(info->inputs.tensors[8].va, x7->data, info->inputs.tensors[8].size);
    memcpy(info->inputs.tensors[9].va, x8->data, info->inputs.tensors[9].size);
    memcpy(info->inputs.tensors[10].va, x9->data, info->inputs.tensors[10].size);
    flag_frame_num = 1;

    status = AIPU_create_job(*ctx, gdesc, info->handle, &job_id);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[DEMO ERROR] AIPU_create_job: %s\n", status_msg);
        ret = -1;
    }
    status = AIPU_finish_job(*ctx, job_id, time_out);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[DEMO ERROR] AIPU_finish_job: %s\n", status_msg);
        finish_job_successfully = false;
    } else {
        finish_job_successfully = true;
    }

    if (finish_job_successfully) {
        int8_t *result = (int8_t *)info->outputs.tensors[0].va;
        // keep the most recent 6 score vectors in a sliding window
        int8_t *buff = (int8_t *)malloc(27 * sizeof(int8_t));
        memcpy(buff, result, sizeof(int8_t) * 27);
        if (listbuffs.size() > 5) {
            free(listbuffs.front());
            listbuffs.pop_front();
        }
        listbuffs.push_back(buff);
        // sum the window in int so six int8 frames cannot overflow
        int buffsum[27] = {0};
        std::list<int8_t *>::iterator iter;
        for (iter = listbuffs.begin(); iter != listbuffs.end(); iter++) {
            for (int k = 0; k < 27; k++) buffsum[k] += (*iter)[k];
        }
        // outputs 1..10 are the updated shift buffers; write them back
        // so the next inference can consume them
        memcpy(x0->data, info->outputs.tensors[1].va, info->outputs.tensors[1].size);
        memcpy(x1->data, info->outputs.tensors[2].va, info->outputs.tensors[2].size);
        memcpy(x2->data, info->outputs.tensors[3].va, info->outputs.tensors[3].size);
        memcpy(x3->data, info->outputs.tensors[4].va, info->outputs.tensors[4].size);
        memcpy(x4->data, info->outputs.tensors[5].va, info->outputs.tensors[5].size);
        memcpy(x5->data, info->outputs.tensors[6].va, info->outputs.tensors[6].size);
        memcpy(x6->data, info->outputs.tensors[7].va, info->outputs.tensors[7].size);
        memcpy(x7->data, info->outputs.tensors[8].va, info->outputs.tensors[8].size);
        memcpy(x8->data, info->outputs.tensors[9].va, info->outputs.tensors[9].size);
        memcpy(x9->data, info->outputs.tensors[10].va, info->outputs.tensors[10].size);

        int idx;
        FindMax(buffsum, 27, &idx);
        printf("idx:%d\n", idx);
        // classes that fire spuriously are rejected outright
        int errors_fre[] = {7, 8, 21, 22, 3, 10, 11, 12, 13};
        for (int i = 0; i < 9; i++) {
            if (idx == errors_fre[i]) idx = history.back();
        }
        if (idx == 0) idx = history.back();
        // debounce: only accept a new label once the recent history agrees
        if (idx != history.back() && history.size() >= 3) {
            if (history[history.size() - 3] != history.back() ||
                history[history.size() - 2] != history.back()) {
                idx = history.back();
            }
        }
        history.push_back(idx);
        *label_idx = history.back();
        if (history.size() > 20) history.erase(history.begin());
    }

    status = AIPU_clean_job(*ctx, job_id);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[TEST ERROR] AIPU_clean_job: %s\n", status_msg);
        ret = -2;
    }
    return ret;
}

float cal_fps(struct timeval start, struct timeval end)
{
    struct timeval interval;
    if (end.tv_usec >= start.tv_usec) {
        interval.tv_usec = end.tv_usec - start.tv_usec;
        interval.tv_sec = end.tv_sec - start.tv_sec;
    } else {
        interval.tv_usec = 1000000 + end.tv_usec - start.tv_usec;
        interval.tv_sec = end.tv_sec - 1 - start.tv_sec;
    }
    // account for whole seconds as well as microseconds
    return 1000000.0f / (interval.tv_sec * 1000000.0f + interval.tv_usec);
}

volatile int exit_flag = 0;
void my_handler(int s)
{
    printf("Caught signal %d\n", s);
    exit_flag = 1;
}

int main(int argc, char *argv[])
{
    int ret = 0;
    cv::Mat lcd_frame;
    cv::Mat ai_frame;
    int label_idx = 0, label_prob = 0;
    struct timeval start, end;
    int flag_frame = 0;
    float fps = 0;
    // int8 shift buffers, zero-initialized; shapes match the cfg input_shape list
    cv::Mat x0 = cv::Mat::zeros(56, 56, CV_8SC(3));
    cv::Mat x1 = cv::Mat::zeros(28, 28, CV_8SC(4));
    cv::Mat x2 = cv::Mat::zeros(28, 28, CV_8SC(4));
    cv::Mat x3 = cv::Mat::zeros(14, 14, CV_8SC(8));
    cv::Mat x4 = cv::Mat::zeros(14, 14, CV_8SC(8));
    cv::Mat x5 = cv::Mat::zeros(14, 14, CV_8SC(8));
    cv::Mat x6 = cv::Mat::zeros(14, 14, CV_8SC(12));
    cv::Mat x7 = cv::Mat::zeros(14, 14, CV_8SC(12));
    cv::Mat x8 = cv::Mat::zeros(7, 7, CV_8SC(20));
    cv::Mat x9 = cv::Mat::zeros(7, 7, CV_8SC(20));

    signal(SIGINT, my_handler);
    history.push_back(2);
    history.push_back(2);

    printf("Zhouyi Cam test program: \r\n");
    printf("Usage: \r\n");
    printf(" ./zhouyi aipu.bin signed [label_oft]\r\n");
    printf(" signed=0, uint8 output; =1, int8 output\r\n");
    printf(" real_label_idx = predict_idx-label_oft \r\n");
    printf(" NOTE: default cal with 224x224\r\n");

    aipu_ctx_handle_t *ctx = NULL;
    aipu_status_t status = AIPU_STATUS_SUCCESS;
    const char *status_msg = NULL;
    aipu_graph_desc_t gdesc;
    aipu_buffer_alloc_info_t info;

    // Step 0: parse input argv
    if (argc < 3) {
        printf("argc=%d error\r\n", argc);
        return -1;
    }
    if (argc > 3) label_oft = atoi(argv[3]);
    char *file_model = argv[1];
    int signed_flag = atoi(argv[2]);

    // Step 1: set up the USB camera
    ret = init_cam();
    DBG_LINE();
    if (ret != 0) {
        printf("[DEMO ERROR] init_cam err: %d\n", ret);
        goto out;
    }
    // Step 2: init model graph
    ret = init_graph(file_model, &ctx, &gdesc, &info);
    DBG_LINE();
    if (ret == -1) goto out;
    else if (ret == -2) goto deinit_ctx;
    else if (ret == -3) goto unload_graph;

    // MAIN LOOP
    while (!exit_flag) {
        // 1. cap cam img
        if (cap_img(&lcd_frame, &ai_frame) != 0) break;
        // 2. infer every second cam img, get label
        gettimeofday(&start, NULL);
        if (flag_frame % 2 == 0) {
            ret = infer_img(&ai_frame, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7,
                            &x8, &x9, &ctx, &gdesc, &info, signed_flag,
                            &label_idx, &label_prob);
            if (ret != 0) goto free_tensor_buffers;
            gettimeofday(&end, NULL);
            fps = cal_fps(start, end);
        }
        flag_frame += 1;
        // 3. draw to the LCD framebuffer
        flip(lcd_frame, lcd_frame, 1);
        putText(lcd_frame, labels[label_idx], Point(0, 224),
                cv::FONT_HERSHEY_PLAIN, 1, Scalar(255, 0, 0), 2);
        char fps_str[16];
        sprintf(fps_str, "%.1ffps", fps);
        putText(lcd_frame, fps_str, Point(0, 16),
                cv::FONT_HERSHEY_PLAIN, 1, Scalar(255, 0, 0), 2);
        fb_display(lcd_frame.data, 0, 240, 240, 0, 0, 0, 0);
    }

free_tensor_buffers:
    status = AIPU_free_tensor_buffers(ctx, info.handle);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[DEMO ERROR] AIPU_free_tensor_buffers: %s\n", status_msg);
        ret = -1;
    }
unload_graph:
    status = AIPU_unload_graph(ctx, &gdesc);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[DEMO ERROR] AIPU_unload_graph: %s\n", status_msg);
        ret = -1;
    }
deinit_ctx:
    status = AIPU_deinit_ctx(ctx);
    if (status != AIPU_STATUS_SUCCESS) {
        AIPU_get_status_msg(status, &status_msg);
        printf("[DEMO ERROR] AIPU_deinit_ctx: %s\n", status_msg);
        ret = -1;
    }
out:
    return ret;
}
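The post-processing inside infer_img is easier to see stripped of the C plumbing: sum the last six score vectors, take the argmax, reject labels the model tends to fire spuriously, and only accept a change once the history agrees. A NumPy restatement as a reading aid, not part of the deployed code (the constants mirror the C above; the history is seeded with three entries so history[-3] stays in range):

import numpy as np
from collections import deque

UNSTABLE = {3, 7, 8, 10, 11, 12, 13, 21, 22}   # the errors_fre classes above

window = deque(maxlen=6)    # the last six 27-way score vectors
history = [2, 2, 2]         # seeded default label, as in main()

def smooth(scores):
    # scores: one int8 vector of 27 class scores from a single inference
    window.append(scores.astype(np.int32))     # widen before summing
    idx = int(np.stack(window).sum(axis=0).argmax())
    if idx in UNSTABLE or idx == 0:
        idx = history[-1]                      # reject unstable / null labels
    elif idx != history[-1] and (history[-3] != history[-1] or
                                 history[-2] != history[-1]):
        idx = history[-1]                      # debounce until history agrees
    history.append(idx)
    del history[:-20]                          # bound the history length
    return history[-1]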
4. Code Repository

https://github.com/ZhaiFengYun/R329-AIPU-gesture-recognition