【HZ-RK3568开发板免费体验】使用RK3568 的 NPU 实现物体识别

本主题由 jf_26932943 于 2025-9-9 14:06 移动

1 数据准备和模型准备

数据标记，可以使用labelimg来标注需要识别的物体

标注完成之后需要把XML 格式的标注数据转换为 YOLO 的格式。

格式转换工具如下：

import os
import xml.etree.ElementTree as ET

# Class names to export. A label's YOLO class id is its index in this list;
# objects whose name is not listed here are skipped by convert_annotation().
classes = ["coke"]

def convert(size, box):
    """Convert a pixel bounding box into a normalised YOLO box.

    Args:
        size: ``(width, height)`` of the image in pixels.
        box: ``(xmin, xmax, ymin, ymax)`` in pixels. Note the order: both x
            values come first, then both y values.

    Returns:
        ``(x_center, y_center, width, height)`` where the x values are
        divided by the image width and the y values by the image height.

    Raises:
        ZeroDivisionError: If either image dimension is zero.
    """
    dw = 1.0 / size[0]
    dh = 1.0 / size[1]
    x = (box[0] + box[1]) / 2.0
    y = (box[2] + box[3]) / 2.0
    w = box[1] - box[0]
    h = box[3] - box[2]
    return (x * dw, y * dh, w * dw, h * dh)

def convert_annotation(xml_file, output_dir):
    """Convert one XML annotation file into a YOLO label file.

    Reads the image size and every ``<object>`` element from ``xml_file``.
    Objects whose name is not in the module-level ``classes`` list, or whose
    ``<difficult>`` flag is 1, are skipped. Each remaining box is normalised
    with ``convert()`` and written as ``"<class_id> <x> <y> <w> <h>"`` to
    ``<output_dir>/<xml basename>.txt``.

    The whole annotation is parsed before the output file is opened, so a
    malformed XML file no longer leaves an empty or truncated label file
    behind.

    Args:
        xml_file: Path of the XML annotation to read.
        output_dir: Existing directory that receives the ``.txt`` label file.

    Returns:
        None. Failures are reported on stdout and the file is skipped, so a
        batch conversion keeps going.
    """
    try:
        root = ET.parse(xml_file).getroot()
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)

        lines = []
        for obj in root.iter('object'):
            difficult = obj.find('difficult')
            difficult_text = difficult.text if difficult is not None else "0"
            cls = obj.find('name').text

            if cls not in classes or int(difficult_text) == 1:
                continue

            cls_id = classes.index(cls)
            xmlbox = obj.find('bndbox')
            # convert() expects (xmin, xmax, ymin, ymax), not the usual
            # (xmin, ymin, xmax, ymax) order.
            b = tuple(
                float(xmlbox.find(tag).text)
                for tag in ('xmin', 'xmax', 'ymin', 'ymax')
            )
            bb = convert((w, h), b)
            lines.append(f"{cls_id} {' '.join(str(a) for a in bb)}\n")

        file_name = os.path.splitext(os.path.basename(xml_file))[0]
        out_file_path = os.path.join(output_dir, f"{file_name}.txt")
        with open(out_file_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(lines)

        print(f"Converted {xml_file} to {out_file_path}")

    except (ET.ParseError, OSError, AttributeError, TypeError, ValueError,
            ZeroDivisionError) as e:
        # AttributeError/TypeError: a required tag is missing or empty.
        # ValueError: a numeric field is not a number.
        # ZeroDivisionError: the annotation reports a zero image dimension.
        print(f"Error processing {xml_file}: {e}")

# Directory holding the XML annotations, and the directory that receives the
# generated YOLO label files.
xml_dir = "annotations"
output_dir = "labels"

os.makedirs(output_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting keeps the
# conversion log stable between runs.
for xml_file in sorted(os.listdir(xml_dir)):
    if xml_file.endswith('.xml'):
        convert_annotation(os.path.join(xml_dir, xml_file), output_dir)

print("Conversion completed!")

然后准备数据集配置文件，文件后缀为 .yaml（例如 coke.yaml）

# Dataset description handed to train.py through --data.
# Placeholder path: replace with the real dataset location.
path: /path/to/coke_dataset
# NOTE(review): train and val name the same directory, so validation
# presumably runs on the training images. Use a separate split for
# meaningful metrics.
train: images
val: images
# One class; the name matches the `classes` list of the XML-to-YOLO converter.
nc: 1
names: ['coke']

然后克隆 yolov5 来训练模型

# Fetch the YOLOv5 sources and install the packages listed in its requirements.txt.
git clone https://github.com/ultralytics/yolov5.git
cd yolov5
pip install -r requirements.txt

# Train starting from the yolov5s.pt weights. The trailing backslashes keep
# all options on one logical command line; without them the shell would run
# "python train.py" with no arguments and then try to execute each option
# as a separate command.
python train.py \
  --img 640 \
  --batch 16 \
  --epochs 50 \
  --data /path/to/coke.yaml \
  --weights yolov5s.pt \
  --name coke_detection

训练完成之后，需要导出 onnx 格式的模型
# Export the best checkpoint of the coke_detection run to ONNX. The trailing
# backslashes keep all options on one logical command line.
python export.py \
  --weights runs/train/coke_detection/weights/best.pt \
  --img 640 \
  --batch 1 \
  --include onnx \
  --opset 12 \
  --simplify

需要将 ONNX 格式转换为 RKNN 格式

import os
import numpy as np
from rknn.api import RKNN
from PIL import Image
import glob

def create_calibration_dataset(calib_path, output_file, size=(640, 640),
                               max_images=10,
                               preprocessed_dir='calib_data_preprocessed'):
    """Resize calibration images and write their paths to a list file.

    Collects ``*.jpg``, ``*.jpeg`` and ``*.png`` files directly inside
    ``calib_path`` (in that extension order, sorted within each extension),
    converts up to ``max_images`` of them to RGB, resizes them to ``size``,
    saves the copies under ``preprocessed_dir`` and writes one saved path per
    line to ``output_file``.

    Args:
        calib_path: Directory containing the source images.
        output_file: Text file to create; overwritten if it exists.
        size: ``(width, height)`` the images are resized to.
        max_images: Upper bound on how many images are processed.
        preprocessed_dir: Directory for the resized copies; created if
            missing.

    Returns:
        None. An image that cannot be read or saved is reported on stdout
        and skipped.
    """
    os.makedirs(preprocessed_dir, exist_ok=True)

    image_files = []
    for pattern in ('*.jpg', '*.jpeg', '*.png'):
        image_files.extend(sorted(glob.glob(os.path.join(calib_path, pattern))))

    with open(output_file, 'w') as f:
        for img_path in image_files[:max_images]:
            try:
                # The context manager closes the source file handle as soon
                # as the pixels have been copied by convert().
                with Image.open(img_path) as source:
                    img = source.convert('RGB')
                img = img.resize(size, Image.Resampling.LANCZOS)

                img_name = os.path.basename(img_path)
                preprocessed_path = os.path.join(preprocessed_dir, img_name)
                img.save(preprocessed_path)

                f.write(f"{preprocessed_path}\n")
                print(f"Added {preprocessed_path} to calibration dataset")

            except (OSError, ValueError) as e:
                print(f"Error processing {img_path}: {e}")

def _require_ok(ret, message):
    """Print *message* and exit with status *ret* when an RKNN call failed."""
    if ret != 0:
        print(message)
        raise SystemExit(ret)


def main():
    """Convert ``best.onnx`` to a quantised RKNN model and smoke-test it.

    Steps: build the calibration list, configure the toolkit for rk3568,
    load the ONNX model, build with quantisation, export
    ``coke_detector.rknn``, then run one inference on ``test.jpg`` and print
    the shape of the first output.

    The RKNN object is released on every exit path, including the early
    exits taken when a toolkit call reports failure.

    Raises:
        SystemExit: With the toolkit's return code when a step fails.
    """
    rknn = RKNN(verbose=True)
    try:
        calib_path = "path/to/your/calibration/images"
        dataset_file = "dataset.txt"
        create_calibration_dataset(calib_path, dataset_file)

        print('--> Config model')
        # NOTE(review): `quantize` and `batch_size` were dropped because they
        # are not documented config() parameters in RKNN-Toolkit2;
        # quantisation is requested through build(do_quantization=True)
        # below. Confirm against the installed toolkit version.
        ret = rknn.config(
            target_platform='rk3568',
            mean_values=[[0, 0, 0]],
            std_values=[[255, 255, 255]],
            quantized_dtype='asymmetric_quantized-8',
            quantized_algorithm='normal',
            optimization_level=3,
        )
        # config() may return None rather than 0 (toolkit-version dependent,
        # confirm), so only a non-zero code counts as failure here.
        if ret not in (None, 0):
            print('Config failed!')
            raise SystemExit(ret)
        print('done')

        print('--> Loading model')
        ret = rknn.load_onnx(
            model='best.onnx',
            inputs=['images'],
            outputs=['output'],
            input_size_list=[[1, 3, 640, 640]],
        )
        _require_ok(ret, 'Load model failed!')
        print('done')

        print('--> Building model')
        ret = rknn.build(
            do_quantization=True,
            dataset=dataset_file,
        )
        _require_ok(ret, 'Build model failed!')
        print('done')

        print('--> Export rknn model')
        ret = rknn.export_rknn('coke_detector.rknn')
        _require_ok(ret, 'Export rknn model failed!')
        print('done')

        print('--> Init runtime environment')
        ret = rknn.init_runtime()
        _require_ok(ret, 'Init runtime environment failed!')

        try:
            img = Image.open('test.jpg').convert('RGB')
            img = img.resize((640, 640), Image.Resampling.LANCZOS)
            # mean_values/std_values are configured above, so the division
            # by 255 is assumed to happen inside the model. Feed the raw
            # uint8 pixels in NHWC layout; dividing again here would
            # normalise twice.
            img = np.expand_dims(np.array(img, dtype=np.uint8), axis=0)

            print('--> Running inference')
            outputs = rknn.inference(inputs=[img])
            print('Inference done!')

            print(f'Output shape: {outputs[0].shape}')

        except Exception as e:
            # The inference run is only a smoke test; the exported model is
            # already on disk, so report the problem and carry on.
            print(f'Error during inference test: {str(e)}')
    finally:
        rknn.release()

    print('Model conversion completed successfully!')

# Run the conversion only when executed as a script, not on import.
if __name__ == '__main__':
    main()

使用与 RKNN-Toolkit2 配套的运行时 C API（rknn_api.h）来编写板端 demo

#ifndef RKNN_DEMO_H
#define RKNN_DEMO_H

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <math.h>
#include "rknn_api.h"

/* rknn_api.h names the tensor-attribute struct rknn_tensor_attr. The demo
 * sources use rknn_output_attr for output tensors, so alias it here. */
typedef rknn_tensor_attr rknn_output_attr;

/* Model input geometry: width x height x channels. */
#define MODEL_IN_WIDTH 640
#define MODEL_IN_HEIGHT 640
#define MODEL_IN_CHNS 3
/* Number of object classes; each prediction holds NUM_CLASSES + 5 floats. */
#define NUM_CLASSES 1
#define NUM_ANCHORS 3
/* Minimum objectness and minimum objectness * class score to keep a box. */
#define CONF_THRESHOLD 0.5
/* Boxes overlapping a better-scored box by more than this IoU are dropped. */
#define NMS_THRESHOLD 0.5
/* Capacity of the caller-provided detection_result array. */
#define MAX_DETECTIONS 100

/* One detected object: corner coordinates in pixels, score and class index. */
typedef struct {
int x1;
int y1;
int x2;
int y2;
float score;
int cls_id;
} detection_result;

/* Anchor (width, height) pairs, three per detection scale. */
static const float anchors[3][6] = {
{10.0, 13.0, 16.0, 30.0, 33.0, 23.0}, // P3/8
{30.0, 61.0, 62.0, 45.0, 59.0, 119.0}, // P4/16
{116.0, 90.0, 156.0, 198.0, 373.0, 326.0} // P5/32
};

/* Initialises an RKNN context from a model file and queries its I/O counts.
 * Returns 0 on success, -1 on failure. */
int load_model(const char* model_path, rknn_context* ctx, rknn_input_output_num* io_num);
/* Fills input_data with MODEL_IN_WIDTH x MODEL_IN_HEIGHT RGB pixels and
 * reports the source image size. Returns 0 on success, -1 on failure. */
int preprocess_image(const char* image_path, unsigned char* input_data, int* orig_width, int* orig_height);
/* Turns float model outputs into at most MAX_DETECTIONS boxes in results. */
int post_process(float* outputs[], rknn_output_attr* out_attrs, int num_outputs,
int orig_width, int orig_height, detection_result* results, int* result_count);
/* Writes the detections to output_path. */
void draw_boxes(const char* image_path, detection_result* results, int result_count, const char* output_path);
/* Logistic function 1 / (1 + e^-x). */
float sigmoid(float x);
/* Intersection-over-union of two boxes. */
float calculate_iou(detection_result a, detection_result b);
/* Sorts by descending score and removes overlapping boxes in place. */
void nms(detection_result* results, int* result_count);

#endif

#include "rknn_demo.h"

/*
 * Nearest-neighbour resize of a packed 3-channel, 8-bit image.
 *
 * src holds src_width * src_height pixels and dst receives
 * dst_width * dst_height pixels, both tightly packed with 3 bytes per pixel.
 * Each destination pixel copies the source pixel at the proportionally
 * scaled position, truncated toward zero. Nothing is written when a pointer
 * is NULL or a dimension is not positive.
 */
void resize_image(const unsigned char* src, int src_width, int src_height,
                  unsigned char* dst, int dst_width, int dst_height) {
    if (src == NULL || dst == NULL ||
        src_width <= 0 || src_height <= 0 || dst_width <= 0 || dst_height <= 0) {
        return;
    }

    float scale_x = (float)src_width / dst_width;
    float scale_y = (float)src_height / dst_height;

    for (int y = 0; y < dst_height; y++) {
        int src_y = (int)(y * scale_y);
        /* Float rounding can land one past the last row. */
        if (src_y >= src_height) src_y = src_height - 1;

        for (int x = 0; x < dst_width; x++) {
            int src_x = (int)(x * scale_x);
            if (src_x >= src_width) src_x = src_width - 1;

            for (int c = 0; c < 3; c++) {
                dst[(y * dst_width + x) * 3 + c] = src[(src_y * src_width + src_x) * 3 + c];
            }
        }
    }
}

int main(int argc, char** argv) {
if (argc != 3) {
printf("Usage: %s <rknn_model> <image_path>\n", argv[0]);
return -1;
}

const char* model_path = argv[1];
const char* image_path = argv[2];
const char* output_path = "result.jpg";

rknn_context ctx = 0;
rknn_input_output_num io_num;
int ret = 0;
int orig_width, orig_height;

printf("Loading model...\n");
ret = load_model(model_path, &ctx, &io_num);
if (ret != 0) {
printf("Load model failed!\n");
return -1;
}

unsigned char* input_data = (unsigned char*)malloc(MODEL_IN_WIDTH * MODEL_IN_HEIGHT * MODEL_IN_CHNS);
if (input_data == NULL) {
printf("Malloc input memory failed!\n");
rknn_destroy(ctx);
return -1;
}

printf("Preprocessing image...\n");
ret = preprocess_image(image_path, input_data, &orig_width, &orig_height);
if (ret != 0) {
printf("Preprocess image failed!\n");
free(input_data);
rknn_destroy(ctx);
return -1;
}
float* output_buffers[io_num.n_output];
rknn_output_attr out_attrs[io_num.n_output];

for (int i = 0; i < io_num.n_output; i++) {
memset(&out_attrs[i], 0, sizeof(out_attrs[i]));
out_attrs[i].index = i;
ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &out_attrs[i], sizeof(out_attrs[i]));
if (ret != RKNN_SUCC) {
printf("Query output attribute failed! ret=%d\n", ret);
free(input_data);
for (int j = 0; j < i; j++) {
free(output_buffers[j]);
}
rknn_destroy(ctx);
return -1;
}

output_buffers[i] = (float*)malloc(out_attrs[i].size);
if (output_buffers[i] == NULL) {
printf("Malloc output memory failed!\n");
free(input_data);
for (int j = 0; j < i; j++) {
free(output_buffers[j]);
}
rknn_destroy(ctx);
return -1;
}
printf("Output %d: size=%d\n", i, out_attrs[i].size);

}
rknn_input inputs[1];
inputs[0].index = 0;
inputs[0].buf = input_data;
inputs[0].size = MODEL_IN_WIDTH * MODEL_IN_HEIGHT * MODEL_IN_CHNS;
inputs[0].pass_through = 0;
inputs[0].type = RKNN_TENSOR_UINT8;
inputs[0].fmt = RKNN_TENSOR_NHWC;

ret = rknn_inputs_set(ctx, 1, inputs);
if (ret != RKNN_SUCC) {
printf("Set inputs failed! ret=%d\n", ret);
free(input_data);
for (int i = 0; i < io_num.n_output; i++) {
free(output_buffers[i]);
}
rknn_destroy(ctx);
return -1;
}
printf("Running inference...\n");
struct timeval start_time, end_time;
gettimeofday(&start_time, NULL);

ret = rknn_run(ctx, NULL);
if (ret != RKNN_SUCC) {
printf("Run model failed! ret=%d\n", ret);
free(input_data);
for (int i = 0; i < io_num.n_output; i++) {
free(output_buffers[i]);
}
rknn_destroy(ctx);
return -1;
}
rknn_output outputs[io_num.n_output];
for (int i = 0; i < io_num.n_output; i++) {
outputs[i].want_float = 1;
outputs[i].is_prealloc = 1;
outputs[i].buf = output_buffers[i];
outputs[i].size = out_attrs[i].size;
outputs[i].index = i;
}

ret = rknn_outputs_get(ctx, io_num.n_output, outputs, NULL);
if (ret != RKNN_SUCC) {
printf("Get outputs failed! ret=%d\n", ret);
free(input_data);
for (int i = 0; i < io_num.n_output; i++) {
free(output_buffers[i]);
}
rknn_destroy(ctx);
return -1;
}

gettimeofday(&end_time, NULL);
printf("Inference time: %f ms\n",
(end_time.tv_sec - start_time.tv_sec) * 1000 +
(end_time.tv_usec - start_time.tv_usec) / 1000.0);

printf("Postprocessing...\n");
detection_result results[MAX_DETECTIONS];
int result_count = 0;

float* output_ptrs[io_num.n_output];
for (int i = 0; i < io_num.n_output; i++) {
output_ptrs[i] = output_buffers[i];
}

ret = post_process(output_ptrs, out_attrs, io_num.n_output, orig_width, orig_height, results, &result_count);
if (ret != 0) {
printf("Post process failed!\n");
} else {
printf("Detected %d objects\n", result_count);
for (int i = 0; i < result_count; i++) {
printf(" %d: class=%d, score=%.2f, box=[%d, %d, %d, %d]\n",
i, results[i].cls_id, results[i].score,
results[i].x1, results[i].y1, results[i].x2, results[i].y2);
}
}

draw_boxes(image_path, results, result_count, output_path);
printf("Result saved to %s\n", output_path);

free(input_data);
for (int i = 0; i < io_num.n_output; i++) {
free(output_buffers[i]);
}
rknn_destroy(ctx);

return 0;

}

/*
 * Creates an RKNN context for the model at model_path and queries how many
 * inputs and outputs it has.
 *
 * ctx     receives the context; reset to 0 if the query step fails.
 * io_num  receives the input/output counts.
 *
 * Returns 0 on success and -1 on failure. On failure no context is left
 * open.
 */
int load_model(const char* model_path, rknn_context* ctx, rknn_input_output_num* io_num) {
    if (model_path == NULL || ctx == NULL || io_num == NULL) {
        printf("load_model: invalid argument\n");
        return -1;
    }

    /* rknn_init takes a non-const void* model argument, hence the cast.
     * Passing size 0 is assumed to make the runtime treat the argument as a
     * file path rather than model bytes; confirm against the rknn_api.h in
     * use. */
    int ret = rknn_init(ctx, (void*)model_path, 0, 0, NULL);
    if (ret != RKNN_SUCC) {
        printf("rknn_init failed! ret=%d\n", ret);
        return -1;
    }

    ret = rknn_query(*ctx, RKNN_QUERY_IN_OUT_NUM, io_num, sizeof(*io_num));
    if (ret != RKNN_SUCC) {
        printf("rknn_query failed! ret=%d\n", ret);
        rknn_destroy(*ctx);
        *ctx = 0; /* do not leave a destroyed handle with the caller */
        return -1;
    }

    printf("Model input num: %d, output num: %d\n", io_num->n_input, io_num->n_output);

    return 0;
}

/*
 * Loads a raw RGB888 frame and resizes it to the model input size.
 *
 * No image decoder is available here, so the file must contain exactly
 * 640 x 480 packed RGB pixels (921600 bytes). Anything else, including
 * JPEG or PNG files, is rejected. Previously the file bytes were passed to
 * resize_image() unchecked, which read past the end of the buffer for any
 * smaller file.
 *
 * input_data   receives MODEL_IN_WIDTH * MODEL_IN_HEIGHT * 3 bytes.
 * orig_width / orig_height   receive the source frame size (640 x 480).
 *
 * Returns 0 on success and -1 on failure.
 */
int preprocess_image(const char* image_path, unsigned char* input_data, int* orig_width, int* orig_height) {
    const int raw_width = 640;
    const int raw_height = 480;
    const long raw_size = (long)raw_width * raw_height * 3;

    if (image_path == NULL || input_data == NULL || orig_width == NULL || orig_height == NULL) {
        printf("preprocess_image: invalid argument\n");
        return -1;
    }

    FILE* fp = fopen(image_path, "rb");
    if (fp == NULL) {
        printf("Failed to open image: %s\n", image_path);
        return -1;
    }

    long file_size = -1;
    if (fseek(fp, 0, SEEK_END) == 0) {
        file_size = ftell(fp);
    }
    if (file_size < 0 || fseek(fp, 0, SEEK_SET) != 0) {
        printf("Failed to read image data\n");
        fclose(fp);
        return -1;
    }

    if (file_size != raw_size) {
        printf("Unsupported image %s: expected a raw RGB888 %dx%d frame (%ld bytes), got %ld bytes\n",
               image_path, raw_width, raw_height, raw_size, file_size);
        fclose(fp);
        return -1;
    }

    unsigned char* file_data = (unsigned char*)malloc((size_t)file_size);
    if (file_data == NULL) {
        printf("Failed to allocate memory for image\n");
        fclose(fp);
        return -1;
    }

    if (fread(file_data, 1, (size_t)file_size, fp) != (size_t)file_size) {
        printf("Failed to read image data\n");
        free(file_data);
        fclose(fp);
        return -1;
    }
    fclose(fp);

    *orig_width = raw_width;
    *orig_height = raw_height;

    /* Resize straight into the caller's buffer; no intermediate copy needed. */
    resize_image(file_data, raw_width, raw_height, input_data, MODEL_IN_WIDTH, MODEL_IN_HEIGHT);

    free(file_data);

    return 0;
}

/* Truncates value to int and clamps it into [0, upper]; NaN maps to 0. */
static int clamp_coord(float value, int upper) {
    if (!(value >= 0.0f)) {
        return 0;
    }
    if (value > (float)upper) {
        return upper;
    }
    return (int)value;
}

/*
 * Converts float model outputs into detection boxes and runs NMS on them.
 *
 * Each output tensor is read as consecutive records of NUM_CLASSES + 5
 * floats: center x, center y, width, height, objectness, then one score per
 * class. A record is kept when both its objectness and
 * objectness * best class score reach CONF_THRESHOLD. At most
 * MAX_DETECTIONS records are collected before NMS.
 *
 * NOTE(review): box values are treated as pixels of the
 * MODEL_IN_WIDTH x MODEL_IN_HEIGHT model input, which is what a stock
 * YOLOv5 ONNX export emits, and are rescaled to the original image. The
 * previous code multiplied them by the image size, which is only right for
 * outputs normalised to [0, 1]. Confirm against the model actually deployed.
 *
 * Returns 0 on success and -1 when a required pointer is NULL.
 */
int post_process(float* outputs[], rknn_output_attr* out_attrs, int num_outputs,
                 int orig_width, int orig_height, detection_result* results, int* result_count) {
    const int stride = NUM_CLASSES + 5;
    const float scale_x = (float)orig_width / MODEL_IN_WIDTH;
    const float scale_y = (float)orig_height / MODEL_IN_HEIGHT;

    if (outputs == NULL || out_attrs == NULL || results == NULL || result_count == NULL) {
        return -1;
    }

    *result_count = 0;

    for (int i = 0; i < num_outputs; i++) {
        int grid_size = out_attrs[i].dims[2]; /* reported for diagnostics only */
        int num_elements = out_attrs[i].n_elems;
        const float* data = outputs[i];

        printf("Output %d: grid_size=%d, num_elements=%d\n", i, grid_size, num_elements);

        /* "j + stride <= num_elements" keeps every field of the record
         * inside the tensor even when its size is not a multiple of stride. */
        for (int j = 0; j + stride <= num_elements && *result_count < MAX_DETECTIONS; j += stride) {
            float confidence = data[j + 4];

            if (confidence < CONF_THRESHOLD) {
                continue;
            }

            float max_class_prob = 0;
            int class_id = 0;
            for (int c = 0; c < NUM_CLASSES; c++) {
                float class_prob = data[j + 5 + c];
                if (class_prob > max_class_prob) {
                    max_class_prob = class_prob;
                    class_id = c;
                }
            }

            float final_score = confidence * max_class_prob;
            if (final_score < CONF_THRESHOLD) {
                continue;
            }

            float center_x = data[j + 0] * scale_x;
            float center_y = data[j + 1] * scale_y;
            float width = data[j + 2] * scale_x;
            float height = data[j + 3] * scale_y;

            detection_result* det = &results[*result_count];
            /* Clamp so boxes never extend outside the original image. */
            det->x1 = clamp_coord(center_x - width / 2, orig_width);
            det->y1 = clamp_coord(center_y - height / 2, orig_height);
            det->x2 = clamp_coord(center_x + width / 2, orig_width);
            det->y2 = clamp_coord(center_y + height / 2, orig_height);
            det->score = final_score;
            det->cls_id = class_id;

            (*result_count)++;
        }
    }

    nms(results, result_count);

    return 0;
}

/*
 * Writes the detections to output_path as a plain-text report.
 *
 * Despite the name, no pixels are drawn and image_path is not read: the
 * output file receives a count line followed by one line per detection.
 * Returns without writing anything if the file cannot be created.
 */
void draw_boxes(const char* image_path, detection_result* results, int result_count, const char* output_path) {
    FILE* report;
    int idx = 0;

    printf("Drawing %d boxes on image...\n", result_count);

    report = fopen(output_path, "wb");
    if (!report) {
        printf("Failed to create output image\n");
        return;
    }

    fprintf(report, "Detected %d objects:\n", result_count);
    while (idx < result_count) {
        const detection_result* det = &results[idx];
        fprintf(report, " %d: class=%d, score=%.2f, box=[%d, %d, %d, %d]\n",
                idx, det->cls_id, det->score,
                det->x1, det->y1, det->x2, det->y2);
        idx++;
    }

    fclose(report);
}

float sigmoid(float x) {
return 1.0f / (1.0f + expf(-x));
}

/*
 * Intersection-over-union of two boxes given by their corner coordinates.
 *
 * Returns a value in [0, 1]. Returns 0 when the boxes do not overlap, only
 * touch, or are degenerate (zero or negative union). The last case used to
 * divide 0 by 0 and return NaN.
 */
float calculate_iou(detection_result a, detection_result b) {
    int inter_x1 = a.x1 > b.x1 ? a.x1 : b.x1;
    int inter_y1 = a.y1 > b.y1 ? a.y1 : b.y1;
    int inter_x2 = a.x2 < b.x2 ? a.x2 : b.x2;
    int inter_y2 = a.y2 < b.y2 ? a.y2 : b.y2;

    if (inter_x2 <= inter_x1 || inter_y2 <= inter_y1) {
        return 0.0f;
    }

    /* 64-bit products: width * height of large boxes can overflow int. */
    long long area_a = (long long)(a.x2 - a.x1) * (a.y2 - a.y1);
    long long area_b = (long long)(b.x2 - b.x1) * (b.y2 - b.y1);
    long long inter_area = (long long)(inter_x2 - inter_x1) * (inter_y2 - inter_y1);
    long long union_area = area_a + area_b - inter_area;

    if (union_area <= 0) {
        return 0.0f;
    }

    return (float)inter_area / (float)union_area;
}

/*
 * Non-maximum suppression, in place.
 *
 * Orders results by descending score, then drops every box whose IoU with a
 * better-scored surviving box exceeds NMS_THRESHOLD. Suppression ignores
 * cls_id. Survivors are packed at the front of results and *result_count is
 * updated. A score of 0 is used internally as the "removed" marker.
 */
void nms(detection_result* results, int* result_count) {
    const int total = *result_count;
    int kept = 0;

    /* Exchange sort: after pass `lhs`, slot `lhs` holds the best remaining score. */
    for (int lhs = 0; lhs + 1 < total; ++lhs) {
        for (int rhs = lhs + 1; rhs < total; ++rhs) {
            if (results[rhs].score > results[lhs].score) {
                detection_result swapped = results[lhs];
                results[lhs] = results[rhs];
                results[rhs] = swapped;
            }
        }
    }

    /* Each surviving box suppresses the lower-ranked boxes it overlaps. */
    for (int winner = 0; winner < total; ++winner) {
        if (results[winner].score != 0) {
            for (int rival = winner + 1; rival < total; ++rival) {
                if (results[rival].score != 0 &&
                    calculate_iou(results[winner], results[rival]) > NMS_THRESHOLD) {
                    results[rival].score = 0; /* mark as removed */
                }
            }
        }
    }

    /* Compact the survivors to the front, preserving their order. */
    for (int idx = 0; idx < total; ++idx) {
        if (results[idx].score > 0) {
            results[kept++] = results[idx];
        }
    }
    *result_count = kept;
}