【CIE全国RISC-V创新应用大赛】+基于K1的人工智能终端及应用开发-手势-语音-人脸--第一阶段

本主题由 jf_26932943 于 2025-10-28 15:32 添加图标推荐

首先按照官方文档进行了相关流程的熟悉。

目前已经得到一个脚本文件：它基于 YOLOv8 的 pose 模型进行识别，并把手势识别、语音播放等逻辑整合在一起执行。

好吧，还是先上代码？

这个只能说是前期的结果。

import os

import sys

import threading

import time

import cv2

import numpy as np

import argparse

from utils import Yolov8Pose


# Optional TTS backend. When the spacemit_tts package cannot be imported the
# rest of the script keeps working and TTS_AVAILABLE tells callers to skip
# speech output.
try:
    from spacemit_tts import TTSModel, play_audio, play_wav, play_wav_non_blocking

    TTS_AVAILABLE = True
except ImportError:
    print("⚠️ TTS 模块不可用")
    # Bind the imported names anyway so that an accidental reference yields a
    # clear "NoneType is not callable" instead of a NameError far from here.
    TTSModel = play_audio = play_wav = play_wav_non_blocking = None
    TTS_AVAILABLE = False


class GestureRecognizer:
    """Rule-based hand-gesture classifier that works on a single frame.

    Every rule compares distances between named hand landmarks against fixed
    thresholds. Landmarks whose score is not above ``MIN_SCORE`` are ignored.
    All thresholds are in keypoint coordinate units (presumably pixels) and
    were marked by the original author as still needing tuning.
    """

    # Landmarks with a score at or below this value are treated as missing.
    MIN_SCORE = 0.3
    # Mean fingertip-to-MCP distance below which the hand counts as a fist.
    FIST_MAX_DISTANCE = 30
    # Largest fingertip-to-centroid distance above which the hand counts as open.
    OPEN_HAND_MIN_SPREAD = 50
    # How far the thumb tip must sit above the index tip for a thumbs-up.
    THUMBS_UP_MIN_RISE = 20

    # Errors that malformed or too-short keypoint/score input can produce.
    # Only these are swallowed; anything else (e.g. KeyboardInterrupt) propagates.
    _BAD_INPUT_ERRORS = (IndexError, KeyError, TypeError, ValueError)

    _FINGERS = ('thumb', 'index', 'middle', 'ring', 'pinky')

    def __init__(self):
        # 21-point hand landmark layout: the wrist plus four joints per finger.
        # NOTE(review): the original comment called these "COCO pose
        # keypoints", but a COCO body-pose model emits 17 body keypoints
        # (indices 0-16). With such input the rules that touch indices 17-20
        # can never match - confirm which model actually feeds this class.
        self.keypoint_indices = {
            'wrist': 0,
            'thumb_cmc': 1, 'thumb_mcp': 2, 'thumb_ip': 3, 'thumb_tip': 4,
            'index_mcp': 5, 'index_pip': 6, 'index_dip': 7, 'index_tip': 8,
            'middle_mcp': 9, 'middle_pip': 10, 'middle_dip': 11, 'middle_tip': 12,
            'ring_mcp': 13, 'ring_pip': 14, 'ring_dip': 15, 'ring_tip': 16,
            'pinky_mcp': 17, 'pinky_pip': 18, 'pinky_dip': 19, 'pinky_tip': 20,
        }

    def _indices(self, joint):
        """Return the landmark index of ``joint`` for each finger, thumb first."""
        return [self.keypoint_indices[f'{finger}_{joint}'] for finger in self._FINGERS]

    def recognize_gesture(self, keypoints, scores):
        """Classify the gesture of the first detection.

        Args:
            keypoints: sequence of detections; only ``keypoints[0]`` is used
                and is indexed as ``[landmark][coordinate]``.
            scores: per-landmark scores, indexed with the same landmark
                indices as ``keypoints[0]``.

        Returns:
            One of "No hand detected", "Waving", "Fist", "Open Hand",
            "Thumbs Up" or "Unknown Gesture". Rules are tried in that order
            and the first match wins.
        """
        if keypoints is None or len(keypoints) == 0:
            return "No hand detected"

        # Only the first detection is classified.
        hand_keypoints = keypoints[0]

        rules = (
            ("Waving", self._is_waving),
            ("Fist", self._is_fist),
            ("Open Hand", self._is_open_hand),
            ("Thumbs Up", self._is_thumbs_up),
        )
        for label, matches in rules:
            if matches(hand_keypoints, scores):
                return label
        return "Unknown Gesture"

    def _is_waving(self, keypoints, scores):
        """Placeholder for wave detection; always returns False.

        Waving is a motion, so it needs the wrist position across several
        frames. A single frame carries no such history.
        """
        return False

    def _is_fist(self, keypoints, scores):
        """Return True when the fingertips lie close to their MCP joints.

        At least three fingers must have both landmarks above ``MIN_SCORE``;
        malformed or too-short input yields False.
        """
        try:
            # asarray lets plain nested lists be subtracted like arrays.
            points = np.asarray(keypoints, dtype=float)
            distances = [
                np.linalg.norm(points[tip] - points[mcp])
                for tip, mcp in zip(self._indices('tip'), self._indices('mcp'))
                if scores[tip] > self.MIN_SCORE and scores[mcp] > self.MIN_SCORE
            ]
        except self._BAD_INPUT_ERRORS:
            return False

        if len(distances) < 3:
            return False
        return bool(np.mean(distances) < self.FIST_MAX_DISTANCE)

    def _is_open_hand(self, keypoints, scores):
        """Return True when the fingertips are spread far from their centroid.

        At least four fingertips must be above ``MIN_SCORE``; malformed or
        too-short input yields False.
        """
        try:
            points = np.asarray(keypoints, dtype=float)
            visible_tips = [
                points[idx]
                for idx in self._indices('tip')
                if scores[idx] > self.MIN_SCORE
            ]
            if len(visible_tips) < 4:
                return False
            tips = np.array(visible_tips)
            # Spread = largest distance of any fingertip from the tips' centroid.
            spread = np.max(np.linalg.norm(tips - tips.mean(axis=0), axis=1))
        except self._BAD_INPUT_ERRORS:
            return False
        return bool(spread > self.OPEN_HAND_MIN_SPREAD)

    def _is_thumbs_up(self, keypoints, scores):
        """Return True when the thumb tip is clearly above the index tip.

        "Above" means a smaller value in coordinate 1 (y). Only the thumb
        and index tips are inspected; the other fingers are not checked.
        """
        thumb_tip = self.keypoint_indices['thumb_tip']
        index_tip = self.keypoint_indices['index_tip']
        try:
            if not (scores[thumb_tip] > self.MIN_SCORE
                    and scores[index_tip] > self.MIN_SCORE):
                return False
            thumb_y = keypoints[thumb_tip][1]
            index_y = keypoints[index_tip][1]
            return bool(thumb_y < index_y - self.THUMBS_UP_MIN_RISE)
        except self._BAD_INPUT_ERRORS:
            return False


class GestureTTSController:
    """Ties pose detection, gesture classification and spoken feedback together.

    Frames go through the pose detector. When gesture recognition is enabled
    the detected keypoints are classified and actionable gestures trigger a
    TTS announcement, subject to a cooldown.
    """

    # Spoken response for every gesture that triggers an action.
    # NOTE(review): "Both Hands Raised" is not produced by GestureRecognizer
    # as seen in this file - confirm where it is expected to come from.
    GESTURE_ACTIONS = {
        "Thumbs Up": "开启人脸识别系统",
        "Waving": "你好，检测到挥手动作",
        "Both Hands Raised": "紧急情况，启动安全模式",
    }

    def __init__(self, model_path, conf_threshold=0.2, iou_threshold=0.45):
        """Create the detector, the gesture recognizer and (if possible) the TTS model.

        Args:
            model_path: path handed to ``Yolov8Pose``.
            conf_threshold: detection confidence threshold for ``Yolov8Pose``.
            iou_threshold: IoU threshold for ``Yolov8Pose``.
        """
        self.detector = Yolov8Pose(model_path, conf_threshold, iou_threshold)
        self.gesture_recognizer = GestureRecognizer()

        # TTS is optional: stay silent (print only) when it cannot be set up.
        self.tts_model = None
        if TTS_AVAILABLE:
            try:
                self.tts_model = TTSModel()
                # Run one synthesis up front so the first real announcement
                # does not pay the model warm-up cost.
                warm_up = "欢迎使用进迭时空手势识别系统"
                self.tts_model.ort_predict(warm_up)
                print("✅ TTS 模型初始化成功!")
            except Exception as e:
                print(f"❌ TTS 模型初始化失败: {e}")
                self.tts_model = None

        # Gesture detection state.
        self.gesture_running = False
        self.last_gesture_time = 0
        self.gesture_cooldown = 3  # seconds between two handled gestures
        # Separate timer that only throttles the log line for gestures
        # without an action, so those never consume the action cooldown.
        self._last_unmapped_log_time = 0

        # Keypoints/scores of the most recently processed frame.
        self.last_keypoints = []
        self.last_scores = []

    @staticmethod
    def _has_items(seq):
        """Return True when ``seq`` is a non-empty sequence.

        Uses ``len`` rather than truthiness because a multi-element NumPy
        array raises ValueError when evaluated as a boolean.
        """
        return seq is not None and len(seq) > 0

    def speak(self, text):
        """Synthesize ``text`` and play it; print it instead when TTS is unavailable."""
        if self.tts_model is None:
            print(f" [TTS不可用] {text}")
            return

        try:
            print(f" 播放: {text}")
            output_audio = self.tts_model.ort_predict(text)
            play_audio(output_audio)
        except Exception as e:
            # Speech is best-effort; a playback failure must not stop detection.
            print(f"❌ TTS 播放失败: {e}")

    def handle_gesture(self, gesture_type):
        """React to a recognized gesture.

        Gestures listed in ``GESTURE_ACTIONS`` are announced at most once per
        ``gesture_cooldown`` seconds. Any other result (e.g. "Unknown
        Gesture") is only logged, throttled by the same interval, and does
        NOT start the cooldown - otherwise a stream of unknown results would
        keep suppressing real gestures.
        """
        now = time.time()
        action_text = self.GESTURE_ACTIONS.get(gesture_type)

        if action_text is None:
            if now - self._last_unmapped_log_time >= self.gesture_cooldown:
                self._last_unmapped_log_time = now
                print(f" 识别到手势: {gesture_type}")
            return

        # Do not react to the same burst of frames over and over.
        if now - self.last_gesture_time < self.gesture_cooldown:
            return
        self.last_gesture_time = now

        print(f" 识别到手势: {gesture_type} -> {action_text}")
        self.speak(action_text)

        # Some gestures trigger a follow-up feature.
        if gesture_type == "Thumbs Up":
            self.start_face_recognition()

    def start_face_recognition(self):
        """Announce the face-recognition mode (the feature itself is not wired in yet)."""
        print(" 启动人脸识别系统...")
        self.speak("人脸识别系统已启动，正在检测面部特征")
        # Hook for calling the face-recognition module.

    def process_frame(self, frame):
        """Run pose detection on one frame and return the annotated image.

        Also refreshes ``last_keypoints``/``last_scores`` and, when
        ``gesture_running`` is set, classifies and handles the gesture.
        """
        result_image = self.detector.infer(frame)

        # NOTE(review): assumes the detector exposes the keypoints of its
        # last inference as `last_keypoints` (and optionally `last_scores`);
        # adjust to the real Yolov8Pose implementation.
        if hasattr(self.detector, 'last_keypoints'):
            keypoints = self.detector.last_keypoints
            has_keypoints = self._has_items(keypoints)
            if has_keypoints:
                # Without detector scores every landmark counts as fully confident.
                scores = getattr(self.detector, 'last_scores',
                                 [1.0] * len(keypoints[0]))
            else:
                scores = []

            self.last_keypoints = keypoints
            self.last_scores = scores

            if has_keypoints and self.gesture_running:
                gesture = self.gesture_recognizer.recognize_gesture(keypoints, scores)
                if gesture:
                    self.handle_gesture(gesture)

        # The Hershey fonts used by cv2.putText only cover ASCII; non-ASCII
        # text is drawn as question marks, so the overlay is kept in English.
        status_text = "Gesture detection: " + ("ON" if self.gesture_running else "OFF")
        cv2.putText(result_image, status_text, (20, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

        return result_image

    def process_image(self, image_path, output_path='gesture_result.jpg'):
        """Process one image file and write the annotated result.

        Returns:
            True when the result image was written, False when the input
            could not be read or the output could not be saved.
        """
        image = cv2.imread(image_path)
        if image is None:
            print(f"❌ 无法读取图像: {image_path}")
            return False

        result_image = self.process_frame(image)

        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(output_path, result_image):
            print(f"❌ 结果保存失败: {output_path}")
            return False
        print(f"✅ 结果保存至: {output_path}")
        return True

    def start_camera_detection(self, camera_id=0):
        """Run the interactive camera loop until 'q' is pressed or the stream ends.

        Keys: 'g' toggles gesture recognition, 's' saves the current
        annotated frame, 'q' quits.

        Returns:
            False when the camera cannot be opened, True otherwise.
        """
        cap = cv2.VideoCapture(camera_id)
        if not cap.isOpened():
            print(f"❌ 无法打开摄像头: {camera_id}")
            return False

        print(" 开始摄像头手势检测...")
        print("操作说明:")
        print(" - 按 'g' 开启/关闭手势识别")
        print(" - 按 's' 保存当前帧")
        print(" - 按 'q' 退出")

        self.gesture_running = True

        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    print("❌ 无法读取摄像头帧")
                    break

                result_image = self.process_frame(frame)
                cv2.imshow('Gesture Recognition System', result_image)

                # Mask to the low byte so key codes compare equal to ord().
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('g'):
                    self.gesture_running = not self.gesture_running
                    status = "开启" if self.gesture_running else "关闭"
                    print(f" 手势检测: {status}")
                elif key == ord('s'):
                    timestamp = int(time.time())
                    filename = f'capture_{timestamp}.jpg'
                    if cv2.imwrite(filename, result_image):
                        print(f" 当前帧已保存: {filename}")
                    else:
                        print(f"❌ 当前帧保存失败: {filename}")
        except KeyboardInterrupt:
            print("\n 用户中断")
        finally:
            # Always release the device and windows, even after Ctrl+C.
            cap.release()
            cv2.destroyAllWindows()
            self.gesture_running = False

        return True


def main():
    """Command-line entry point (first version of the script).

    Builds the controller from the CLI options, then either runs the camera
    loop (``--use-camera``) or processes a single image (``--image``) and
    hands the recognized gesture to the controller. With neither option a
    usage hint is printed.
    """
    parser = argparse.ArgumentParser(description='集成手势识别与TTS系统')
    parser.add_argument('--model', type=str, default='../model/yolov8n-pose.q.onnx', help='YOLOv8姿态模型路径')
    parser.add_argument('--image', type=str, help='输入图片路径')
    parser.add_argument('--use-camera', action='store_true', help='使用摄像头')
    parser.add_argument('--camera-id', type=int, default=0, help='摄像头ID')
    parser.add_argument('--conf-threshold', type=float, default=0.2, help='置信度阈值')
    parser.add_argument('--iou-threshold', type=float, default=0.45, help='IOU阈值')
    parser.add_argument('--output', type=str, default='result.jpg', help='输出图片路径')

    args = parser.parse_args()

    controller = GestureTTSController(
        model_path=args.model,
        conf_threshold=args.conf_threshold,
        iou_threshold=args.iou_threshold
    )

    if args.use_camera:
        controller.start_camera_detection(camera_id=args.camera_id)
    elif args.image:
        # process_image already reported the problem; nothing left to classify.
        if not controller.process_image(args.image, args.output):
            return

        # process_image leaves gesture recognition off, so classify the
        # stored keypoints here. Test emptiness with len(): a NumPy array of
        # keypoints raises ValueError when used directly as a condition.
        keypoints = controller.last_keypoints
        if keypoints is not None and len(keypoints) > 0:
            gesture = controller.gesture_recognizer.recognize_gesture(
                keypoints, controller.last_scores
            )
            if gesture:
                print(f" 图片中识别到手势: {gesture}")
                controller.handle_gesture(gesture)
    else:
        print("❌ 请提供图片路径或使用 --use-camera 选项")
        print("使用示例:")
        print(" python integrated_system.py --image path/to/image.jpg")
        print(" python integrated_system.py --use-camera")
        print(" python integrated_system.py --use-camera --camera-id 1")


if __name__ == "__main__":
    main()

先看看运行的提示和相关的结果：

最初的 main 函数逻辑还不完善，执行 python integrated_system.py --image /home/yinxx/Desktop/thumbs_up.jpg 后的输出是这样的：

load models take 9284.161806106567ms

Text split to sentences.

欢迎使用进迭时空手势识别系统

===========================

split_sentences_into_pieces take 3.3266544342041016ms

Sentence[0]: 欢迎使用进迭时空手势识别系统

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.

Load language module take 34801.37300491333ms

Building prefix dict from the default dictionary ...

Loading model from cache /tmp/jieba.cache

Loading model cost 5.676 seconds.

Prefix dict has been built successfully.

encoder run take 238.28ms

Decode slice[0]: decoder run take 7301.61ms

Decode slice[1]: decoder run take 7165.67ms

Decode slice[2]: decoder run take 7174.27ms

Decode slice[3]: decoder run take 7155.17ms

Save to /tmp/tmp5dfcg390.wav

✅ TTS 模型初始化成功!

✅ 结果保存至: result.jpg

于是随后修改添加了一些逻辑：

def main():
    """Command-line entry point (revised version).

    Compared with the first version, image mode now reports every outcome:
    unreadable input, no keypoints detected, no clear gesture, or a
    recognized gesture that is forwarded to ``handle_gesture``.
    """
    parser = argparse.ArgumentParser(description='集成手势识别与TTS系统')
    parser.add_argument('--model', type=str, default='../model/yolov8n-pose.q.onnx', help='YOLOv8姿态模型路径')
    parser.add_argument('--image', type=str, help='输入图片路径')
    parser.add_argument('--use-camera', action='store_true', help='使用摄像头')
    parser.add_argument('--camera-id', type=int, default=0, help='摄像头ID')
    parser.add_argument('--conf-threshold', type=float, default=0.2, help='置信度阈值')
    parser.add_argument('--iou-threshold', type=float, default=0.45, help='IOU阈值')
    parser.add_argument('--output', type=str, default='result.jpg', help='输出图片路径')
    args = parser.parse_args()

    controller = GestureTTSController(
        model_path=args.model,
        conf_threshold=args.conf_threshold,
        iou_threshold=args.iou_threshold
    )

    if args.use_camera:
        # Camera mode (unchanged from the first version).
        controller.start_camera_detection(camera_id=args.camera_id)
    elif args.image:
        # Image mode: load the picture here and call process_frame directly
        # instead of going through process_image.
        image = cv2.imread(args.image)
        if image is None:
            print(f"❌ 无法读取图像: {args.image}")
            return
        print(">>> 正在处理静态图片...")

        # process_frame runs pose estimation, draws the result and refreshes
        # controller.last_keypoints / controller.last_scores.
        result_image = controller.process_frame(image)

        # cv2.imwrite signals failure through its return value, so only
        # claim success when it actually wrote the file.
        if cv2.imwrite(args.output, result_image):
            print(f"✅ 结果图像已保存至: {args.output}")
        else:
            print(f"❌ 结果图像保存失败: {args.output}")

        # Test emptiness with len(): a NumPy array of keypoints raises
        # ValueError when used directly as a condition.
        keypoints = controller.last_keypoints
        if keypoints is not None and len(keypoints) > 0:
            print(">>> 检测到关键点，开始手势识别...")
            gesture = controller.gesture_recognizer.recognize_gesture(
                keypoints, controller.last_scores
            )
            # Only real gestures are forwarded; the two placeholder results
            # mean nothing usable was recognized.
            if gesture and gesture not in ["Unknown Gesture", "No hand detected"]:
                # Key change: handle_gesture triggers the announcement and
                # any follow-up action.
                print(f">>> 静态图片中识别到手势: {gesture}")
                controller.handle_gesture(gesture)
            else:
                print(">>> 静态图片中未识别到明确手势。")
        else:
            print(">>> 静态图片中未检测到人体关键点。")
    else:
        # Usage hint.
        print("❌ 请提供图片路径或使用 --use-camera 选项")
        print("使用示例:")
        print(" python integrated_system.py --image path/to/image.jpg")
        print(" python integrated_system.py --use-camera")


if __name__ == "__main__":
    main()