首先按照官方文档进行了相关流程的熟悉。
目前得到了一个脚本:它基于 YOLOv8 的 pose 模型进行识别,并把手势识别、语音播放等逻辑整合在一起执行。
好吧,还是先上代码?
这个只能说是前期的结果。
import os
import sys
import threading
import time
import cv2
import numpy as np
import argparse
from utils import Yolov8Pose
# The TTS package is optional: when it cannot be imported the script still
# runs, and TTS_AVAILABLE tells the rest of the module to skip speech output.
try:
    from spacemit_tts import TTSModel, play_audio, play_wav, play_wav_non_blocking
except ImportError:
    print("⚠️ TTS 模块不可用")
    TTS_AVAILABLE = False
else:
    TTS_AVAILABLE = True
class GestureRecognizer:
    """Rule-based classifier for single-hand gestures.

    The rules index a 21-point hand-landmark layout (wrist plus four joints
    for each of the five fingers, see ``keypoint_indices``). Inputs with
    fewer points cannot be classified by these rules and are reported as
    ``"Unknown Gesture"`` instead of being matched against unrelated points.

    Distance thresholds are compared directly against keypoint coordinates
    (presumably image pixels -- TODO confirm against the detector output).

    Args:
        score_threshold: Minimum per-keypoint confidence for a point to be
            used by a rule.
        fist_max_distance: A fist is reported when the mean fingertip-to-MCP
            distance is below this value.
        open_hand_min_spread: An open hand is reported when the fingertip
            farthest from the fingertip centroid is beyond this value.
        thumbs_up_min_offset: A thumbs-up is reported when the thumb tip's
            second coordinate is smaller than the index tip's by more than
            this value.
    """

    # Number of landmarks the rules below expect for one hand.
    NUM_HAND_KEYPOINTS = 21

    _FINGERTIP_NAMES = ('thumb_tip', 'index_tip', 'middle_tip', 'ring_tip', 'pinky_tip')
    _MCP_NAMES = ('thumb_mcp', 'index_mcp', 'middle_mcp', 'ring_mcp', 'pinky_mcp')

    def __init__(self, score_threshold=0.3, fist_max_distance=30,
                 open_hand_min_spread=50, thumbs_up_min_offset=20):
        self.score_threshold = score_threshold
        self.fist_max_distance = fist_max_distance
        self.open_hand_min_spread = open_hand_min_spread
        self.thumbs_up_min_offset = thumbs_up_min_offset
        self.keypoint_indices = {
            'wrist': 0,
            'thumb_cmc': 1, 'thumb_mcp': 2, 'thumb_ip': 3, 'thumb_tip': 4,
            'index_mcp': 5, 'index_pip': 6, 'index_dip': 7, 'index_tip': 8,
            'middle_mcp': 9, 'middle_pip': 10, 'middle_dip': 11, 'middle_tip': 12,
            'ring_mcp': 13, 'ring_pip': 14, 'ring_dip': 15, 'ring_tip': 16,
            'pinky_mcp': 17, 'pinky_pip': 18, 'pinky_dip': 19, 'pinky_tip': 20,
        }

    def recognize_gesture(self, keypoints, scores):
        """Classify the gesture of the first detected hand.

        Args:
            keypoints: Sequence of detections; only ``keypoints[0]`` is used.
                Each detection is a sequence of points (lists or arrays).
            scores: Per-keypoint confidences for the first detection, either
                flat or nested one level per detection.

        Returns:
            One of ``"No hand detected"``, ``"Waving"``, ``"Fist"``,
            ``"Open Hand"``, ``"Thumbs Up"`` or ``"Unknown Gesture"``.
        """
        if keypoints is None or len(keypoints) == 0:
            return "No hand detected"

        try:
            hand_keypoints = np.asarray(keypoints[0], dtype=float)
            hand_scores = self._first_hand_scores(scores)
        except (IndexError, TypeError, ValueError):
            return "Unknown Gesture"

        # Without all 21 landmarks the index table does not describe the
        # data, so any rule that happened to find its indices would be
        # comparing unrelated points.
        if (hand_keypoints.ndim != 2
                or len(hand_keypoints) < self.NUM_HAND_KEYPOINTS
                or len(hand_scores) < self.NUM_HAND_KEYPOINTS):
            return "Unknown Gesture"

        # Order matters: the first matching rule wins.
        if self._is_waving(hand_keypoints, hand_scores):
            return "Waving"
        if self._is_fist(hand_keypoints, hand_scores):
            return "Fist"
        if self._is_open_hand(hand_keypoints, hand_scores):
            return "Open Hand"
        if self._is_thumbs_up(hand_keypoints, hand_scores):
            return "Thumbs Up"
        return "Unknown Gesture"

    @staticmethod
    def _first_hand_scores(scores):
        """Return the first detection's confidences as a flat float array."""
        try:
            values = np.asarray(scores, dtype=float)
        except (TypeError, ValueError):
            # Ragged per-detection lists cannot form one array; use the first.
            values = np.asarray(scores[0], dtype=float)
        if values.ndim >= 2:
            values = values[0]
        return values.ravel()

    def _indices(self, names):
        """Map landmark names to their positions in a hand keypoint array."""
        return [self.keypoint_indices[name] for name in names]

    def _is_waving(self, keypoints, scores):
        """Detect a waving motion.

        Not implemented: this always returns False, so ``"Waving"`` is never
        reported. NOTE(review): presumably this needs keypoints from several
        consecutive frames rather than a single one -- confirm the design.
        """
        return False

    def _is_fist(self, keypoints, scores):
        """Detect a closed fist.

        A fist is reported when at least three fingers have a confident tip
        and MCP joint and their mean tip-to-MCP distance is below
        ``fist_max_distance``.
        """
        threshold = self.score_threshold
        try:
            points = np.asarray(keypoints, dtype=float)
            pairs = zip(self._indices(self._FINGERTIP_NAMES),
                        self._indices(self._MCP_NAMES))
            distances = [
                np.linalg.norm(points[tip] - points[mcp])
                for tip, mcp in pairs
                if scores[tip] > threshold and scores[mcp] > threshold
            ]
        except (IndexError, KeyError, TypeError, ValueError):
            # Malformed or too-short input simply means "not a fist".
            return False
        if len(distances) < 3:
            return False
        return bool(np.mean(distances) < self.fist_max_distance)

    def _is_open_hand(self, keypoints, scores):
        """Detect an open palm.

        An open hand is reported when at least four fingertips are confident
        and the one farthest from their centroid is beyond
        ``open_hand_min_spread``.
        """
        try:
            points = np.asarray(keypoints, dtype=float)
            visible_tips = [
                points[idx]
                for idx in self._indices(self._FINGERTIP_NAMES)
                if scores[idx] > self.score_threshold
            ]
        except (IndexError, KeyError, TypeError, ValueError):
            return False
        if len(visible_tips) < 4:
            return False
        tips = np.array(visible_tips)
        spread = np.max(np.linalg.norm(tips - tips.mean(axis=0), axis=1))
        return bool(spread > self.open_hand_min_spread)

    def _is_thumbs_up(self, keypoints, scores):
        """Detect a thumbs-up.

        Reported when both the thumb tip and index tip are confident and the
        thumb tip's second coordinate is smaller than the index tip's by more
        than ``thumbs_up_min_offset`` (assumes the second coordinate is y and
        grows downward, as in image coordinates -- TODO confirm).
        """
        try:
            thumb_tip = self.keypoint_indices['thumb_tip']
            index_tip = self.keypoint_indices['index_tip']
            if not (scores[thumb_tip] > self.score_threshold
                    and scores[index_tip] > self.score_threshold):
                return False
            thumb_y = float(keypoints[thumb_tip][1])
            index_y = float(keypoints[index_tip][1])
        except (IndexError, KeyError, TypeError, ValueError):
            return False
        return thumb_y < index_y - self.thumbs_up_min_offset
class GestureTTSController:
    """Combine pose detection, gesture recognition and spoken feedback.

    Args:
        model_path: Path of the pose model handed to ``Yolov8Pose``.
        conf_threshold: Confidence threshold handed to ``Yolov8Pose``.
        iou_threshold: IoU threshold handed to ``Yolov8Pose``.
    """

    # Announcement spoken for each gesture that triggers an action.
    GESTURE_ACTIONS = {
        "Thumbs Up": "开启人脸识别系统",
        "Waving": "你好,检测到挥手动作",
        "Both Hands Raised": "紧急情况,启动安全模式",
    }

    def __init__(self, model_path, conf_threshold=0.2, iou_threshold=0.45):
        self.detector = Yolov8Pose(model_path, conf_threshold, iou_threshold)
        self.gesture_recognizer = GestureRecognizer()

        # The TTS model stays None when the module is missing or fails to
        # load; speak() then only prints the text.
        self.tts_model = None
        if TTS_AVAILABLE:
            try:
                self.tts_model = TTSModel()
                # Run one synthesis up front so the first real announcement
                # does not pay the one-time start-up cost.
                warm_up = "欢迎使用进迭时空手势识别系统"
                self.tts_model.ort_predict(warm_up)
                print("✅ TTS 模型初始化成功!")
            except Exception as e:
                print(f"❌ TTS 模型初始化失败: {e}")
                self.tts_model = None

        self.gesture_running = False   # whether frames are checked for gestures
        self.last_gesture_time = 0     # time.time() of the last triggered action
        self.gesture_cooldown = 3      # minimum seconds between two actions
        self.last_keypoints = []       # detector output of the latest frame
        self.last_scores = []
        self._last_unmapped_gesture = None  # last action-less gesture logged

    def speak(self, text):
        """Synthesize ``text`` and play it; print it instead if TTS is off.

        NOTE(review): synthesis and playback are called synchronously here,
        so the caller (including the camera loop) presumably waits until the
        audio has finished -- confirm against ``play_audio``.
        """
        if self.tts_model is None:
            print(f" [TTS不可用] {text}")
            return
        try:
            print(f" 播放: {text}")
            output_audio = self.tts_model.ort_predict(text)
            play_audio(output_audio)
        except Exception as e:
            # Speech is best-effort; a TTS failure must not stop detection.
            print(f"❌ TTS 播放失败: {e}")

    def handle_gesture(self, gesture_type):
        """React to a recognized gesture.

        Gestures listed in ``GESTURE_ACTIONS`` are announced at most once per
        ``gesture_cooldown`` seconds. Gestures without an action are only
        logged (once per change) and never start the cooldown, so a stream of
        "Unknown Gesture" frames cannot suppress a real gesture.
        """
        action_text = self.GESTURE_ACTIONS.get(gesture_type)
        if action_text is None:
            if gesture_type != getattr(self, "_last_unmapped_gesture", None):
                print(f" 识别到手势: {gesture_type}")
                self._last_unmapped_gesture = gesture_type
            return

        if time.time() - self.last_gesture_time < self.gesture_cooldown:
            return
        self.last_gesture_time = time.time()
        self._last_unmapped_gesture = None

        print(f" 识别到手势: {gesture_type} -> {action_text}")
        self.speak(action_text)
        if gesture_type == "Thumbs Up":
            self.start_face_recognition()

        # Restart the cooldown once the announcement is done, so a long
        # announcement does not use up the whole cooldown window.
        self.last_gesture_time = time.time()

    def start_face_recognition(self):
        """Announce that face recognition has started (no recognition yet)."""
        print(" 启动人脸识别系统...")
        self.speak("人脸识别系统已启动,正在检测面部特征")

    def process_frame(self, frame):
        """Run detection on one frame and return the annotated image.

        Keypoints are read from the detector's ``last_keypoints`` attribute
        when it has one; whether ``Yolov8Pose`` provides it is not visible
        here (TODO confirm). If the attribute is missing, no gesture is ever
        recognized.
        """
        result_image = self.detector.infer(frame)

        if hasattr(self.detector, 'last_keypoints'):
            keypoints = self.detector.last_keypoints
            # len() works for lists and arrays; truth-testing an ndarray
            # with more than one element raises ValueError.
            has_detection = keypoints is not None and len(keypoints) > 0
            if has_detection:
                scores = getattr(self.detector, 'last_scores', None)
                if scores is None:
                    # No confidences available: treat every point as certain.
                    scores = [1.0] * len(keypoints[0])
            else:
                scores = []
            self.last_keypoints = keypoints
            self.last_scores = scores

            if has_detection and self.gesture_running:
                gesture = self.gesture_recognizer.recognize_gesture(keypoints, scores)
                if gesture:
                    self.handle_gesture(gesture)

        # cv2.putText replaces characters the Hershey fonts cannot draw with
        # question marks, so the overlay text has to be ASCII.
        status_text = "Gesture detection: " + ("ON" if self.gesture_running else "OFF")
        cv2.putText(result_image, status_text, (20, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
        return result_image

    def process_image(self, image_path, output_path='gesture_result.jpg'):
        """Process one image file and save the annotated result.

        Returns:
            True when the result was written, False when the image could not
            be read or the result could not be saved.
        """
        image = cv2.imread(image_path)
        if image is None:
            print(f"❌ 无法读取图像: {image_path}")
            return False

        result_image = self.process_frame(image)
        # cv2.imwrite signals failure through its return value.
        if not cv2.imwrite(output_path, result_image):
            print(f"❌ 无法保存结果图像: {output_path}")
            return False
        print(f"✅ 结果保存至: {output_path}")
        return True

    def start_camera_detection(self, camera_id=0):
        """Run the interactive camera loop until 'q' is pressed.

        Keys: 'g' toggles gesture recognition, 's' saves the current frame,
        'q' quits.

        Returns:
            False when the camera cannot be opened, True otherwise.
        """
        cap = cv2.VideoCapture(camera_id)
        if not cap.isOpened():
            print(f"❌ 无法打开摄像头: {camera_id}")
            return False

        print(" 开始摄像头手势检测...")
        print("操作说明:")
        print(" - 按 'g' 开启/关闭手势识别")
        print(" - 按 's' 保存当前帧")
        print(" - 按 'q' 退出")

        self.gesture_running = True
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    print("❌ 无法读取摄像头帧")
                    break

                result_image = self.process_frame(frame)
                cv2.imshow('Gesture Recognition System', result_image)

                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('g'):
                    self.gesture_running = not self.gesture_running
                    status = "开启" if self.gesture_running else "关闭"
                    print(f" 手势检测: {status}")
                elif key == ord('s'):
                    timestamp = int(time.time())
                    filename = f'capture_{timestamp}.jpg'
                    if cv2.imwrite(filename, result_image):
                        print(f" 当前帧已保存: {filename}")
                    else:
                        print(f"❌ 无法保存当前帧: {filename}")
        except KeyboardInterrupt:
            print("\n 用户中断")
        finally:
            # Always release the device and windows, even on Ctrl+C.
            cap.release()
            cv2.destroyAllWindows()
            self.gesture_running = False
        return True
def main():
    """Command-line entry point: run on the camera or on a single image."""
    parser = argparse.ArgumentParser(description='集成手势识别与TTS系统')
    parser.add_argument('--model', type=str, default='../model/yolov8n-pose.q.onnx', help='YOLOv8姿态模型路径')
    parser.add_argument('--image', type=str, help='输入图片路径')
    parser.add_argument('--use-camera', action='store_true', help='使用摄像头')
    parser.add_argument('--camera-id', type=int, default=0, help='摄像头ID')
    parser.add_argument('--conf-threshold', type=float, default=0.2, help='置信度阈值')
    parser.add_argument('--iou-threshold', type=float, default=0.45, help='IOU阈值')
    parser.add_argument('--output', type=str, default='result.jpg', help='输出图片路径')
    args = parser.parse_args()

    controller = GestureTTSController(
        model_path=args.model,
        conf_threshold=args.conf_threshold,
        iou_threshold=args.iou_threshold
    )

    if args.use_camera:
        controller.start_camera_detection(camera_id=args.camera_id)
    elif args.image:
        # Nothing to recognize if the image could not be processed.
        if not controller.process_image(args.image, args.output):
            return
        # process_image() leaves gesture handling off, so recognize here.
        # len() is used because truth-testing an ndarray can raise.
        keypoints = controller.last_keypoints
        if keypoints is not None and len(keypoints) > 0:
            gesture = controller.gesture_recognizer.recognize_gesture(
                keypoints, controller.last_scores
            )
            if gesture:
                print(f" 图片中识别到手势: {gesture}")
                controller.handle_gesture(gesture)
    else:
        print("❌ 请提供图片路径或使用 --use-camera 选项")
        print("使用示例:")
        print(" python integrated_system.py --image path/to/image.jpg")
        print(" python integrated_system.py --use-camera")
        print(" python integrated_system.py --use-camera --camera-id 1")


if __name__ == "__main__":
    main()
先看看运行的提示和相关的结果:
最初 main 函数的逻辑不完善,执行 python integrated_system.py --image /home/yinxx/Desktop/thumbs_up.jpg 后的输出是这样的:
load models take 9284.161806106567ms
Text split to sentences.
欢迎使用进迭时空手势识别系统
===========================
split_sentences_into_pieces take 3.3266544342041016ms
Sentence[0]: 欢迎使用进迭时空手势识别系统
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Load language module take 34801.37300491333ms
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 5.676 seconds.
Prefix dict has been built successfully.
encoder run take 238.28ms
Decode slice[0]: decoder run take 7301.61ms
Decode slice[1]: decoder run take 7165.67ms
Decode slice[2]: decoder run take 7174.27ms
Decode slice[3]: decoder run take 7155.17ms
Save to /tmp/tmp5dfcg390.wav
✅ TTS 模型初始化成功!
✅ 结果保存至: result.jpg
于是随后修改添加了一些逻辑:
def main():
    """Command-line entry point with step-by-step output for image mode."""
    parser = argparse.ArgumentParser(description='集成手势识别与TTS系统')
    parser.add_argument('--model', type=str, default='../model/yolov8n-pose.q.onnx', help='YOLOv8姿态模型路径')
    parser.add_argument('--image', type=str, help='输入图片路径')
    parser.add_argument('--use-camera', action='store_true', help='使用摄像头')
    parser.add_argument('--camera-id', type=int, default=0, help='摄像头ID')
    parser.add_argument('--conf-threshold', type=float, default=0.2, help='置信度阈值')
    parser.add_argument('--iou-threshold', type=float, default=0.45, help='IOU阈值')
    parser.add_argument('--output', type=str, default='result.jpg', help='输出图片路径')
    args = parser.parse_args()

    controller = GestureTTSController(
        model_path=args.model,
        conf_threshold=args.conf_threshold,
        iou_threshold=args.iou_threshold
    )

    if args.use_camera:
        controller.start_camera_detection(camera_id=args.camera_id)
    elif args.image:
        image = cv2.imread(args.image)
        if image is None:
            print(f"❌ 无法读取图像: {args.image}")
            return

        print(">>> 正在处理静态图片...")
        result_image = controller.process_frame(image)
        # cv2.imwrite signals failure through its return value.
        if cv2.imwrite(args.output, result_image):
            print(f"✅ 结果图像已保存至: {args.output}")
        else:
            print(f"❌ 无法保存结果图像: {args.output}")

        # len() is used because truth-testing an ndarray can raise.
        keypoints = controller.last_keypoints
        if keypoints is not None and len(keypoints) > 0:
            print(">>> 检测到关键点,开始手势识别...")
            gesture = controller.gesture_recognizer.recognize_gesture(
                keypoints, controller.last_scores
            )
            # These two results mean "nothing recognized", not a gesture.
            if gesture and gesture not in ["Unknown Gesture", "No hand detected"]:
                print(f">>> 静态图片中识别到手势: {gesture}")
                controller.handle_gesture(gesture)
            else:
                print(">>> 静态图片中未识别到明确手势。")
        else:
            print(">>> 静态图片中未检测到人体关键点。")
    else:
        print("❌ 请提供图片路径或使用 --use-camera 选项")
        print("使用示例:")
        print(" python integrated_system.py --image path/to/image.jpg")
        print(" python integrated_system.py --use-camera")


if __name__ == "__main__":
    main()
这样的话,就比较清晰执行了什么逻辑:
我希望能够成功检测出这个手势,但是运行后给出的提示是:

后来发现官方并没有提供相关的手势识别的模型?
于是我打算在网上找一下开源的模型试一试?
https://github.com/RionDsilvaCS/yolo-hand-pose
看看能不能在开发板上面进行训练并得到对应的模型文件
不过,官方只提供了x86的docker环境,没有riscv64的?头疼啊