# YOLOv11 single-class ("bag") detector running on a Rockchip NPU via RKNNLite.
import cv2
import numpy as np

from rknnlite.api import RKNNLite
# Path to the compiled RKNN model file.
MODEL_PATH = "detect.rknn"

# Single-class model: class id 0 -> "bag".
CLASS_NAMES = ["bag"]
class Yolo11Detector:
    """Single-class YOLO detector running on a Rockchip NPU via RKNNLite.

    Pipeline: letterbox preprocess -> NPU inference -> box decode + NMS.
    """

    def __init__(self, model_path):
        """Load the .rknn model and bring up the NPU runtime.

        Args:
            model_path: path to a compiled RKNN model file.

        Raises:
            RuntimeError: if the model fails to load or the runtime
                fails to initialize.
        """
        self.rknn = RKNNLite(verbose=False)

        # Load the compiled RKNN model. Explicit raises (not `assert`)
        # so the checks survive `python -O`.
        ret = self.rknn.load_rknn(model_path)
        if ret != 0:
            raise RuntimeError("❌ Load RKNN model failed")

        # Initialize the runtime pinned to NPU core 0.
        ret = self.rknn.init_runtime(core_mask=RKNNLite.NPU_CORE_0)
        if ret != 0:
            raise RuntimeError("❌ Init runtime failed")

        # Square model input resolution (pixels).
        self.input_size = 640
        # YOLO anchors keyed by stride (from the training config).
        # NOTE(review): not used by postprocess(), which assumes the model
        # emits already-decoded boxes — confirm against the exported model.
        self.anchors = {
            8: [[10, 13], [16, 30], [33, 23]],
            16: [[30, 61], [62, 45], [59, 119]],
            32: [[116, 90], [156, 198], [373, 326]],
        }

    def preprocess(self, img):
        """Letterbox a BGR image to the square model input.

        Keeps the aspect ratio, pads with gray (114) and converts BGR->RGB.

        Args:
            img: BGR image as an HxWx3 uint8 array.

        Returns:
            Tuple ``(tensor, scale, dw, dh)`` where ``tensor`` is
            (1, input_size, input_size, 3) uint8 and ``scale``/``dw``/``dh``
            let postprocess() map boxes back onto the original image.
        """
        h, w = img.shape[:2]
        scale = min(self.input_size / w, self.input_size / h)
        new_w, new_h = int(w * scale), int(h * scale)
        img_resized = cv2.resize(img, (new_w, new_h))
        # Gray canvas, resized image centered on it (letterbox padding).
        canvas = np.full((self.input_size, self.input_size, 3), 114, dtype=np.uint8)
        dw, dh = (self.input_size - new_w) // 2, (self.input_size - new_h) // 2
        canvas[dh:dh + new_h, dw:dw + new_w, :] = img_resized
        img_rgb = cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)
        return np.expand_dims(img_rgb, 0).astype(np.uint8), scale, dw, dh

    def postprocess(self, outputs, scale, dw, dh, conf_thresh=0.25, iou_thresh=0.45):
        """Decode raw model output into detections on the original image.

        Each prediction row is assumed to be
        ``[cx, cy, w, h, objectness, class_conf]`` in letterboxed pixels
        (single-class model).

        Args:
            outputs: list of model output arrays; outputs[0] is the
                prediction tensor, shape [1, N, C] (or [N, C]).
            scale, dw, dh: letterbox parameters from preprocess().
            conf_thresh: minimum objectness and combined score.
            iou_thresh: IoU threshold for NMS suppression.

        Returns:
            List of ``{"box": [x1, y1, x2, y2], "score", "class_id"}``
            dicts in original-image coordinates; empty list if nothing
            passes the thresholds.
        """
        # Flatten to (N, C). BUGFIX: use the channel dimension shape[-1];
        # the previous shape[1] is N for a [1, N, C] tensor, which would
        # scramble the rows.
        preds = outputs[0].reshape(-1, outputs[0].shape[-1])
        boxes, scores, class_ids = [], [], []

        for p in preds:
            conf = p[4]
            if conf < conf_thresh:
                continue
            cls_conf = p[5]  # single-class model: one class score
            score = conf * cls_conf
            if score < conf_thresh:
                continue
            # Center/size -> corners, then undo letterbox offset and scale.
            cx, cy, w, h = p[:4]
            x1 = (cx - w / 2 - dw) / scale
            y1 = (cy - h / 2 - dh) / scale
            x2 = (cx + w / 2 - dw) / scale
            y2 = (cy + h / 2 - dh) / scale
            boxes.append([x1, y1, x2, y2])
            scores.append(score)
            class_ids.append(0)  # single class

        if len(boxes) == 0:
            return []

        boxes = np.array(boxes)
        scores = np.array(scores)
        class_ids = np.array(class_ids)

        # Greedy NMS: take the highest-scoring box, drop overlaps, repeat.
        idxs = np.argsort(scores)[::-1]
        keep = []
        while len(idxs) > 0:
            i = idxs[0]
            keep.append(i)
            if len(idxs) == 1:
                break
            x1, y1, x2, y2 = boxes[i]
            xx1 = np.maximum(x1, boxes[idxs[1:], 0])
            yy1 = np.maximum(y1, boxes[idxs[1:], 1])
            xx2 = np.minimum(x2, boxes[idxs[1:], 2])
            yy2 = np.minimum(y2, boxes[idxs[1:], 3])
            inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
            area_i = (x2 - x1) * (y2 - y1)
            area_j = (boxes[idxs[1:], 2] - boxes[idxs[1:], 0]) * (boxes[idxs[1:], 3] - boxes[idxs[1:], 1])
            # Epsilon guards against division by zero for degenerate boxes.
            iou = inter / (area_i + area_j - inter + 1e-6)
            idxs = idxs[1:][iou < iou_thresh]

        results = []
        for i in keep:
            results.append({
                "box": boxes[i],
                "score": scores[i],
                "class_id": class_ids[i],
            })
        return results

    def detect(self, img):
        """Run the full pipeline on one BGR frame; return detections."""
        img_data, scale, dw, dh = self.preprocess(img)
        outputs = self.rknn.inference([img_data])
        results = self.postprocess(outputs, scale, dw, dh)
        return results

    def release(self):
        """Free NPU runtime resources; call once when done."""
        self.rknn.release()
if __name__ == "__main__":
    detector = Yolo11Detector(MODEL_PATH)
    cap = cv2.VideoCapture(0)  # 0 = default camera; can be a file path instead
    # Fail fast instead of looping on a dead capture source.
    if not cap.isOpened():
        raise RuntimeError("Failed to open video source")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            results = detector.detect(frame)

            # Draw each detection: green box plus "<class> <score>" label.
            for r in results:
                x1, y1, x2, y2 = map(int, r["box"])
                cls_id = r["class_id"]
                score = r["score"]
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f"{CLASS_NAMES[cls_id]} {score:.2f}", (x1, y1 - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

            cv2.imshow("YOLOv11 Detection", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release NPU, camera and windows even if the loop raises.
        detector.release()
        cap.release()
        cv2.destroyAllWindows()