# AutoControlSystem-git/Vision/yolo/yolov8_pt_seg.py


#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project -> File   ：yolov8_segment.py
@IDE    ：PyCharm
@Author ：hjw
@Version : 1.0.0
@Date   ：2024/8/20 9:25
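@Function   ：YOLOv8 .pt model instance-segmentation inference (pre-processing, NMS, mask post-processing, drawing)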
'''

# YOLOv8 .pt model: instance-segmentation inference
import cv2
import time
import numpy as np
import torch, torchvision
import torch.nn.functional as F


def load_model(model_path, device):
    """Load a pickled YOLOv8 checkpoint and return the network ready for inference.

    Args:
        model_path: Path (or file-like object) handed to ``torch.load``.
        device: ``map_location`` used while deserialising the checkpoint.

    Returns:
        The checkpoint's ``'ema'`` network when that entry is present and
        truthy, otherwise its ``'model'`` network, converted to FP32, fused and
        put in eval mode. The class names are attached to it as ``CLASSES``.
    """
    import inspect

    # NOTE(security): the checkpoint contains whole pickled module objects, so
    # loading it runs arbitrary pickle code. Only load files you trust.
    load_kwargs = {'map_location': device}
    if 'weights_only' in inspect.signature(torch.load).parameters:
        # PyTorch >= 2.6 defaults to weights_only=True, which refuses
        # checkpoints that store full model objects; older releases that lack
        # the keyword must not receive it at all.
        load_kwargs['weights_only'] = False
    checkpoint = torch.load(model_path, **load_kwargs)

    # Look the names up lazily: a dict.get() default is always evaluated, so
    # the previous form touched checkpoint['model'].names even when 'CLASSES'
    # was available and crashed if the model carried no `names` attribute.
    if 'CLASSES' in checkpoint:
        category_list = checkpoint['CLASSES']
    else:
        category_list = checkpoint['model'].names

    model = (checkpoint.get('ema') or checkpoint['model']).float()  # FP32 model
    model.CLASSES = category_list
    model.fuse().eval()
    return model


def data_preprocess(model, img, img_scale, device):
    """Convert an HWC image array into a batched float tensor for the network.

    Args:
        model: Network exposing a ``stride`` tensor; its maximum, but never less
            than 32, is used as the letterbox stride.
        img: Image array in HWC layout (treated as BGR; the channel axis is
            reversed after the transpose).
        img_scale: Target size forwarded to ``letterbox`` as ``new_shape``.
        device: Device the resulting tensor is moved to.

    Returns:
        Float tensor of the letterboxed image divided by 255, with a leading
        batch dimension added when the tensor is 3-D.
    """
    net_stride = max(int(model.stride.max()), 32)
    padded = letterbox(img, new_shape=img_scale, stride=net_stride, auto=True)[0]  # padded resize

    # HWC -> CHW, then flip the channel axis (BGR -> RGB); the reversed view is
    # copied into contiguous memory before it is wrapped as a tensor.
    channels_first = padded.transpose((2, 0, 1))[::-1]
    tensor = torch.from_numpy(np.ascontiguousarray(channels_first)).to(device)

    tensor = tensor.float()  # uint8 to fp32
    tensor /= 255  # 0 - 255 to 0.0 - 1.0
    return tensor[None] if len(tensor.shape) == 3 else tensor


def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    """Resize an image to fit ``new_shape`` and pad the remainder with ``color``.

    Args:
        im: Image array whose first two dimensions are (height, width).
        new_shape: Target (height, width), or a single int for a square target.
        color: Border colour handed to ``cv2.copyMakeBorder``.
        auto: If True, pad only up to the next multiple of ``stride``
            (minimum rectangle) instead of the full target size.
        scaleFill: If True (and ``auto`` is False), stretch to the target size
            with no padding.
        scaleup: If False, the image is only ever shrunk, never enlarged.
        stride: Stride used for the ``auto`` padding.

    Returns:
        Tuple ``(image, ratio, (dw, dh))`` where ``ratio`` holds the
        (width, height) scale factors and ``dw``/``dh`` are the per-side
        (already halved) horizontal and vertical padding.
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    dst_h, dst_w = new_shape[0], new_shape[1]

    # Uniform scale factor (new / old) that fits the image inside the target.
    scale = min(dst_h / src_h, dst_w / src_w)
    if not scaleup:  # shrink only, never enlarge
        scale = min(scale, 1.0)

    ratio = (scale, scale)  # width, height ratios
    resized_wh = (int(round(src_w * scale)), int(round(src_h * scale)))
    pad_w = dst_w - resized_wh[0]
    pad_h = dst_h - resized_wh[1]
    if auto:  # minimum rectangle: keep only the remainder modulo stride
        pad_w = np.mod(pad_w, stride)
        pad_h = np.mod(pad_h, stride)
    elif scaleFill:  # stretch: no border, independent ratios per axis
        pad_w = pad_h = 0.0
        resized_wh = (dst_w, dst_h)
        ratio = (dst_w / src_w, dst_h / src_h)

    # Split the total padding between the two sides.
    pad_w = pad_w / 2
    pad_h = pad_h / 2

    if (src_w, src_h) != resized_wh:
        im = cv2.resize(im, resized_wh, interpolation=cv2.INTER_LINEAR)

    # The +/-0.1 nudges send an odd pixel of padding to the bottom/right side.
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (pad_w, pad_h)


def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
                        labels=(), max_det=300, nc=0, max_time_img=0.05, max_nms=30000, max_wh=7680, ):
    """
    Filter raw head predictions by confidence and run non-maximum suppression per image.

    Args:
        prediction (torch.Tensor | list | tuple): Raw output indexed as (batch, 4 + nc + nm, candidates): four xywh
            box values, ``nc`` class scores and ``nm`` extra columns (the callers in this file use them as mask
            coefficients). For a list/tuple only the first element is used.
        conf_thres (float): Minimum class score for a candidate to be kept, in [0, 1].
        iou_thres (float): IoU threshold passed to ``torchvision.ops.nms``, in [0, 1].
        classes (list | None): If given, only detections whose class index is in this list are kept.
        agnostic (bool): If True, boxes of different classes suppress each other (no per-class offset).
        multi_label (bool): If True and ``nc > 1``, every class scoring above ``conf_thres`` yields its own row.
        labels (sequence): Optional per-image a-priori labels with rows (cls, x, y, w, h), appended as candidates.
        max_det (int): Maximum number of detections kept per image.
        nc (int): Number of classes; 0 means "all columns after the box", i.e. ``prediction.shape[1] - 4``.
        max_time_img (float): Seconds per image added to the fixed 0.5 s time budget.
        max_nms (int): Maximum number of boxes handed to NMS (highest confidence first).
        max_wh (int): Multiplier for the per-class box offset used to keep classes apart during NMS.
    Returns:
        output (list[torch.Tensor]): One tensor per image of shape (n, 6 + nm) with columns
            x1, y1, x2, y2, confidence, class index, followed by the ``nm`` extra columns. Images without
            detections, or not reached before the time limit, keep an empty (0, 6 + nm) tensor.
    Note:
        The xywh -> xyxy conversion is written through a transposed view, so the box columns of the tensor that
        was passed in are overwritten as well (unless it was first copied to the CPU for MPS).
    """
    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'
    if isinstance(prediction, (list, tuple)):  # YOLOv8 model in validation mode, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    device = prediction.device
    mps = 'mps' in device.type  # Apple MPS
    if mps:  # MPS not fully supported yet, convert tensors to CPU before NMS
        prediction = prediction.cpu()
    bs = prediction.shape[0]  # batch size
    nc = nc or (prediction.shape[1] - 4)  # number of classes
    nm = prediction.shape[1] - nc - 4  # number of extra (mask) columns after the class scores
    mi = 4 + nc  # mask start index
    xc = prediction[:, 4:mi].amax(1) > conf_thres  # candidates: best class score above threshold

    # Settings
    # min_wh = 2  # (pixels) minimum box width and height
    time_limit = 0.5 + max_time_img * bs  # seconds to quit after
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)

    prediction = prediction.transpose(-1, -2)  # shape(1,84,6300) to shape(1,6300,84)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])  # xywh to xyxy

    t = time.time()
    # Every slot initially refers to the same empty tensor; slots are replaced, never mutated, below.
    output = [torch.zeros((0, 6 + nm), device=prediction.device)] * bs
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[:, 2:4] < min_wh) | (x[:, 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            v = torch.zeros((len(lb), nc + nm + 4), device=x.device)
            v[:, :4] = xywh2xyxy(lb[:, 1:5])  # box
            v[range(len(lb)), lb[:, 0].long() + 4] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Detections matrix nx6 (xyxy, conf, cls)
        box, cls, mask = x.split((4, nc, nm), 1)

        if multi_label:
            # One output row per (box, class) pair whose score exceeds the threshold
            i, j = torch.where(cls > conf_thres)
            x = torch.cat((box[i], x[i, 4 + j, None], j[:, None].float(), mask[i]), 1)
        else:  # best class only
            conf, j = cls.max(1, keepdim=True)
            x = torch.cat((box, conf, j.float(), mask), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        if n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence and remove excess boxes

        # Batched NMS
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        i = i[:max_det]  # limit detections

        output[xi] = x[i]
        if mps:
            output[xi] = output[xi].to(device)  # move the result back to the original device
        if (time.time() - t) > time_limit:
            print(f'WARNING ⚠️ NMS time limit {time_limit:.3f}s exceeded')
            break  # time limit exceeded
    return output


def xywh2xyxy(x):
    """
    Turn centre-format boxes (x, y, width, height) into corner-format boxes (x1, y1, x2, y2), with (x1, y1) the
    top-left and (x2, y2) the bottom-right corner. The input is left untouched.
    Args:
        x (np.ndarray | torch.Tensor): Boxes whose last dimension holds (x, y, width, height).
    Returns:
        y (np.ndarray | torch.Tensor): New array/tensor of the same type, shape and dtype holding (x1, y1, x2, y2).
    """
    assert x.shape[-1] == 4, f'input shape last dimension expected 4 but input shape is {x.shape}'
    if isinstance(x, torch.Tensor):
        corners = torch.empty_like(x)
    else:
        corners = np.empty_like(x)
    centre = x[..., 0:2]
    half_extent = x[..., 2:4] / 2  # (half-width, half-height)
    corners[..., 0:2] = centre - half_extent  # top-left x, y
    corners[..., 2:4] = centre + half_extent  # bottom-right x, y
    return corners


def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True):
    """
    Map xyxy boxes from the letterboxed network input (img1_shape) back onto the original image (img0_shape).
    The boxes are modified in place and clipped to the original image bounds.
    Args:
        img1_shape (tuple): (height, width) of the image the boxes currently refer to.
        boxes (torch.Tensor): Boxes in (x1, y1, x2, y2) format; updated in place.
        img0_shape (tuple): (height, width) of the target image.
        ratio_pad (tuple): Optional (ratio, pad) pair; ``ratio[0]`` is used as the gain and ``pad`` as the (x, y)
            padding. When omitted both are derived from the two shapes.
        padding (bool): If True the letterbox padding is subtracted before rescaling; if False the boxes are only
            divided by the gain.
    Returns:
        boxes (torch.Tensor): The same object that was passed in, now in target-image coordinates.
    """
    if ratio_pad is not None:
        gain, pad = ratio_pad[0][0], ratio_pad[1]
    else:
        net_h, net_w = img1_shape[0], img1_shape[1]
        src_h, src_w = img0_shape[0], img0_shape[1]
        gain = min(net_h / src_h, net_w / src_w)  # gain = old / new
        pad_x = round((net_w - src_w * gain) / 2 - 0.1)
        pad_y = round((net_h - src_h * gain) / 2 - 0.1)
        pad = (pad_x, pad_y)  # wh padding

    if padding:
        boxes[..., [0, 2]] -= pad[0]  # remove horizontal border
        boxes[..., [1, 3]] -= pad[1]  # remove vertical border
    boxes[..., :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes


def clip_boxes(boxes, shape):
    """
    Clamp xyxy boxes in place so that they lie inside an image of the given shape.

    Args:
      boxes (torch.Tensor | np.ndarray): boxes whose last dimension starts with x1, y1, x2, y2; modified in place
      shape (tuple): image shape as (height, width)
    """
    max_y, max_x = shape[0], shape[1]
    if not isinstance(boxes, torch.Tensor):
        # Array path: clip each coordinate pair as a group and write it back.
        for cols, upper in (([0, 2], max_x), ([1, 3], max_y)):
            boxes[..., cols] = boxes[..., cols].clip(0, upper)
        return
    # Tensor path: clamp each column view in place (x1, y1, x2, y2).
    for col, upper in enumerate((max_x, max_y, max_x, max_y)):
        boxes[..., col].clamp_(0, upper)


def process_mask(protos, masks_in, bboxes, shape, ori_shape):
    """
    Build one binary mask per detection, cropped to that detection's bounding box.

    The prototype masks are combined with the per-detection coefficients, upsampled to the network input size,
    stripped of the letterbox border, resized to the original image size and finally cut to each box.

    Args:
        protos (torch.Tensor): prototype masks, [mask_dim, mask_h, mask_w]
        masks_in (torch.Tensor): mask coefficients, [n, mask_dim], n is the number of detections after NMS
        bboxes (torch.Tensor): [n, 4] boxes (x1, y1, x2, y2) in original-image pixels
        shape (tuple): network input size, (h, w)
        ori_shape (tuple): original image size, (h, w)

    Returns:
        (list[torch.Tensor]): n tensors; entry i is the (box_h, box_w) window of mask i holding 1 where the mask
            probability exceeds 0.5 and 0 elsewhere. An empty list is returned when n == 0.
    """
    num_masks = masks_in.shape[0]
    if num_masks == 0:
        # view(-1, mh, mw) is ambiguous for a 0-element tensor and raises, so bail out early.
        return []

    # Combine prototypes and coefficients, then bring the masks to the network input size
    c, mh, mw = protos.shape  # CHW
    masks = (masks_in @ protos.float().reshape(c, -1)).sigmoid().view(num_masks, mh, mw)
    masks = F.interpolate(masks[None], shape, mode='bilinear', align_corners=False)[0]  # CHW

    # Remove the letterbox border, then resize to the original image size
    gain = min(shape[0] / ori_shape[0], shape[1] / ori_shape[1])  # gain  = old / new
    pad = (shape[1] - ori_shape[1] * gain) / 2, (shape[0] - ori_shape[0] * gain) / 2  # wh padding
    # NOTE(review): letterbox()/scale_boxes() round the padding (x -/+ 0.1) while this truncates; for some aspect
    # ratios the crop window may therefore differ from the real border by one pixel - confirm before changing.
    top, left = int(pad[1]), int(pad[0])  # y, x
    bottom, right = int(shape[0] - pad[1]), int(shape[1] - pad[0])
    masks = masks[:, top:bottom, left:right]
    masks = F.interpolate(masks[None], ori_shape, mode='bilinear', align_corners=False)[0]  # CHW

    # Keep only the part of each mask that lies inside its box and binarise it in place
    crop_masks = []
    for i, mask in enumerate(masks):
        x1, y1, x2, y2 = (int(v) for v in bboxes[i][:4])
        crop_masks.append(mask[y1:y2, x1:x2].gt_(0.5))
    return crop_masks


def plot_result(det_cpu, dst_img, masks, category_names):
    """
    Draw a "label|score" caption and the minimum-area rectangle of each mask onto ``dst_img``.

    Args:
        det_cpu (np.ndarray): one row per detection; columns 0-3 are the box (x1, y1, x2, y2), column 4 the score
            and column 5 the class index.
        dst_img (np.ndarray): image that is drawn on in place.
        masks (sequence of torch.Tensor): per-detection masks covering the integer box of the matching row.
        category_names: mapping (or sequence) from class index to label text.

    Returns:
        (np.ndarray): ``dst_img`` itself, with the annotations added.

    Detections whose mask is empty (zero-area box or no foreground pixels) still get their caption, but no
    rectangle is drawn for them instead of failing inside OpenCV.
    """
    text_color = (0, 255, 255)
    outline_color = (0, 255, 0)
    for i, item in enumerate(det_cpu):
        # Caption at the centre of the box
        box_x1, box_y1, box_x2, box_y2 = item[0:4].astype(np.int32)
        label = category_names[int(item[5])]
        score = item[4]
        org = (int((box_x1 + box_x2) / 2), int((box_y1 + box_y2) / 2))
        text = '{}|{:.2f}'.format(label, score)
        cv2.putText(dst_img, text, org=org, fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.8, color=text_color,
                    thickness=2)

        # Binary (0 / 255) image of the mask; coordinates are relative to the box origin
        mask = masks[i].cpu().data.numpy().astype(int)
        binary = (mask != 0).astype(np.uint8) * 255
        if binary.size == 0:  # zero-area box: nothing to outline
            continue

        # Outline with the longest perimeter
        contours, _ = cv2.findContours(binary, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
        max_contour = max(contours, key=lambda contour: cv2.arcLength(contour, True), default=None)
        if max_contour is None:  # mask without any foreground pixel
            continue

        # cv2.boxPoints converts the rotated rectangle into its four corner points
        rect = cv2.minAreaRect(max_contour)
        box = cv2.boxPoints(rect)
        # Rotate the corner list so it starts at the corner with the smallest x + y (does not affect the drawing)
        startidx = box.sum(axis=1).argmin()
        box = np.roll(box, 4 - startidx, 0)
        # Shift from box-local to image coordinates and draw the rectangle
        box = box.reshape((-1, 1, 2)).astype(np.int32)
        box = box + np.array([box_x1, box_y1], dtype=np.int32)
        cv2.polylines(dst_img, [box], True, outline_color, 2)

    return dst_img
    # cv2.imwrite('rs.jpg', dst_img)


class yolov8_segment():
    """Holds a loaded YOLOv8 segmentation network and runs single-frame inference with it."""

    def __init__(self):
        super(yolov8_segment, self).__init__()

    def load_model(self, model_path, device):
        """Load the checkpoint at ``model_path`` onto ``device`` and remember both on the instance."""
        self.model = load_model(model_path, device)
        self.device = device

    def model_inference(self, frame, upd_arr, img_scale=(640, 640), conf_thres=0.25, iou_thres=0.3):
        """
        Run detection + segmentation on one frame.

        Args:
            frame (np.ndarray): input image in HWC layout.
            upd_arr: unused; kept so existing callers keep working.
            img_scale: letterbox target size forwarded to ``data_preprocess``.
            conf_thres (float): confidence threshold for NMS.
            iou_thres (float): IoU threshold for NMS.

        Returns:
            (tuple): ``(1, detections, image, masks, class_names)`` when something was found, where ``detections``
                is an ndarray with columns x1, y1, x2, y2, confidence, class index followed by the mask
                coefficients (boxes in original-image pixels), ``image`` is a copy of ``frame`` and ``masks`` is
                the list returned by ``process_mask``; otherwise ``(0, None, None, None, None)``.
        """
        # Inference only: disabling autograd avoids building a graph for every frame.
        with torch.no_grad():
            img = data_preprocess(self.model, frame, img_scale, self.device)
            ori_img = frame.copy()

            result = self.model(img, augment=False)
            preds = result[0]
            proto = result[1][-1]

            det = non_max_suppression(preds, conf_thres=conf_thres, iou_thres=iou_thres,
                                      nc=len(self.model.CLASSES))[0]
            if det.shape[0] == 0:
                return 0, None, None, None, None

            # Boxes back to original-image coordinates
            det[:, :4] = scale_boxes(img.shape[2:], det[:, :4], ori_img.shape)
            # Masks resized to the original image and cropped to their boxes
            masks = process_mask(proto[0], det[:, 6:], det[:, :4], img.shape[2:], ori_img.shape[0:2])

        category_names = self.model.CLASSES
        return 1, det.cpu().data.numpy(), ori_img, masks, category_names

    def clear(self):
        """Drop the reference to the loaded network; safe to call when nothing is loaded."""
        if hasattr(self, 'model'):
            del self.model

# model = yolov8_segment()
# model.load_model('./pt_model/yolov8n-seg.pt','cpu')
# cap = cv2.VideoCapture(1)
# while True:
#     # count_file = len(os.listdir('E:\\A_panckg\\cv_sdk_discharge\\video_save'))  # 数量
#     ret, frame = cap.read()
#     if ret:
#         frame_save_count = 1000
#         frame = cv2.resize(frame, (1280, 720))
#         img = model.model_inference(frame, 0)
#         cv2.imshow("imgrr", img)
#         cv2.waitKey(1)
#         #videoWriter(img)