"""YOLO11 AXEngine inference demo.

Loads an image, runs it through an AXEngine model, decodes the three
detection heads (strides 8/16/32, DFL box regression), applies per-class
NMS and writes the annotated image to ``detections.png``.
"""

import argparse
from dataclasses import dataclass

import axengine as axe
import cv2
import numpy as np

# COCO Class Names
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
    'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
    'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
    'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
    'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
    'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
    'teddy bear', 'hair drier', 'toothbrush'
]

# Strides of the three detection heads, in the order the model emits them.
_HEAD_STRIDES = (8, 16, 32)


@dataclass
class Object:
    """One detection, expressed in original-image pixel coordinates."""

    bbox: list  # [x0, y0, width, height]
    label: int
    prob: float


def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    return 1 / (1 + np.exp(-x))


def softmax(x, axis=-1):
    """Return the softmax of ``x`` along ``axis``.

    The maximum is subtracted first so the exponentials cannot overflow.
    """
    x = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(x)
    return e_x / np.sum(e_x, axis=axis, keepdims=True)


def decode_distributions(feat, reg_max=16):
    """Turn DFL logits into expected distances.

    Args:
        feat: Array whose last axis holds ``reg_max`` logits.
        reg_max: Number of discrete bins per box side.

    Returns:
        The expectation ``sum(softmax(feat) * [0..reg_max-1])`` over the last
        axis (the last axis is reduced away).
    """
    prob = softmax(feat, axis=-1)
    dis = np.sum(prob * np.arange(reg_max), axis=-1)
    return dis


def preprocess(image_path, input_size):
    """Read an image and build the model input tensor.

    Args:
        image_path: Path of the image to load.
        input_size: ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        ``(input_tensor, original_shape, image)`` where ``input_tensor`` is
        the resized RGB image with a leading batch axis (uint8),
        ``original_shape`` is ``(height, width)`` of the source image and
        ``image`` is the full-resolution RGB image.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Unable to read image file: {image_path}")
    original_shape = image.shape[:2]
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    resized_image = cv2.resize(image, input_size)
    input_tensor = np.expand_dims(resized_image, axis=0).astype(np.uint8)
    return input_tensor, original_shape, image


def _decode_head(output, stride, reg_max, confidence_threshold,
                 original_shape, input_size):
    """Decode one detection head into ``(batch_index, Object)`` candidates.

    ``output`` is indexed as ``(batch, grid_h, grid_w, channels)``; the first
    ``4 * reg_max`` channels are DFL box logits (left, top, right, bottom)
    and the remaining channels are per-class logits.
    """
    batch_size, grid_h, grid_w, channels = output.shape
    bbox_channels = 4 * reg_max
    # Derive the class count from the tensor instead of assuming 80.
    num_classes = channels - bbox_channels
    if num_classes < 1:
        raise ValueError(
            f"Head with {channels} channels has no class channels "
            f"for reg_max={reg_max}"
        )

    num_cells = grid_h * grid_w
    # Keep the batch axis so every image is decoded from its own logits.
    bbox_part = output[:, :, :, :bbox_channels].reshape(
        batch_size, num_cells, 4, reg_max)
    class_part = output[:, :, :, bbox_channels:].reshape(
        batch_size, num_cells, num_classes)

    orig_h, orig_w = original_shape[0], original_shape[1]
    scale_x = orig_w / input_size[0]
    scale_y = orig_h / input_size[1]
    cell_index = np.arange(num_cells)

    candidates = []
    for b in range(batch_size):
        logits = class_part[b]
        class_ids = np.argmax(logits, axis=1)
        # Sigmoid is monotonic, so only the winning logit needs converting.
        probs = sigmoid(logits[cell_index, class_ids])
        selected = np.flatnonzero(probs >= confidence_threshold)
        if selected.size == 0:
            continue

        # (k, 4) distances in input pixels: left, top, right, bottom.
        dist = decode_distributions(bbox_part[b, selected], reg_max) * stride
        center_x = (selected % grid_w + 0.5) * stride
        center_y = (selected // grid_w + 0.5) * stride

        # Map to the original image and clamp to its bounds.
        x0 = np.clip((center_x - dist[:, 0]) * scale_x, 0, orig_w - 1)
        y0 = np.clip((center_y - dist[:, 1]) * scale_y, 0, orig_h - 1)
        x1 = np.clip((center_x + dist[:, 2]) * scale_x, 0, orig_w - 1)
        y1 = np.clip((center_y + dist[:, 3]) * scale_y, 0, orig_h - 1)

        for k, cell in enumerate(selected):
            candidates.append((b, Object(
                bbox=[float(x0[k]), float(y0[k]),
                      float(x1[k] - x0[k]), float(y1[k] - y0[k])],
                label=int(class_ids[cell]),
                prob=float(probs[cell]),
            )))
    return candidates


def _nms(boxes, scores, iou_threshold):
    """Greedy non-maximum suppression.

    Args:
        boxes: ``(n, 4)`` array of ``[x0, y0, width, height]``.
        scores: ``(n,)`` array of confidences.
        iou_threshold: Boxes whose IoU with a kept box exceeds this value
            are suppressed.

    Returns:
        List of kept row indices, highest score first.
    """
    left = boxes[:, 0]
    top = boxes[:, 1]
    right = boxes[:, 0] + boxes[:, 2]
    bottom = boxes[:, 1] + boxes[:, 3]
    areas = (right - left) * (bottom - top)

    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        best = order[0]
        keep.append(best)
        rest = order[1:]
        if rest.size == 0:
            break

        inter_w = np.maximum(
            0, np.minimum(right[best], right[rest])
            - np.maximum(left[best], left[rest]))
        inter_h = np.maximum(
            0, np.minimum(bottom[best], bottom[rest])
            - np.maximum(top[best], top[rest]))
        intersection = inter_w * inter_h
        union = areas[best] + areas[rest] - intersection
        # Two zero-area boxes give union == 0; treat that as "no overlap"
        # instead of producing NaN from 0/0.
        iou = np.divide(
            intersection, union,
            out=np.zeros(intersection.shape, dtype=float),
            where=union > 0,
        )
        order = rest[iou <= iou_threshold]
    return keep


def postprocess(outputs, original_shape, input_size, confidence_threshold,
                nms_threshold, reg_max=16):
    """Decode raw model outputs into final detections.

    Args:
        outputs: Sequence whose first three items are the head tensors for
            strides 8, 16 and 32, each indexed as
            ``(batch, grid_h, grid_w, 4 * reg_max + num_classes)``.
        original_shape: ``(height, width)`` of the source image.
        input_size: ``(width, height)`` the image was resized to.
        confidence_threshold: Minimum class probability for a candidate.
        nms_threshold: IoU above which a lower-scored box is suppressed.
        reg_max: Number of DFL bins per box side.

    Returns:
        List of ``Object`` with boxes in original-image pixels. NMS is run
        independently per batch item and per class; results are ordered by
        batch item, then class id, then descending score.
    """
    heads = [(outputs[i], stride) for i, stride in enumerate(_HEAD_STRIDES)]

    candidates = []
    for output, stride in heads:
        candidates.extend(_decode_head(
            output, stride, reg_max, confidence_threshold,
            original_shape, input_size))

    if not candidates:
        return []

    batch_ids = np.array([b for b, _ in candidates])
    boxes = np.array([det.bbox for _, det in candidates])
    scores = np.array([det.prob for _, det in candidates])
    class_ids = np.array([det.label for _, det in candidates])

    final_detections = []
    for b in np.unique(batch_ids):
        in_batch = batch_ids == b
        for cls in np.unique(class_ids[in_batch]):
            idxs = np.flatnonzero(in_batch & (class_ids == cls))
            for k in _nms(boxes[idxs], scores[idxs], nms_threshold):
                final_detections.append(candidates[idxs[k]][1])

    return final_detections


def main():
    """Command-line entry point: run detection on one image and save it."""
    parser = argparse.ArgumentParser(description="YOLO11 AXEngine Inference")
    parser.add_argument('--model', type=str, default='yolo11x.axmodel',
                        help='Model path')
    parser.add_argument('--image', type=str, default='dog.jpg',
                        help='Image path')
    parser.add_argument('--conf', type=float, default=0.45,
                        help='Confidence threshold')
    parser.add_argument('--nms', type=float, default=0.45,
                        help='NMS threshold')
    parser.add_argument('--size', type=int, nargs=2, default=[640, 640],
                        help='Input size W H')
    parser.add_argument('--regmax', type=int, default=16,
                        help='DFL reg_max value')
    args = parser.parse_args()

    try:
        input_tensor, original_shape, original_image = preprocess(
            args.image, tuple(args.size))
    except FileNotFoundError as e:
        print(e)
        return

    try:
        session = axe.InferenceSession(args.model)
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    input_name = session.get_inputs()[0].name
    output_names = [output.name for output in session.get_outputs()]

    try:
        outputs = session.run(output_names, {input_name: input_tensor})
    except Exception as e:
        print(f"Error during inference: {e}")
        return

    try:
        detections = postprocess(
            outputs,
            original_shape,
            tuple(args.size),
            args.conf,
            args.nms,
            reg_max=args.regmax
        )
    except Exception as e:
        print(f"Error during post-processing: {e}")
        return

    for det in detections:
        bbox = det.bbox
        score = det.prob
        class_id = det.label

        if class_id >= len(COCO_CLASSES):
            label = f"cls{class_id}: {score:.2f}"
        else:
            label = f"{COCO_CLASSES[class_id]}: {score:.2f}"

        x, y, w, h = map(int, bbox)
        cv2.rectangle(original_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        # Keep the caption inside the image when the box touches the top.
        text_y = max(y - 10, 15)
        cv2.putText(original_image, label, (x, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # The working image is RGB; OpenCV writes BGR.
    saved = cv2.imwrite('detections.png',
                        cv2.cvtColor(original_image, cv2.COLOR_RGB2BGR))
    if not saved:
        print("Error: failed to write detections.png")
        return
    print("结果已保存到 detections.png")


if __name__ == '__main__':
    main()