import axengine as axe
import numpy as np
import cv2
import argparse
from dataclasses import dataclass

# COCO class names; the position in this list is the integer label id
# used to look up a detection's display name.
COCO_CLASSES = [
    # ids 0-9
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light",
    # ids 10-19
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",
    "cat", "dog", "horse", "sheep", "cow",
    # ids 20-29
    "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee",
    # ids 30-39
    "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    # ids 40-49
    "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange",
    # ids 50-59
    "broccoli", "carrot", "hot dog", "pizza", "donut",
    "cake", "chair", "couch", "potted plant", "bed",
    # ids 60-69
    "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven",
    # ids 70-79
    "toaster", "sink", "refrigerator", "book", "clock",
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]

@dataclass
class Object:
    """A single detection: its box, class label id and confidence."""

    # Box as [x0, y0, width, height]: top-left corner followed by extents.
    bbox: list
    # Integer class id (an index into the class-name table).
    label: int
    # Confidence score attached to the detection.
    prob: float

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of *x*, element-wise.

    Uses ``exp(-|x|)``, which is always in ``(0, 1]``, so large-magnitude
    inputs never overflow (the naive form overflows in ``exp(-x)`` for very
    negative *x* and emits a RuntimeWarning).

    Args:
        x: A scalar or array-like of logits.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like *x*.
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x).  For x < 0: e^x / (1 + e^x).  Both are the
    # same function; each branch is the form that only exponentiates a
    # non-positive number.
    result = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with an empty tuple turns a 0-d array back into a scalar and
    # leaves n-d arrays untouched, so scalar callers still get a scalar.
    return result[()]

def softmax(x, axis=-1):
    """Normalise *x* into probabilities along *axis*.

    The per-slice maximum is subtracted before exponentiating so the largest
    exponent is ``exp(0)``; this leaves the result unchanged mathematically
    while keeping ``np.exp`` from overflowing.

    Args:
        x: Array of logits.
        axis: Axis along which the values should sum to one.

    Returns:
        Array of the same shape as *x*.
    """
    peak = np.max(x, axis=axis, keepdims=True)
    exps = np.exp(x - peak)
    normaliser = np.sum(exps, axis=axis, keepdims=True)
    return exps / normaliser

def decode_distributions(feat, reg_max=16):
    """Collapse per-bin logits into their expected bin index.

    The last axis of *feat* is softmax-normalised and then averaged against
    the bin indices ``0 .. reg_max - 1``.

    Args:
        feat: Logits whose last axis has ``reg_max`` entries.
        reg_max: Number of bins along the last axis.

    Returns:
        The expectation over the last axis (that axis is reduced away).
    """
    bin_weights = softmax(feat, axis=-1)
    bin_indices = np.arange(reg_max)
    return np.sum(bin_weights * bin_indices, axis=-1)

def preprocess(image_path, input_size):
    """Load an image from disk and build the model input tensor.

    Args:
        image_path: Path of the image file to read.
        input_size: Target size handed to ``cv2.resize`` (main passes the
            ``--size`` values, documented there as ``W H``).

    Returns:
        A tuple ``(input_tensor, original_shape, image)``: the resized RGB
        image as uint8 with a leading axis of length 1, the first two
        entries of the loaded image's shape, and the full-size RGB image.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for the path.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise FileNotFoundError(f"Unable to read image file: {image_path}")

    height_width = bgr.shape[:2]
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    # Resize, then prepend the batch axis.
    batched = cv2.resize(rgb, input_size)[np.newaxis, ...].astype(np.uint8)
    return batched, height_width, rgb

def _nms_xywh(boxes, scores, iou_threshold):
    """Greedy NMS over ``[x, y, w, h]`` rows; return kept row indices, best score first."""
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = x1 + boxes[:, 2]
    y2 = y1 + boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        best = order[0]
        keep.append(best)
        rest = order[1:]
        if rest.size == 0:
            break
        inter_w = np.maximum(0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]))
        inter_h = np.maximum(0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]))
        intersection = inter_w * inter_h
        # Two zero-area boxes give 0/0 = NaN.  NaN fails the `<=` test below,
        # so such a box is suppressed; only the warning is silenced here.
        with np.errstate(divide='ignore', invalid='ignore'):
            iou = intersection / (areas[best] + areas[rest] - intersection)
        order = rest[np.where(iou <= iou_threshold)[0]]
    return keep


def postprocess(outputs, original_shape, input_size, confidence_threshold, nms_threshold, reg_max=16):
    """Turn the three raw head tensors into a list of NMS-filtered detections.

    Each head tensor is unpacked as ``(batch, grid_h, grid_w, channels)``.
    The first ``4 * reg_max`` channels are the left/top/right/bottom distance
    distributions and the remaining channels are per-class logits; the class
    count is taken from the tensor itself.  ``outputs[0]``, ``outputs[1]``
    and ``outputs[2]`` are decoded with strides 8, 16 and 32 respectively.

    Args:
        outputs: Sequence holding at least the three head tensors.
        original_shape: ``(height, width)`` of the image the boxes are
            mapped back onto.
        input_size: ``(width, height)`` of the network input.
        confidence_threshold: Minimum sigmoid class score for a cell to be
            decoded into a candidate box.
        nms_threshold: Boxes of the same class whose IoU with a
            higher-scoring box exceeds this value are dropped.
        reg_max: Number of bins in each distance distribution.

    Returns:
        A list of ``Object`` with ``bbox`` as ``[x0, y0, width, height]`` in
        original-image pixels, grouped by ascending class id and ordered by
        descending score within a class.  Candidates from every batch item
        are pooled into this one list before NMS.

    Raises:
        ValueError: If a head has no channels left over for class logits.
    """
    bbox_channels = 4 * reg_max
    img_h, img_w = original_shape[0], original_shape[1]
    # Network-input pixels -> original-image pixels (same for every cell).
    scale_x = img_w / input_size[0]
    scale_y = img_h / input_size[1]

    box_chunks, score_chunks, label_chunks = [], [], []
    for head_index, stride in enumerate((8, 16, 32)):
        output = np.asarray(outputs[head_index])
        batch_size, grid_h, grid_w, channels = output.shape
        num_classes = channels - bbox_channels
        if num_classes <= 0:
            raise ValueError(
                f"Head {head_index} has {channels} channels; expected more than "
                f"{bbox_channels} (4 * reg_max) to leave room for class scores"
            )

        num_cells = grid_h * grid_w
        # Keep the batch axis so every batch item is paired with its own boxes.
        dist_logits = output[..., :bbox_channels].reshape(batch_size, num_cells, 4, reg_max)
        class_logits = output[..., bbox_channels:].reshape(batch_size, num_cells, num_classes)

        # Best class per cell, then keep only the cells above the threshold.
        best_label = np.argmax(class_logits, axis=-1)
        best_prob = sigmoid(np.max(class_logits, axis=-1))
        batch_idx, cell_idx = np.nonzero(best_prob >= confidence_threshold)
        if cell_idx.size == 0:
            continue

        # Expected left/top/right/bottom distances, converted to input pixels.
        dist = decode_distributions(dist_logits[batch_idx, cell_idx], reg_max) * stride
        # Cells are flattened row-major, so column = index % grid_w.
        center_x = (cell_idx % grid_w + 0.5) * stride
        center_y = (cell_idx // grid_w + 0.5) * stride

        x0 = np.clip((center_x - dist[:, 0]) * scale_x, 0, img_w - 1)
        y0 = np.clip((center_y - dist[:, 1]) * scale_y, 0, img_h - 1)
        x1 = np.clip((center_x + dist[:, 2]) * scale_x, 0, img_w - 1)
        y1 = np.clip((center_y + dist[:, 3]) * scale_y, 0, img_h - 1)

        box_chunks.append(np.stack([x0, y0, x1 - x0, y1 - y0], axis=1))
        score_chunks.append(best_prob[batch_idx, cell_idx])
        label_chunks.append(best_label[batch_idx, cell_idx])

    if not box_chunks:
        return []

    boxes = np.concatenate(box_chunks, axis=0).astype(np.float64)
    scores = np.concatenate(score_chunks).astype(np.float64)
    class_ids = np.concatenate(label_chunks)

    # Class-wise NMS: boxes only compete with boxes of the same class.
    final_detections = []
    for cls in np.unique(class_ids):
        members = np.nonzero(class_ids == cls)[0]
        cls_boxes = boxes[members]
        cls_scores = scores[members]
        final_detections.extend(
            Object(
                bbox=cls_boxes[k].tolist(),
                label=int(cls),
                prob=float(cls_scores[k]),
            )
            for k in _nms_xywh(cls_boxes, cls_scores, nms_threshold)
        )
    return final_detections

def main():
    """Run one image through the model and save an annotated copy.

    Parses the command-line options, preprocesses the image, runs the
    AXEngine session, post-processes the outputs, draws every detection on
    the original image and writes it to ``detections.png``.  A failure at
    any stage is printed and the function returns without raising.
    """
    parser = argparse.ArgumentParser(description="YOLO11 AXEngine Inference")
    parser.add_argument('--model', type=str, default='yolo11x.axmodel', help='Model path')
    parser.add_argument('--image', type=str, default='dog.jpg', help='Image path')
    parser.add_argument('--conf', type=float, default=0.45, help='Confidence threshold')
    parser.add_argument('--nms', type=float, default=0.45, help='NMS threshold')
    parser.add_argument('--size', type=int, nargs=2, default=[640, 640], help='Input size W H')
    parser.add_argument('--regmax', type=int, default=16, help='DFL reg_max value')
    args = parser.parse_args()
    input_size = tuple(args.size)

    try:
        input_tensor, original_shape, original_image = preprocess(args.image, input_size)
    except FileNotFoundError as e:
        print(e)
        return

    try:
        session = axe.InferenceSession(args.model)
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    input_name = session.get_inputs()[0].name
    output_names = [output.name for output in session.get_outputs()]

    try:
        outputs = session.run(output_names, {input_name: input_tensor})
    except Exception as e:
        print(f"Error during inference: {e}")
        return

    try:
        detections = postprocess(
            outputs,
            original_shape,
            input_size,
            args.conf,
            args.nms,
            reg_max=args.regmax
        )
    except Exception as e:
        print(f"Error during post-processing: {e}")
        return

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    thickness = 2
    colour = (0, 255, 0)
    for det in detections:
        class_id = det.label
        # Fall back to a generic name for ids outside the class table.
        name = COCO_CLASSES[class_id] if class_id < len(COCO_CLASSES) else f"cls{class_id}"
        label = f"{name}: {det.prob:.2f}"
        x, y, w, h = map(int, det.bbox)
        cv2.rectangle(original_image, (x, y), (x + w, y + h), colour, thickness)
        # Text is anchored at its baseline; keep the baseline at least one
        # text height below the top edge so labels of boxes touching the top
        # of the image are not drawn outside the picture.
        (_, text_height), _ = cv2.getTextSize(label, font, font_scale, thickness)
        label_y = max(y - 10, text_height)
        cv2.putText(original_image, label, (x, label_y),
                    font, font_scale, colour, thickness)

    output_path = 'detections.png'
    try:
        # The image was converted to RGB after loading; convert back for OpenCV.
        saved = cv2.imwrite(output_path, cv2.cvtColor(original_image, cv2.COLOR_RGB2BGR))
    except Exception as e:
        print(f"Error saving result: {e}")
        return
    if not saved:
        # imwrite reports failure through its return value.
        print(f"Error saving result: could not write {output_path}")
        return
    # Message reads: "Results saved to detections.png".
    print("结果已保存到 detections.png")

if __name__ == '__main__':
    main()