devranx committed on
Commit d9223ba · 0 Parent(s)

Initial commit

Files changed (7)
  1. .gitattributes +2 -0
  2. .gitignore +46 -0
  3. Colab_Runner.ipynb +83 -0
  4. README.md +39 -0
  5. app.py +381 -0
  6. requirements.txt +10 -0
  7. utils.py +420 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,46 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environments
+ venv/
+ .env
+ .venv
+
+ # Streamlit
+ .streamlit/
+ secrets.toml
+
+ # IDEs
+ .vscode/
+ .idea/
+
+ # Large Files / Data
+ *.jpg
+ *.jpeg
+ *.png
+ *.zip
+ *_crops/
+ annotations*.json
+
+ # Deployment logs
+ *.log
Colab_Runner.ipynb ADDED
@@ -0,0 +1,83 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# 🚀 Annotation Assistant - Colab Launcher\n",
+ "**Instructions:**\n",
+ "1. Upload all project files (`app.py`, `utils.py`, `requirements.txt`) to the Files panel on the left.\n",
+ "2. Add your Ngrok Authtoken below.\n",
+ "3. Run all cells."
+ ],
+ "metadata": {
+ "id": "intro_md"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 1. Install Dependencies\n",
+ "!pip install -r requirements.txt"
+ ],
+ "metadata": {
+ "id": "install_deps"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 2. Authenticate ngrok\n",
+ "# REPLACE 'YOUR_TOKEN' WITH YOUR ACTUAL TOKEN\n",
+ "from pyngrok import ngrok\n",
+ "ngrok.set_auth_token(\"YOUR_NGROK_AUTHTOKEN_HERE\")"
+ ],
+ "metadata": {
+ "id": "auth_ngrok"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 3. Run the App\n",
+ "import os\n",
+ "import time\n",
+ "from pyngrok import ngrok\n",
+ "\n",
+ "# Kill previous tunnels\n",
+ "ngrok.kill()\n",
+ "\n",
+ "# Run Streamlit in background\n",
+ "get_ipython().system_raw('streamlit run app.py &')\n",
+ "\n",
+ "# Open Tunnel\n",
+ "time.sleep(5) # Wait for start\n",
+ "public_url = ngrok.connect(8501).public_url\n",
+ "print(f\"🚀 Application Live at: {public_url}\")"
+ ],
+ "metadata": {
+ "id": "run_app"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }
README.md ADDED
@@ -0,0 +1,39 @@
+ # ✨ Annotation Assistant
+
+ ![Demo](demo_thumb.png)
+
+ ## Overview
+ Annotation Assistant is a state-of-the-art **Vision-Language Object Detection** tool. It combines the power of **Qwen-VL (4B)** with a premium, user-friendly interface to make labeled data creation effortless.
+
+ Unlike standard detection tools, this assistant is **conversational**. You can refine detections naturally (e.g., *"Also find the cup"*), and the AI intelligently merges new findings with existing ones.
+
+ ## Key Features
+
+ ### 🧠 **Intelligent Memory & Context**
+ The Assistant remembers what it has already found.
+ * **No Amnesia**: Unlike basic wrappers, this tool feeds its own previous detections back into the context.
+ * **Example**: If you say *"Find the laptop"* and then *"Find the remaining objects"*, it understands what "remaining" means because it knows the laptop is already detected.
+
+ ### 🎯 **Smart Refinement Logic**
+ We implemented a custom **Weighted Merge Algorithm** to handle updates (see the sketch after this README):
+ * **Refinement**: If a new detection draws a better box for `"shirt"` over an existing one (>80% overlap), it **replaces** the old one.
+ * **Distinct Objects**: If a second `"shirt"` turns up elsewhere (low overlap), it is **added** as a new object.
+ * Result: NO duplicate ghost boxes, NO accidental deletions.
+
+ ### 👁️ **Explainable AI (Reasoning)**
+ Don't just trust the box. The Assistant provides a **Reasoning Stream** explaining *why* it detected an object.
+ * *Example*: "Detected silver laptop due to distinct Apple logo and metallic finish."
+
+ ### 🎨 **Premium "Hero" Interface**
+ * **Single-Column Layout**: Your image takes center stage.
+ * **Dynamic Resizing**: Use the slider to scale the view from 300px to 1500px without losing layout structure.
+ * **Visuals**: Deep Space gradient theme, glassmorphism metrics, and auto-centering.
+
+ ## Quick Start
+ 1. **Upload**: Drag & drop your image into the central hub.
+ 2. **Prompt**: Type what you're looking for (e.g., *"Find all branded items"*).
+ 3. **Refine**: Chat with the AI to fix mistakes or add more items.
+ 4. **Download**: Export your data as **COCO JSON** or download a **ZIP of cropped images**.
+
+ ---
+ *Built with Streamlit, Qwen-VL, and ❤️.*
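
As implemented in `utils.smart_merge_detections` below, the merge rule described in the README is a plain IoU threshold: a new box with more than 0.8 IoU against an existing one replaces it, otherwise it is kept as an additional object. A standalone sketch of that behaviour (the `iou`/`merge` helpers here are illustrative stand-ins, not the app's own functions):

```python
# Sketch of the merge rule used by smart_merge_detections in utils.py:
# new boxes with IoU > 0.8 against an existing box replace it; others are added.

def iou(a, b):
    # Intersection-over-union for [x1, y1, x2, y2] boxes
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / float(area_a + area_b - inter + 1e-6)

def merge(existing, new):
    merged = existing.copy()
    for det in new:
        # Drop heavily overlapping old boxes (treated as refinements) ...
        merged = [d for d in merged if iou(d["box"], det["box"]) <= 0.8]
        # ... then keep the new detection either way.
        merged.append(det)
    return merged

existing  = [{"label": "shirt", "box": [100, 100, 300, 400]}]
refined   = [{"label": "shirt", "box": [105, 102, 305, 398]}]  # ~0.94 IoU -> replaces
elsewhere = [{"label": "shirt", "box": [600, 100, 800, 400]}]  # no overlap -> added

print(len(merge(existing, refined)))    # 1  (old box replaced)
print(len(merge(existing, elsewhere)))  # 2  (second shirt kept)
```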
app.py ADDED
@@ -0,0 +1,381 @@
+ import streamlit as st
+ import time
+ import utils
+ from PIL import Image
+ import numpy as np
+ import uuid
+
+ # Set page config
+ st.set_page_config(page_title="Annotation Assistant", layout="wide", page_icon="✨")
+
+ # --- Premium Custom CSS ---
+ st.markdown("""
+ <style>
+ @import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600&display=swap');
+
+ /* Global Theme */
+ html, body, [class*="css"] {
+     font-family: 'Outfit', sans-serif;
+ }
+
+ /* Background Gradient - "Deep Space" Theme */
+ .stApp {
+     background: radial-gradient(circle at top left, #1a202c, #0d1117);
+ }
+
+ /* Sidebar Styling */
+ section[data-testid="stSidebar"] {
+     background-color: #111827;
+     border-right: 1px solid #1F2937;
+ }
+
+ /* Hide Header and Default Elements */
+ header {visibility: hidden;}
+ .block-container {
+     padding-top: 1rem;
+     padding-bottom: 5rem;
+     max-width: 1000px;
+ }
+
+ /* Headers */
+ h1 {
+     background: -webkit-linear-gradient(45deg, #60A5FA, #34D399);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+     font-weight: 600;
+     letter-spacing: -0.02em;
+ }
+
+ /* Dotted Upload Box */
+ [data-testid='stFileUploader'] section {
+     border: 1px dashed #4A5568;
+     background-color: rgba(255, 255, 255, 0.02);
+     border-radius: 16px;
+     padding: 4rem 2rem;
+     min-height: 300px;
+     align-items: center;
+     justify-content: center;
+     transition: all 0.3s ease;
+ }
+ [data-testid='stFileUploader'] section:hover {
+     background-color: rgba(255, 255, 255, 0.05);
+     border-color: #60A5FA;
+     cursor: pointer;
+     box-shadow: 0 0 25px rgba(96, 165, 250, 0.15);
+     transform: scale(1.01);
+ }
+
+ /* Buttons - "Glass" Style */
+ .stButton > button {
+     border: 1px solid rgba(255,255,255,0.1);
+     border-radius: 8px;
+     background: rgba(255,255,255,0.05);
+     color: #E2E8F0;
+     font-weight: 500;
+     backdrop-filter: blur(5px);
+     transition: all 0.2s ease;
+ }
+ .stButton > button:hover {
+     background: rgba(255,255,255,0.1);
+     border-color: #60A5FA;
+     color: #FFFFFF;
+     box-shadow: 0 4px 12px rgba(0,0,0,0.2);
+ }
+
+ /* Secondary/Reset Button */
+ button[kind="secondary"] {
+     color: #F87171 !important;
+     border-color: rgba(248, 113, 113, 0.2) !important;
+ }
+ button[kind="secondary"]:hover {
+     background: rgba(248, 113, 113, 0.1) !important;
+     border-color: #F87171 !important;
+     box-shadow: 0 0 10px rgba(248, 113, 113, 0.2);
+ }
+
+ /* Session Buttons in Sidebar */
+ .session-btn {
+     width: 100%;
+     text-align: left;
+     margin-bottom: 5px;
+ }
+
+ /* Metrics Bar - Floating "Pill" */
+ .metric-pill {
+     display: flex;
+     align-items: center;
+     justify-content: center;
+     gap: 12px;
+     background: rgba(16, 24, 39, 0.8);
+     border: 1px solid #2D3748;
+     padding: 10px 24px;
+     border-radius: 100px;
+     margin: 20px auto; /* Centered */
+     width: fit-content;
+     font-size: 0.9rem;
+     color: #94A3B8;
+     box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3);
+ }
+ .metric-value {
+     color: #34D399;
+     font-family: 'JetBrains Mono', monospace;
+     font-weight: 600;
+ }
+
+ /* Reasoning Cards - Centered & Wide */
+ .reasoning-container {
+     margin-top: 20px;
+     background: rgba(30, 41, 59, 0.3);
+     border-radius: 12px;
+     padding: 15px;
+     border: 1px solid rgba(255,255,255,0.05);
+ }
+ .reasoning-card {
+     background: rgba(255,255,255,0.02);
+     border-left: 3px solid #3B82F6;
+     padding: 12px 16px;
+     margin-bottom: 10px;
+     border-radius: 0 8px 8px 0;
+ }
+ .reasoning-label {
+     font-weight: 600;
+     color: #E2E8F0;
+     font-size: 0.95rem;
+     margin-bottom: 4px;
+ }
+ .reasoning-text {
+     font-size: 0.85rem;
+     color: #94A3B8;
+     line-height: 1.5;
+ }
+
+ /* Input Area */
+ .stChatInputContainer {
+     padding-bottom: 2rem;
+ }
+
+ /* Slider Customization */
+ div[data-testid="stSlider"] > div {
+     max-width: 300px;
+     margin: auto;
+ }
+
+ /* CENTER IMAGES */
+ div[data-testid="stImage"] {
+     display: flex;
+     justify-content: center;
+     width: 100%;
+ }
+ div[data-testid="stImage"] > img {
+     margin: 0 auto;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # --- State Management ---
+ if "model_loaded" not in st.session_state:
+     st.session_state.model_loaded = False
+ if "sessions" not in st.session_state:
+     # Structure: { session_id: { name, history, detections, image, metrics, timestamp } }
+     st.session_state.sessions = {}
+ if "active_session_id" not in st.session_state:
+     st.session_state.active_session_id = None
+
+ # Helper 1: Create a new session
+ def create_session(name="New Chat"):
+     session_id = str(uuid.uuid4())
+     st.session_state.sessions[session_id] = {
+         "name": name,
+         "history": [],
+         "detections": [],
+         "image": None,
+         "metrics": {},
+         "created_at": time.time()
+     }
+     st.session_state.active_session_id = session_id
+     return session_id
+
+ # Helper 2: Get active session data
+ def get_active_session():
+     if not st.session_state.active_session_id:
+         create_session()
+     return st.session_state.sessions[st.session_state.active_session_id]
+
+ # Ensure at least one session exists
+ if not st.session_state.sessions:
+     create_session()
+
+ current_session = get_active_session()
+
+ # --- Sidebar (Session Manager) ---
+ with st.sidebar:
+     st.markdown("### 🗂️ Sessions")
+
+     if st.button("➕ New Chat", use_container_width=True, type="primary"):
+         create_session()
+         st.rerun()
+
+     st.markdown("---")
+
+     # Sort sessions by recency
+     sorted_sessions = sorted(
+         st.session_state.sessions.items(),
+         key=lambda x: x[1]['created_at'],
+         reverse=True
+     )
+
+     for s_id, s_data in sorted_sessions:
+         # Hide empty "New Chat" sessions from the list unless active
+         if s_data['image'] is None:
+             continue
+
+         is_active = (s_id == st.session_state.active_session_id)
+
+         display_name = s_data['name']
+         icon = "📂" if is_active else "📝"
+         label = f"{icon} {display_name}"
+
+         if st.button(label, key=f"sess_{s_id}", use_container_width=True, type="secondary" if not is_active else "primary"):
+             st.session_state.active_session_id = s_id
+             st.rerun()
+
+ # --- Model Loading ---
+ if not st.session_state.model_loaded:
+     with st.spinner("Initializing AI Core..."):
+         processor, model = utils.load_model()
+         if processor and model:
+             st.session_state.model_loaded = True
+             st.session_state.processor = processor
+             st.session_state.model = model
+             st.rerun()
+         else:
+             st.error("Model Engine Failure.")
+             st.stop()
+
+ # --- Main Workspace ---
+
+ # Header
+ col_logo, col_space = st.columns([6, 1])
+ with col_logo:
+     if current_session['name'] == "New Chat":
+         st.markdown("# Annotation Assistant")
+     else:
+         st.markdown(f"# {current_session['name']}")
+
+ # Logic
+ if current_session['image'] is None:
+     # --- Upload State ---
+     st.markdown(
+         "<h3 style='text-align: center; color: #94A3B8; border: none;'>Upload an image to start this session</h3>",
+         unsafe_allow_html=True
+     )
+
+     uploaded_file = st.file_uploader(
+         "Upload Image",
+         type=["jpg", "png", "jpeg"],
+         key=f"uploader_{st.session_state.active_session_id}",
+         label_visibility="collapsed"
+     )
+
+     if uploaded_file:
+         image = Image.open(uploaded_file).convert("RGB")
+         current_session['image'] = image
+         current_session['name'] = uploaded_file.name
+         st.rerun()
+
+ else:
+     # --- Analysis State ---
+
+     # Image Controls
+     img_width = st.slider("Adjust View Size", 300, 1500, 700, 50, help="Drag to resize the image view")
+     st.markdown("<br>", unsafe_allow_html=True)
+
+     # 1. Main visual (Hero)
+     display_image = current_session['image'].copy()
+
+     if current_session['detections']:
+         display_image = utils.draw_boxes(display_image, current_session['detections'])
+
+     st.image(display_image, width=img_width)
+
+     # 2. Results Actions & Metrics
+     if current_session['detections']:
+         # Metrics Row
+         if current_session['metrics']:
+             m = current_session['metrics']
+             st.markdown(f"""
+             <div class='metric-pill'>
+                 <span>Inference <span class='metric-value'>{m.get('inference_time', 0)}s</span></span>
+                 <span style='color: #4B5563'>|</span>
+                 <span>Total <span class='metric-value'>{m.get('total_time', 0)}s</span></span>
+                 <span style='color: #4B5563'>|</span>
+                 <span>Tokens <span class='metric-value'>{m.get('token_count', 0)}</span></span>
+             </div>
+             """, unsafe_allow_html=True)
+
+         # Download Row
+         c1, c2, c3 = st.columns([1, 1, 3])  # Bias to left
+         with c1:
+             # Pass image metadata for strict COCO compatibility
+             coco_json = utils.convert_to_coco(
+                 current_session['detections'],
+                 image_size=current_session['image'].size,
+                 filename=current_session['name']
+             )
+             st.download_button("Download JSON", coco_json, "annotations.json", "application/json", use_container_width=True)
+         with c2:
+             zip_buffer = utils.create_crops_zip(current_session['image'], current_session['detections'])
+             st.download_button("Download ZIP", zip_buffer, "crops.zip", "application/zip", use_container_width=True)
+
+         # 3. Reasoning Stream (Below)
+         st.markdown("<div style='height: 20px;'></div>", unsafe_allow_html=True)
+         st.markdown("### AI Insights")
+         with st.container():
+             st.markdown("<div class='reasoning-container'>", unsafe_allow_html=True)
+             for det in current_session['detections'][::-1]:
+                 label = det.get('label', 'Object')
+                 reasoning = det.get('reasoning', None)
+                 if not reasoning: reasoning = "Object detected based on visual features."
+                 st.markdown(f"""
+                 <div class='reasoning-card'>
+                     <div class='reasoning-label'>{label}</div>
+                     <div class='reasoning-text'>{reasoning}</div>
+                 </div>
+                 """, unsafe_allow_html=True)
+             st.markdown("</div>", unsafe_allow_html=True)
+
+     else:
+         # Image loaded but no detections
+         st.markdown(
+             "<div style='text-align: center; margin-top: 20px; color: #64748B; font-style: italic;'>"
+             "Waiting for instructions... Use the chat bar below."
+             "</div>",
+             unsafe_allow_html=True
+         )
+
+ # --- Floating Chat Bar ---
+ st.markdown("<br>", unsafe_allow_html=True)
+ prompt = st.chat_input("Describe objects to detect...")
+
+ if prompt:
+     if current_session['image'] is None:
+         st.error("Please upload an image first.")
+     else:
+         with st.status("Analyzing Scene...", expanded=True) as status:
+             detections, updated_history, raw_text, metrics = utils.get_bounding_boxes(
+                 current_session['image'],
+                 prompt,
+                 current_session['history'],
+                 st.session_state.processor,
+                 st.session_state.model
+             )
+
+             if detections:
+                 current_session['detections'] = utils.smart_merge_detections(current_session['detections'], detections)
+                 current_session['history'] = updated_history
+                 current_session['metrics'] = metrics
+                 status.update(label="Complete", state="complete", expanded=False)
+                 st.rerun()
+             else:
+                 status.update(label="No matches found.", state="error", expanded=False)
+                 st.toast("No match found.", icon="⚠️")
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ streamlit
+ transformers
+ torch
+ accelerate
+ pillow
+ opencv-python-headless
+ pyngrok
+ numpy
+ qwen_vl_utils
+ einops
utils.py ADDED
@@ -0,0 +1,420 @@
+ import os
+ import torch
+ import numpy as np
+ import json
+ import time
+ import io
+ import zipfile
+ from PIL import Image, ImageDraw, ImageFont
+ from transformers import AutoProcessor, AutoModelForVision2Seq
+ import streamlit as st
+ import re
+
+ # Constants
+ MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
+
+ @st.cache_resource
+ def load_model():
+     """
+     Loads the Qwen-VL model and processor.
+     """
+     print(f"Loading model: {MODEL_ID}...")
+     try:
+         processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+         model = AutoModelForVision2Seq.from_pretrained(
+             MODEL_ID,
+             device_map="auto",
+             trust_remote_code=True,
+             torch_dtype=torch.float16
+         )
+     except Exception as e:
+         print(f"Error loading {MODEL_ID}: {e}")
+         st.error(f"Could not load model {MODEL_ID}. Error: {e}")
+         return None, None
+
+     return processor, model
+
+ def get_bounding_boxes(image: Image.Image, prompt: str, history: list, processor, model):
+     """
+     Generates bounding boxes based on the image, prompt, and conversation history.
+     """
+     start_time = time.time()
+
+     if model is None or processor is None:
+         return [], history, "Model not loaded.", {}
+
+     # Construct conversation
+     messages = []
+
+     # Context
+     context_text = ""
+     if history:
+         context_text = "History:\n"
+         for msg in history:
+             role = "User" if msg['role'] == 'user' else "Assistant"
+             context_text += f"{role}: {msg['content']}\n"
+         context_text += "\n"
+
+     # Enhanced Prompt: JSON Focused With Reasoning
+     final_prompt = f"{context_text}User Request: {prompt}\n\nTask: Detect objects mentioned in the User Request.\nConstraint: Return the result ONLY as a JSON object with a key 'objects'.\nEach object in the list should have 'label', 'bbox' [x1, y1, x2, y2] (normalized coordinates 0-1000), AND 'reasoning' (a brief string explaining why this object matches).\nExample: {{'objects': [{{'label': 'cat', 'bbox': [100, 200, 500, 600], 'reasoning': 'Detected distinct feline features and whiskers.'}}]}}\nIf no objects are found, return {{'objects': []}}."
+
+     messages = [
+         {
+             "role": "system",
+             "content": "You are a precise object detection assistant. Return JSON with 'objects' list containing 'label', 'bbox' [x1, y1, x2, y2] (normalized coordinates 0-1000), and 'reasoning'."
+         },
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": final_prompt}
+             ]
+         }
+     ]
+
+     # Process inputs
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs = process_vision_info(messages)
+
+     # Default timing anchor in case preprocessing fails before generation starts
+     generate_start = time.time()
+
+     try:
+         inputs = processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = inputs.to(model.device)
+
+         # Generate (Measured)
+         generate_start = time.time()
+         generated_ids = model.generate(**inputs, max_new_tokens=512)
+         generate_end = time.time()
+
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         output_text = processor.batch_decode(
+             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+         )[0]
+
+     except Exception as e:
+         print(f"Inference Error: {e}")
+         output_text = f"Error: {e}"
+         generate_end = time.time()
+
+     # Update history
+     history.append({"role": "user", "content": prompt})
+     history.append({"role": "assistant", "content": output_text})
+
+     # Parse detections
+     detections = parse_qwen_output(output_text, image.width, image.height)
+
+     # Filter out boxes covering (almost) the whole image unless explicitly requested
+     filtered_detections = []
+     total_area = image.width * image.height
+
+     for det in detections:
+         x1, y1, x2, y2 = det['box']
+         box_area = (x2 - x1) * (y2 - y1)
+         coverage = box_area / total_area
+
+         is_suspicious_coverage = coverage > 0.95
+         is_whole_request = any(w in prompt.lower() for w in ["image", "picture", "photo", "background", "everything"])
+
+         if is_suspicious_coverage and not is_whole_request:
+             continue
+
+         filtered_detections.append(det)
+
+     # Metrics
+     end_time = time.time()
+     total_time = end_time - start_time
+     inference_time = generate_end - generate_start
+
+     metrics = {
+         "total_time": round(total_time, 2),
+         "inference_time": round(inference_time, 2),
+         "token_count": len(generated_ids[0]) if 'generated_ids' in locals() else 0
+     }
+
+     return filtered_detections, history, output_text, metrics
+
+ def smart_merge_detections(existing_detections, new_detections):
+     """
+     Merges new detections with existing ones.
+     Strategy: SIMPLE OVERLAP ONLY.
+     If IoU > 0.8 -> Assume duplicate/refinement -> Replace.
+     Else -> Keep.
+     """
+     merged_list = existing_detections.copy()
+
+     for new_det in new_detections:
+         new_box = new_det['box']
+         indices_to_remove = []
+
+         for i, old_det in enumerate(merged_list):
+             old_box = old_det['box']
+             iou = calculate_iou(new_box, old_box)
+
+             # Simple threshold check
+             if iou > 0.8:
+                 indices_to_remove.append(i)
+
+         for idx in sorted(indices_to_remove, reverse=True):
+             merged_list.pop(idx)
+
+         merged_list.append(new_det)
+
+     return merged_list
+
+ def calculate_iou(boxA, boxB):
+     xA = max(boxA[0], boxB[0])
+     yA = max(boxA[1], boxB[1])
+     xB = min(boxA[2], boxB[2])
+     yB = min(boxA[3], boxB[3])
+
+     interArea = max(0, xB - xA) * max(0, yB - yA)
+     boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
+     boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
+
+     iou = interArea / float(boxAArea + boxBArea - interArea + 1e-6)
+     return iou
+
+ def parse_qwen_output(text, width, height):
+     """
+     Parses Qwen-VL output, prioritizing JSON with reasoning.
+     """
+     detections = []
+
+     # 1. Try JSON Parsing (Primary Strategy)
+     try:
+         match = re.search(r'\{.*\}', text, re.DOTALL)
+         if match:
+             json_str = match.group()
+             data = json.loads(json_str)
+
+             if 'objects' in data and isinstance(data['objects'], list):
+                 for obj in data['objects']:
+                     x1, y1, x2, y2 = obj['bbox']
+                     label = obj.get('label', 'Object')
+                     reasoning = obj.get('reasoning', 'No reasoning provided')
+
+                     real_x1 = (x1 / 1000) * width
+                     real_y1 = (y1 / 1000) * height
+                     real_x2 = (x2 / 1000) * width
+                     real_y2 = (y2 / 1000) * height
+
+                     detections.append({
+                         "label": label,
+                         "box": [real_x1, real_y1, real_x2, real_y2],
+                         "score": 1.0,
+                         "reasoning": reasoning
+                     })
+     except Exception as e:
+         print(f"JSON Parse Error: {e}")
+         pass
+
+     # 2. Fallback to Standard Tags
+     if not detections:
+         pattern_standard = r"<\|box_start\|>(\d+),(\d+),(\d+),(\d+)<\|box_end\|>(?:<\|object_start\|>(.*?)<\|object_end\|>)?"
+         matches_standard = list(re.finditer(pattern_standard, text))
+         for match in matches_standard:
+             c1, c2, c3, c4 = map(int, match.groups()[:4])
+             label = match.group(5) if match.group(5) else "Object"
+             y1 = (c1 / 1000) * height
+             x1 = (c2 / 1000) * width
+             y2 = (c3 / 1000) * height
+             x2 = (c4 / 1000) * width
+             detections.append({
+                 "label": label,
+                 "box": [x1, y1, x2, y2],
+                 "score": 1.0,
+                 "reasoning": "Legacy detection mode"
+             })
+
+     return detections
+
+ def create_crops_zip(image: Image.Image, detections: list):
+     """
+     Creates a ZIP file containing cropped images of all detections.
+     """
+     zip_buffer = io.BytesIO()
+
+     # Ensure distinct filenames
+     counts = {}
+
+     with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+         for i, det in enumerate(detections):
+             label = det.get('label', 'object').replace(" ", "_").lower()
+             if label not in counts:
+                 counts[label] = 1
+             else:
+                 counts[label] += 1
+             label = f"{label}_{counts[label]}"
+
+             x1, y1, x2, y2 = map(int, det['box'])
+             x1 = max(0, x1)
+             y1 = max(0, y1)
+             x2 = min(image.width, x2)
+             y2 = min(image.height, y2)
+
+             if x2 > x1 and y2 > y1:
+                 crop = image.crop((x1, y1, x2, y2))
+                 crop_buffer = io.BytesIO()
+                 crop.save(crop_buffer, format="JPEG")
+                 zip_file.writestr(f"{label}.jpg", crop_buffer.getvalue())
+
+     zip_buffer.seek(0)
+     return zip_buffer
+
+ def process_vision_info(messages):
+     try:
+         from qwen_vl_utils import process_vision_info
+         return process_vision_info(messages)
+     except ImportError:
+         images = []
+         for msg in messages:
+             # Skip plain-string contents (e.g. the system message)
+             if not isinstance(msg["content"], list):
+                 continue
+             for item in msg["content"]:
+                 if item["type"] == "image":
+                     images.append(item["image"])
+         return images, None
+
+ def draw_boxes(image: Image.Image, detections: list):
+     """
+     Draws bounding boxes with dynamic font scaling.
+     """
+     draw = ImageDraw.Draw(image)
+
+     # Dynamic scaling for better visibility
+     min_dim = min(image.width, image.height)
+     scaled_font_size = max(20, int(min_dim * 0.035))
+     scaled_line_width = max(4, int(min_dim * 0.006))
+
+     font = None
+     try:
+         font_names = ["arial.ttf", "LiberationSans-Regular.ttf", "DejaVuSans.ttf"]
+         for fn in font_names:
+             try:
+                 font = ImageFont.truetype(fn, scaled_font_size)
+                 break
+             except:
+                 continue
+     except:
+         pass
+
+     if font is None:
+         try:
+             font = ImageFont.load_default()
+         except:
+             pass
+
+     palette = [
+         "#FF00FF", "#00FFFF", "#FF0000", "#00FF00",
+         "#FFFF00", "#FFA500", "#800080", "#008080"
+     ]
+
+     def get_color(text):
+         if not text: return palette[0]
+         idx = sum(ord(c) for c in text) % len(palette)
+         return palette[idx]
+
+     for det in detections:
+         box = det['box']
+         label = det.get('label', 'Object')
+         score_val = det.get('score', 1.0)
+         display_text = f"{label} {int(score_val*100)}%"
+
+         color = get_color(label)
+
+         x1, y1, x2, y2 = box
+         draw.rectangle([x1, y1, x2, y2], outline=color, width=scaled_line_width)
+
+         # Text box
+         if font:
+             text_bbox = draw.textbbox((x1, y1), display_text, font=font)
+             text_width = text_bbox[2] - text_bbox[0]
+             text_height = text_bbox[3] - text_bbox[1]
+
+             label_y = y1 - text_height - (scaled_line_width * 2)
+             if label_y < 0: label_y = y1
+
+             draw.rectangle(
+                 [x1, label_y, x1 + text_width + (scaled_line_width * 4), label_y + text_height + (scaled_line_width * 2)],
+                 fill=color
+             )
+             draw.text((x1 + (scaled_line_width), label_y), display_text, fill="black", font=font)
+
+     return image
+
+ def convert_to_coco(detections, image_size=(1000, 1000), filename="image.jpg"):
+     """
+     Converts detections to full Standard COCO JSON format.
+     """
+     width, height = image_size
+
+     # 1. Info
+     info = {
+         "year": 2025,
+         "version": "1.0",
+         "description": "Generated by Annotation Assistant (Qwen-VL)",
+         "date_created": time.strftime("%Y-%m-%d")
+     }
+
+     # 2. Images
+     images = [{
+         "id": 1,
+         "width": width,
+         "height": height,
+         "file_name": filename,
+         "license": 0,
+         "flickr_url": "",
+         "coco_url": "",
+         "date_captured": 0
+     }]
+
+     # 3. Categories & Annotations
+     categories = []
+     category_map = {}
+     annotations = []
+     cat_id_counter = 1
+
+     for i, det in enumerate(detections):
+         label = det.get('label', 'object')
+
+         # Manage Categories
+         if label not in category_map:
+             category_map[label] = cat_id_counter
+             categories.append({
+                 "id": cat_id_counter,
+                 "name": label,
+                 "supercategory": "object"
+             })
+             cat_id_counter += 1
+
+         x1, y1, x2, y2 = det['box']
+         w = x2 - x1
+         h = y2 - y1
+
+         ann = {
+             "id": i + 1,
+             "image_id": 1,
+             "category_id": category_map[label],
+             "bbox": [round(x1, 2), round(y1, 2), round(w, 2), round(h, 2)],
+             "area": round(w * h, 2),
+             "iscrowd": 0,
+             "attributes": {
+                 "reasoning": det.get('reasoning', '')
+             }
+         }
+         annotations.append(ann)
+
+     coco_output = {
+         "info": info,
+         "images": images,
+         "annotations": annotations,
+         "categories": categories,
+         "licenses": []
+     }
+
+     return json.dumps(coco_output, indent=2)
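
Downstream, the file exported by `convert_to_coco` is ordinary COCO JSON, so it can be read back with the standard library. A minimal sketch of consuming that structure, assuming it was saved under `annotations.json` (the default name used by the Download button in `app.py`):

```python
# Minimal sketch: read the COCO JSON exported by the app and print each box.
import json

with open("annotations.json") as f:
    coco = json.load(f)

# Map category ids back to labels
id_to_name = {c["id"]: c["name"] for c in coco["categories"]}

for ann in coco["annotations"]:
    x, y, w, h = ann["bbox"]                      # COCO-style [x, y, width, height]
    label = id_to_name[ann["category_id"]]
    reasoning = ann["attributes"].get("reasoning", "")
    print(f"{label}: ({x}, {y}, {w}, {h}) - {reasoning}")
```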