devranx committed on
Commit d9223ba · 0 Parent(s)

Initial commit

Files changed (7)
  1. .gitattributes +2 -0
  2. .gitignore +46 -0
  3. Colab_Runner.ipynb +83 -0
  4. README.md +39 -0
  5. app.py +381 -0
  6. requirements.txt +10 -0
  7. utils.py +420 -0
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,46 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual Environments
+ venv/
+ .env
+ .venv
+
+ # Streamlit
+ .streamlit/
+ secrets.toml
+
+ # IDEs
+ .vscode/
+ .idea/
+
+ # Large Files / Data
+ *.jpg
+ *.jpeg
+ *.png
+ *.zip
+ *_crops/
+ annotations*.json
+
+ # Deployment logs
+ *.log
Colab_Runner.ipynb ADDED
@@ -0,0 +1,83 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# 🚀 Annotation Assistant - Colab Launcher\n",
+ "**Instructions:**\n",
+ "1. Upload all project files (`app.py`, `utils.py`, `requirements.txt`) to the Files panel on the left.\n",
+ "2. Add your Ngrok Authtoken below.\n",
+ "3. Run all cells."
+ ],
+ "metadata": {
+ "id": "intro_md"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 1. Install Dependencies\n",
+ "!pip install -r requirements.txt"
+ ],
+ "metadata": {
+ "id": "install_deps"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 2. Authenticate ngrok\n",
+ "# REPLACE 'YOUR_TOKEN' WITH YOUR ACTUAL TOKEN\n",
+ "from pyngrok import ngrok\n",
+ "ngrok.set_auth_token(\"YOUR_NGROK_AUTHTOKEN_HERE\")"
+ ],
+ "metadata": {
+ "id": "auth_ngrok"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# 3. Run the App\n",
+ "import os\n",
+ "import time\n",
+ "from pyngrok import ngrok\n",
+ "\n",
+ "# Kill previous tunnels\n",
+ "ngrok.kill()\n",
+ "\n",
+ "# Run Streamlit in background\n",
+ "get_ipython().system_raw('streamlit run app.py &')\n",
+ "\n",
+ "# Open Tunnel\n",
+ "time.sleep(5) # Wait for start\n",
+ "public_url = ngrok.connect(8501).public_url\n",
+ "print(f\"🚀 Application Live at: {public_url}\")"
+ ],
+ "metadata": {
+ "id": "run_app"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }
README.md ADDED
@@ -0,0 +1,39 @@
+ # ✨ Annotation Assistant
+
+ ![Demo](demo_thumb.png)
+
+ ## Overview
+ Annotation Assistant is a state-of-the-art **Vision-Language Object Detection** tool. It combines the power of **Qwen-VL (4B)** with a premium, user-friendly interface to make labeled data creation effortless.
+
+ Unlike standard detection tools, this assistant is **conversational**. You can refine detections naturally (e.g., *"Also find the cup"*), and the AI intelligently merges new findings with existing ones.
+
+ ## Key Features
+
+ ### 🧠 **Intelligent Memory & Context**
+ The Assistant remembers what it has already found.
+ * **No Amnesia**: Unlike basic wrappers, this tool feeds its own previous detections back into the context.
+ * **Example**: If you say *"Find the laptop"* and then *"Find the remaining objects"*, it understands what "remaining" means because it knows the laptop is already detected.
+
+ ### 🎯 **Smart Refinement Logic**
+ We implemented a custom **Weighted Merge Algorithm** to handle updates (see the sketch after this README):
+ * **Refinement**: If a new detection draws a better box for `"shirt"` over an existing one (>80% overlap), it **replaces** the old one.
+ * **Distinct Objects**: If a second `"shirt"` turns up elsewhere (low overlap), it is **added** as a new object.
+ * Result: NO duplicate ghost boxes, NO accidental deletions.
+
+ ### 👁️ **Explainable AI (Reasoning)**
+ Don't just trust the box. The Assistant provides a **Reasoning Stream** explaining *why* it detected an object.
+ * *Example*: "Detected silver laptop due to distinct Apple logo and metallic finish."
+
+ ### 🎨 **Premium "Hero" Interface**
+ * **Single-Column Layout**: Your image takes center stage.
+ * **Dynamic Resizing**: Use the slider to scale the view from 300px to 1500px without losing layout structure.
+ * **Visuals**: Deep Space gradient theme, glassmorphism metrics, and auto-centering.
+
+ ## Quick Start
+ 1. **Upload**: Drag & drop your image into the central hub.
+ 2. **Prompt**: Type what you're looking for (e.g., *"Find all branded items"*).
+ 3. **Refine**: Chat with the AI to fix mistakes or add more items.
+ 4. **Download**: Export your data as **COCO JSON** or download a **ZIP of cropped images**.
+
+ ---
+ *Built with Streamlit, Qwen-VL, and ❤️.*
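
As implemented in `utils.smart_merge_detections` below, the merge rule described in the README is a plain IoU threshold: a new box with more than 0.8 IoU against an existing one replaces it, otherwise it is kept as an additional object. A standalone sketch of that behaviour (the `iou`/`merge` helpers here are illustrative stand-ins, not the app's own functions):

```python
# Sketch of the merge rule used by smart_merge_detections in utils.py:
# new boxes with IoU > 0.8 against an existing box replace it; others are added.

def iou(a, b):
    # Intersection-over-union for [x1, y1, x2, y2] boxes
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / float(area_a + area_b - inter + 1e-6)

def merge(existing, new):
    merged = existing.copy()
    for det in new:
        # Drop heavily overlapping old boxes (treated as refinements) ...
        merged = [d for d in merged if iou(d["box"], det["box"]) <= 0.8]
        # ... then keep the new detection either way.
        merged.append(det)
    return merged

existing  = [{"label": "shirt", "box": [100, 100, 300, 400]}]
refined   = [{"label": "shirt", "box": [105, 102, 305, 398]}]  # ~0.94 IoU -> replaces
elsewhere = [{"label": "shirt", "box": [600, 100, 800, 400]}]  # no overlap -> added

print(len(merge(existing, refined)))    # 1  (old box replaced)
print(len(merge(existing, elsewhere)))  # 2  (second shirt kept)
```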
app.py ADDED
@@ -0,0 +1,381 @@
+ import streamlit as st
+ import time
+ import utils
+ from PIL import Image
+ import numpy as np
+ import uuid
+
+ # Set page config
+ st.set_page_config(page_title="Annotation Assistant", layout="wide", page_icon="✨")
+
+ # --- Premium Custom CSS ---
+ st.markdown("""
+ <style>
+ @import url('https://fonts.googleapis.com/css2?family=Outfit:wght@300;400;600&display=swap');
+
+ /* Global Theme */
+ html, body, [class*="css"] {
+     font-family: 'Outfit', sans-serif;
+ }
+
+ /* Background Gradient - "Deep Space" Theme */
+ .stApp {
+     background: radial-gradient(circle at top left, #1a202c, #0d1117);
+ }
+
+ /* Sidebar Styling */
+ section[data-testid="stSidebar"] {
+     background-color: #111827;
+     border-right: 1px solid #1F2937;
+ }
+
+ /* Hide Header and Default Elements */
+ header {visibility: hidden;}
+ .block-container {
+     padding-top: 1rem;
+     padding-bottom: 5rem;
+     max-width: 1000px;
+ }
+
+ /* Headers */
+ h1 {
+     background: -webkit-linear-gradient(45deg, #60A5FA, #34D399);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+     font-weight: 600;
+     letter-spacing: -0.02em;
+ }
+
+ /* Dotted Upload Box */
+ [data-testid='stFileUploader'] section {
+     border: 1px dashed #4A5568;
+     background-color: rgba(255, 255, 255, 0.02);
+     border-radius: 16px;
+     padding: 4rem 2rem;
+     min-height: 300px;
+     align-items: center;
+     justify-content: center;
+     transition: all 0.3s ease;
+ }
+ [data-testid='stFileUploader'] section:hover {
+     background-color: rgba(255, 255, 255, 0.05);
+     border-color: #60A5FA;
+     cursor: pointer;
+     box-shadow: 0 0 25px rgba(96, 165, 250, 0.15);
+     transform: scale(1.01);
+ }
+
+ /* Buttons - "Glass" Style */
+ .stButton > button {
+     border: 1px solid rgba(255,255,255,0.1);
+     border-radius: 8px;
+     background: rgba(255,255,255,0.05);
+     color: #E2E8F0;
+     font-weight: 500;
+     backdrop-filter: blur(5px);
+     transition: all 0.2s ease;
+ }
+ .stButton > button:hover {
+     background: rgba(255,255,255,0.1);
+     border-color: #60A5FA;
+     color: #FFFFFF;
+     box-shadow: 0 4px 12px rgba(0,0,0,0.2);
+ }
+
+ /* Secondary/Reset Button */
+ button[kind="secondary"] {
+     color: #F87171 !important;
+     border-color: rgba(248, 113, 113, 0.2) !important;
+ }
+ button[kind="secondary"]:hover {
+     background: rgba(248, 113, 113, 0.1) !important;
+     border-color: #F87171 !important;
+     box-shadow: 0 0 10px rgba(248, 113, 113, 0.2);
+ }
+
+ /* Session Buttons in Sidebar */
+ .session-btn {
+     width: 100%;
+     text-align: left;
+     margin-bottom: 5px;
+ }
+
+ /* Metrics Bar - Floating "Pill" */
+ .metric-pill {
+     display: flex;
+     align-items: center;
+     justify-content: center;
+     gap: 12px;
+     background: rgba(16, 24, 39, 0.8);
+     border: 1px solid #2D3748;
+     padding: 10px 24px;
+     border-radius: 100px;
+     margin: 20px auto; /* Centered */
+     width: fit-content;
+     font-size: 0.9rem;
+     color: #94A3B8;
+     box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3);
+ }
+ .metric-value {
+     color: #34D399;
+     font-family: 'JetBrains Mono', monospace;
+     font-weight: 600;
+ }
+
+ /* Reasoning Cards - Centered & Wide */
+ .reasoning-container {
+     margin-top: 20px;
+     background: rgba(30, 41, 59, 0.3);
+     border-radius: 12px;
+     padding: 15px;
+     border: 1px solid rgba(255,255,255,0.05);
+ }
+ .reasoning-card {
+     background: rgba(255,255,255,0.02);
+     border-left: 3px solid #3B82F6;
+     padding: 12px 16px;
+     margin-bottom: 10px;
+     border-radius: 0 8px 8px 0;
+ }
+ .reasoning-label {
+     font-weight: 600;
+     color: #E2E8F0;
+     font-size: 0.95rem;
+     margin-bottom: 4px;
+ }
+ .reasoning-text {
+     font-size: 0.85rem;
+     color: #94A3B8;
+     line-height: 1.5;
+ }
+
+ /* Input Area */
+ .stChatInputContainer {
+     padding-bottom: 2rem;
+ }
+
+ /* Slider Customization */
+ div[data-testid="stSlider"] > div {
+     max-width: 300px;
+     margin: auto;
+ }
+
+ /* CENTER IMAGES */
+ div[data-testid="stImage"] {
+     display: flex;
+     justify-content: center;
+     width: 100%;
+ }
+ div[data-testid="stImage"] > img {
+     margin: 0 auto;
+ }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # --- State Management ---
+ if "model_loaded" not in st.session_state:
+     st.session_state.model_loaded = False
+ if "sessions" not in st.session_state:
+     # Structure: { session_id: { name, history, detections, image, metrics, timestamp } }
+     st.session_state.sessions = {}
+ if "active_session_id" not in st.session_state:
+     st.session_state.active_session_id = None
+
+ # Helper 1: Create a new session
+ def create_session(name="New Chat"):
+     session_id = str(uuid.uuid4())
+     st.session_state.sessions[session_id] = {
+         "name": name,
+         "history": [],
+         "detections": [],
+         "image": None,
+         "metrics": {},
+         "created_at": time.time()
+     }
+     st.session_state.active_session_id = session_id
+     return session_id
+
+ # Helper 2: Get active session data
+ def get_active_session():
+     if not st.session_state.active_session_id:
+         create_session()
+     return st.session_state.sessions[st.session_state.active_session_id]
+
+ # Ensure at least one session exists
+ if not st.session_state.sessions:
+     create_session()
+
+ current_session = get_active_session()
+
+ # --- Sidebar (Session Manager) ---
+ with st.sidebar:
+     st.markdown("### 🗂️ Sessions")
+
+     if st.button("➕ New Chat", use_container_width=True, type="primary"):
+         create_session()
+         st.rerun()
+
+     st.markdown("---")
+
+     # Sort sessions by recency
+     sorted_sessions = sorted(
+         st.session_state.sessions.items(),
+         key=lambda x: x[1]['created_at'],
+         reverse=True
+     )
+
+     for s_id, s_data in sorted_sessions:
+         # Hide empty "New Chat" sessions from the list unless active
+         if s_data['image'] is None:
+             continue
+
+         is_active = (s_id == st.session_state.active_session_id)
+
+         display_name = s_data['name']
+         icon = "📂" if is_active else "📝"
+         label = f"{icon} {display_name}"
+
+         if st.button(label, key=f"sess_{s_id}", use_container_width=True, type="secondary" if not is_active else "primary"):
+             st.session_state.active_session_id = s_id
+             st.rerun()
+
+ # --- Model Loading ---
+ if not st.session_state.model_loaded:
+     with st.spinner("Initializing AI Core..."):
+         processor, model = utils.load_model()
+         if processor and model:
+             st.session_state.model_loaded = True
+             st.session_state.processor = processor
+             st.session_state.model = model
+             st.rerun()
+         else:
+             st.error("Model Engine Failure.")
+             st.stop()
+
+ # --- Main Workspace ---
+
+ # Header
+ col_logo, col_space = st.columns([6, 1])
+ with col_logo:
+     if current_session['name'] == "New Chat":
+         st.markdown("# Annotation Assistant")
+     else:
+         st.markdown(f"# {current_session['name']}")
+
+ # Logic
+ if current_session['image'] is None:
+     # --- Upload State ---
+     st.markdown(
+         "<h3 style='text-align: center; color: #94A3B8; border: none;'>Upload an image to start this session</h3>",
+         unsafe_allow_html=True
+     )
+
+     uploaded_file = st.file_uploader(
+         "Upload Image",
+         type=["jpg", "png", "jpeg"],
+         key=f"uploader_{st.session_state.active_session_id}",
+         label_visibility="collapsed"
+     )
+
+     if uploaded_file:
+         image = Image.open(uploaded_file).convert("RGB")
+         current_session['image'] = image
+         current_session['name'] = uploaded_file.name
+         st.rerun()
+
+ else:
+     # --- Analysis State ---
+
+     # Image Controls
+     img_width = st.slider("Adjust View Size", 300, 1500, 700, 50, help="Drag to resize the image view")
+     st.markdown("<br>", unsafe_allow_html=True)
+
+     # 1. Main visual (Hero)
+     display_image = current_session['image'].copy()
+
+     if current_session['detections']:
+         display_image = utils.draw_boxes(display_image, current_session['detections'])
+
+     st.image(display_image, width=img_width)
+
+     # 2. Results Actions & Metrics
+     if current_session['detections']:
+         # Metrics Row
+         if current_session['metrics']:
+             m = current_session['metrics']
+             st.markdown(f"""
+             <div class='metric-pill'>
+                 <span>Inference <span class='metric-value'>{m.get('inference_time', 0)}s</span></span>
+                 <span style='color: #4B5563'>|</span>
+                 <span>Total <span class='metric-value'>{m.get('total_time', 0)}s</span></span>
+                 <span style='color: #4B5563'>|</span>
+                 <span>Tokens <span class='metric-value'>{m.get('token_count', 0)}</span></span>
+             </div>
+             """, unsafe_allow_html=True)
+
+         # Download Row
+         c1, c2, c3 = st.columns([1, 1, 3])  # Bias to left
+         with c1:
+             # Pass image metadata for strict COCO compatibility
+             coco_json = utils.convert_to_coco(
+                 current_session['detections'],
+                 image_size=current_session['image'].size,
+                 filename=current_session['name']
+             )
+             st.download_button("Download JSON", coco_json, "annotations.json", "application/json", use_container_width=True)
+         with c2:
+             zip_buffer = utils.create_crops_zip(current_session['image'], current_session['detections'])
+             st.download_button("Download ZIP", zip_buffer, "crops.zip", "application/zip", use_container_width=True)
+
+         # 3. Reasoning Stream (Below)
+         st.markdown("<div style='height: 20px;'></div>", unsafe_allow_html=True)
+         st.markdown("### AI Insights")
+         with st.container():
+             st.markdown("<div class='reasoning-container'>", unsafe_allow_html=True)
+             for det in current_session['detections'][::-1]:
+                 label = det.get('label', 'Object')
+                 reasoning = det.get('reasoning', None)
+                 if not reasoning: reasoning = "Object detected based on visual features."
+                 st.markdown(f"""
+                 <div class='reasoning-card'>
+                     <div class='reasoning-label'>{label}</div>
+                     <div class='reasoning-text'>{reasoning}</div>
+                 </div>
+                 """, unsafe_allow_html=True)
+             st.markdown("</div>", unsafe_allow_html=True)
+
+     else:
+         # Image loaded but no detections
+         st.markdown(
+             "<div style='text-align: center; margin-top: 20px; color: #64748B; font-style: italic;'>"
+             "Waiting for instructions... Use the chat bar below."
+             "</div>",
+             unsafe_allow_html=True
+         )
+
+ # --- Floating Chat Bar ---
+ st.markdown("<br>", unsafe_allow_html=True)
+ prompt = st.chat_input("Describe objects to detect...")
+
+ if prompt:
+     if current_session['image'] is None:
+         st.error("Please upload an image first.")
+     else:
+         with st.status("Analyzing Scene...", expanded=True) as status:
+             detections, updated_history, raw_text, metrics = utils.get_bounding_boxes(
+                 current_session['image'],
+                 prompt,
+                 current_session['history'],
+                 st.session_state.processor,
+                 st.session_state.model
+             )
+
+             if detections:
+                 current_session['detections'] = utils.smart_merge_detections(current_session['detections'], detections)
+                 current_session['history'] = updated_history
+                 current_session['metrics'] = metrics
+                 status.update(label="Complete", state="complete", expanded=False)
+                 st.rerun()
+             else:
+                 status.update(label="No matches found.", state="error", expanded=False)
+                 st.toast("No match found.", icon="⚠️")
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ streamlit
+ transformers
+ torch
+ accelerate
+ pillow
+ opencv-python-headless
+ pyngrok
+ numpy
+ qwen_vl_utils
+ einops
utils.py ADDED
@@ -0,0 +1,420 @@
+ import os
+ import torch
+ import numpy as np
+ import json
+ import time
+ import io
+ import zipfile
+ from PIL import Image, ImageDraw, ImageFont
+ from transformers import AutoProcessor, AutoModelForVision2Seq
+ import streamlit as st
+ import re
+
+ # Constants
+ MODEL_ID = "Qwen/Qwen3-VL-4B-Instruct"
+
+ @st.cache_resource
+ def load_model():
+     """
+     Loads the Qwen-VL model and processor.
+     """
+     print(f"Loading model: {MODEL_ID}...")
+     try:
+         processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+         model = AutoModelForVision2Seq.from_pretrained(
+             MODEL_ID,
+             device_map="auto",
+             trust_remote_code=True,
+             torch_dtype=torch.float16
+         )
+     except Exception as e:
+         print(f"Error loading {MODEL_ID}: {e}")
+         st.error(f"Could not load model {MODEL_ID}. Error: {e}")
+         return None, None
+
+     return processor, model
+
+ def get_bounding_boxes(image: Image.Image, prompt: str, history: list, processor, model):
+     """
+     Generates bounding boxes based on the image, prompt, and conversation history.
+     """
+     start_time = time.time()
+
+     if model is None or processor is None:
+         return [], history, "Model not loaded.", {}
+
+     # Construct conversation
+     messages = []
+
+     # Context
+     context_text = ""
+     if history:
+         context_text = "History:\n"
+         for msg in history:
+             role = "User" if msg['role'] == 'user' else "Assistant"
+             context_text += f"{role}: {msg['content']}\n"
+         context_text += "\n"
+
+     # Enhanced Prompt: JSON Focused With Reasoning
+     final_prompt = f"{context_text}User Request: {prompt}\n\nTask: Detect objects mentioned in the User Request.\nConstraint: Return the result ONLY as a JSON object with a key 'objects'.\nEach object in the list should have 'label', 'bbox' [x1, y1, x2, y2] (normalized coordinates 0-1000), AND 'reasoning' (a brief string explaining why this object matches).\nExample: {{'objects': [{{'label': 'cat', 'bbox': [100, 200, 500, 600], 'reasoning': 'Detected distinct feline features and whiskers.'}}]}}\nIf no objects are found, return {{'objects': []}}."
+
+     messages = [
+         {
+             "role": "system",
+             "content": "You are a precise object detection assistant. Return JSON with 'objects' list containing 'label', 'bbox' [x1, y1, x2, y2] (normalized coordinates 0-1000), and 'reasoning'."
+         },
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "image": image},
+                 {"type": "text", "text": final_prompt}
+             ]
+         }
+     ]
+
+     # Process inputs
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs = process_vision_info(messages)
+
+     # Default timing anchor in case preprocessing fails before generation starts
+     generate_start = time.time()
+
+     try:
+         inputs = processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt",
+         )
+         inputs = inputs.to(model.device)
+
+         # Generate (Measured)
+         generate_start = time.time()
+         generated_ids = model.generate(**inputs, max_new_tokens=512)
+         generate_end = time.time()
+
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         output_text = processor.batch_decode(
+             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+         )[0]
+
+     except Exception as e:
+         print(f"Inference Error: {e}")
+         output_text = f"Error: {e}"
+         generate_end = time.time()
+
+     # Update history
+     history.append({"role": "user", "content": prompt})
+     history.append({"role": "assistant", "content": output_text})
+
+     # Parse detections
+     detections = parse_qwen_output(output_text, image.width, image.height)
+
+     # Filter out boxes covering (almost) the whole image unless explicitly requested
+     filtered_detections = []
+     total_area = image.width * image.height
+
+     for det in detections:
+         x1, y1, x2, y2 = det['box']
+         box_area = (x2 - x1) * (y2 - y1)
+         coverage = box_area / total_area
+
+         is_suspicious_coverage = coverage > 0.95
+         is_whole_request = any(w in prompt.lower() for w in ["image", "picture", "photo", "background", "everything"])
+
+         if is_suspicious_coverage and not is_whole_request:
+             continue
+
+         filtered_detections.append(det)
+
+     # Metrics
+     end_time = time.time()
+     total_time = end_time - start_time
+     inference_time = generate_end - generate_start
+
+     metrics = {
+         "total_time": round(total_time, 2),
+         "inference_time": round(inference_time, 2),
+         "token_count": len(generated_ids[0]) if 'generated_ids' in locals() else 0
+     }
+
+     return filtered_detections, history, output_text, metrics
+
+ def smart_merge_detections(existing_detections, new_detections):
+     """
+     Merges new detections with existing ones.
+     Strategy: SIMPLE OVERLAP ONLY.
+     If IoU > 0.8 -> Assume duplicate/refinement -> Replace.
+     Else -> Keep.
+     """
+     merged_list = existing_detections.copy()
+
+     for new_det in new_detections:
+         new_box = new_det['box']
+         indices_to_remove = []
+
+         for i, old_det in enumerate(merged_list):
+             old_box = old_det['box']
+             iou = calculate_iou(new_box, old_box)
+
+             # Simple threshold check
+             if iou > 0.8:
+                 indices_to_remove.append(i)
+
+         for idx in sorted(indices_to_remove, reverse=True):
+             merged_list.pop(idx)
+
+         merged_list.append(new_det)
+
+     return merged_list
+
+ def calculate_iou(boxA, boxB):
+     xA = max(boxA[0], boxB[0])
+     yA = max(boxA[1], boxB[1])
+     xB = min(boxA[2], boxB[2])
+     yB = min(boxA[3], boxB[3])
+
+     interArea = max(0, xB - xA) * max(0, yB - yA)
+     boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
+     boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
+
+     iou = interArea / float(boxAArea + boxBArea - interArea + 1e-6)
+     return iou
+
+ def parse_qwen_output(text, width, height):
+     """
+     Parses Qwen-VL output, prioritizing JSON with reasoning.
+     """
+     detections = []
+
+     # 1. Try JSON Parsing (Primary Strategy)
+     try:
+         match = re.search(r'\{.*\}', text, re.DOTALL)
+         if match:
+             json_str = match.group()
+             data = json.loads(json_str)
+
+             if 'objects' in data and isinstance(data['objects'], list):
+                 for obj in data['objects']:
+                     x1, y1, x2, y2 = obj['bbox']
+                     label = obj.get('label', 'Object')
+                     reasoning = obj.get('reasoning', 'No reasoning provided')
+
+                     real_x1 = (x1 / 1000) * width
+                     real_y1 = (y1 / 1000) * height
+                     real_x2 = (x2 / 1000) * width
+                     real_y2 = (y2 / 1000) * height
+
+                     detections.append({
+                         "label": label,
+                         "box": [real_x1, real_y1, real_x2, real_y2],
+                         "score": 1.0,
+                         "reasoning": reasoning
+                     })
+     except Exception as e:
+         print(f"JSON Parse Error: {e}")
+         pass
+
+     # 2. Fallback to Standard Tags
+     if not detections:
+         pattern_standard = r"<\|box_start\|>(\d+),(\d+),(\d+),(\d+)<\|box_end\|>(?:<\|object_start\|>(.*?)<\|object_end\|>)?"
+         matches_standard = list(re.finditer(pattern_standard, text))
+         for match in matches_standard:
+             c1, c2, c3, c4 = map(int, match.groups()[:4])
+             label = match.group(5) if match.group(5) else "Object"
+             y1 = (c1 / 1000) * height
+             x1 = (c2 / 1000) * width
+             y2 = (c3 / 1000) * height
+             x2 = (c4 / 1000) * width
+             detections.append({
+                 "label": label,
+                 "box": [x1, y1, x2, y2],
+                 "score": 1.0,
+                 "reasoning": "Legacy detection mode"
+             })
+
+     return detections
+
+ def create_crops_zip(image: Image.Image, detections: list):
+     """
+     Creates a ZIP file containing cropped images of all detections.
+     """
+     zip_buffer = io.BytesIO()
+
+     # Ensure distinct filenames
+     counts = {}
+
+     with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+         for i, det in enumerate(detections):
+             label = det.get('label', 'object').replace(" ", "_").lower()
+             if label not in counts:
+                 counts[label] = 1
+             else:
+                 counts[label] += 1
+             label = f"{label}_{counts[label]}"
+
+             x1, y1, x2, y2 = map(int, det['box'])
+             x1 = max(0, x1)
+             y1 = max(0, y1)
+             x2 = min(image.width, x2)
+             y2 = min(image.height, y2)
+
+             if x2 > x1 and y2 > y1:
+                 crop = image.crop((x1, y1, x2, y2))
+                 crop_buffer = io.BytesIO()
+                 crop.save(crop_buffer, format="JPEG")
+                 zip_file.writestr(f"{label}.jpg", crop_buffer.getvalue())
+
+     zip_buffer.seek(0)
+     return zip_buffer
+
+ def process_vision_info(messages):
+     try:
+         from qwen_vl_utils import process_vision_info
+         return process_vision_info(messages)
+     except ImportError:
+         images = []
+         for msg in messages:
+             # Skip plain-string contents (e.g. the system message)
+             if not isinstance(msg["content"], list):
+                 continue
+             for item in msg["content"]:
+                 if item["type"] == "image":
+                     images.append(item["image"])
+         return images, None
+
+ def draw_boxes(image: Image.Image, detections: list):
+     """
+     Draws bounding boxes with dynamic font scaling.
+     """
+     draw = ImageDraw.Draw(image)
+
+     # Dynamic scaling for better visibility
+     min_dim = min(image.width, image.height)
+     scaled_font_size = max(20, int(min_dim * 0.035))
+     scaled_line_width = max(4, int(min_dim * 0.006))
+
+     font = None
+     try:
+         font_names = ["arial.ttf", "LiberationSans-Regular.ttf", "DejaVuSans.ttf"]
+         for fn in font_names:
+             try:
+                 font = ImageFont.truetype(fn, scaled_font_size)
+                 break
+             except:
+                 continue
+     except:
+         pass
+
+     if font is None:
+         try:
+             font = ImageFont.load_default()
+         except:
+             pass
+
+     palette = [
+         "#FF00FF", "#00FFFF", "#FF0000", "#00FF00",
+         "#FFFF00", "#FFA500", "#800080", "#008080"
+     ]
+
+     def get_color(text):
+         if not text: return palette[0]
+         idx = sum(ord(c) for c in text) % len(palette)
+         return palette[idx]
+
+     for det in detections:
+         box = det['box']
+         label = det.get('label', 'Object')
+         score_val = det.get('score', 1.0)
+         display_text = f"{label} {int(score_val*100)}%"
+
+         color = get_color(label)
+
+         x1, y1, x2, y2 = box
+         draw.rectangle([x1, y1, x2, y2], outline=color, width=scaled_line_width)
+
+         # Text box
+         if font:
+             text_bbox = draw.textbbox((x1, y1), display_text, font=font)
+             text_width = text_bbox[2] - text_bbox[0]
+             text_height = text_bbox[3] - text_bbox[1]
+
+             label_y = y1 - text_height - (scaled_line_width * 2)
+             if label_y < 0: label_y = y1
+
+             draw.rectangle(
+                 [x1, label_y, x1 + text_width + (scaled_line_width * 4), label_y + text_height + (scaled_line_width * 2)],
+                 fill=color
+             )
+             draw.text((x1 + (scaled_line_width), label_y), display_text, fill="black", font=font)
+
+     return image
+
+ def convert_to_coco(detections, image_size=(1000, 1000), filename="image.jpg"):
+     """
+     Converts detections to full Standard COCO JSON format.
+     """
+     width, height = image_size
+
+     # 1. Info
+     info = {
+         "year": 2025,
+         "version": "1.0",
+         "description": "Generated by Annotation Assistant (Qwen-VL)",
+         "date_created": time.strftime("%Y-%m-%d")
+     }
+
+     # 2. Images
+     images = [{
+         "id": 1,
+         "width": width,
+         "height": height,
+         "file_name": filename,
+         "license": 0,
+         "flickr_url": "",
+         "coco_url": "",
+         "date_captured": 0
+     }]
+
+     # 3. Categories & Annotations
+     categories = []
+     category_map = {}
+     annotations = []
+     cat_id_counter = 1
+
+     for i, det in enumerate(detections):
+         label = det.get('label', 'object')
+
+         # Manage Categories
+         if label not in category_map:
+             category_map[label] = cat_id_counter
+             categories.append({
+                 "id": cat_id_counter,
+                 "name": label,
+                 "supercategory": "object"
+             })
+             cat_id_counter += 1
+
+         x1, y1, x2, y2 = det['box']
+         w = x2 - x1
+         h = y2 - y1
+
+         ann = {
+             "id": i + 1,
+             "image_id": 1,
+             "category_id": category_map[label],
+             "bbox": [round(x1, 2), round(y1, 2), round(w, 2), round(h, 2)],
+             "area": round(w * h, 2),
+             "iscrowd": 0,
+             "attributes": {
+                 "reasoning": det.get('reasoning', '')
+             }
+         }
+         annotations.append(ann)
+
+     coco_output = {
+         "info": info,
+         "images": images,
+         "annotations": annotations,
+         "categories": categories,
+         "licenses": []
+     }
+
+     return json.dumps(coco_output, indent=2)
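
Downstream, the file exported by `convert_to_coco` is ordinary COCO JSON, so it can be read back with the standard library. A minimal sketch of consuming that structure, assuming it was saved under `annotations.json` (the default name used by the Download button in `app.py`):

```python
# Minimal sketch: read the COCO JSON exported by the app and print each box.
import json

with open("annotations.json") as f:
    coco = json.load(f)

# Map category ids back to labels
id_to_name = {c["id"]: c["name"] for c in coco["categories"]}

for ann in coco["annotations"]:
    x, y, w, h = ann["bbox"]                      # COCO-style [x, y, width, height]
    label = id_to_name[ann["category_id"]]
    reasoning = ann["attributes"].get("reasoning", "")
    print(f"{label}: ({x}, {y}, {w}, {h}) - {reasoning}")
```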