sergiopaniego (HF Staff) and ariG23498 (HF Staff) committed
Commit c785294 · verified · 1 Parent(s): e80a30e

[Update] Demo updated with suggestive prompting, and pinning of packages (#5)


- update demo with suggestive prompting (8b69117e3279dc7d26728fe43d34ee22ce575434)
- pinning the libraries (5d44beee15caad5daf575c663b125e6d8ec12294)


Co-authored-by: Aritra Roy Gosthipaty <[email protected]>

Files changed (2):
  1. app.py (+261 -348)
  2. requirements.txt (+7 -10)
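In short, the "suggestive prompting" change asks Moondream for a small list of objects in the uploaded image and surfaces them as clickable suggestions that pre-fill the prompt for the Point and Detect tasks. Below is a minimal sketch of that flow, condensed from the new app.py further down; the moondream.query call and the Gradio wiring mirror this commit, while the helper names here are only illustrative.

import ast
import gradio as gr

def suggest_objects(moondream, image):
    # Ask Moondream for a Python-style list of objects, keep at most three suggestions.
    result = moondream.query(
        image=image,
        question="What objects are in the image, provide the list.",
        reasoning=False,
    )
    objects = ast.literal_eval(result["answer"])
    return objects[:3] if isinstance(objects, list) else []

def fill_prompt(selected_object):
    # Selecting a suggestion in the gr.Radio pre-fills the prompt textbox.
    return gr.Textbox(value=selected_object or "")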
app.py CHANGED
@@ -1,227 +1,140 @@
-import json
-import time
-
 import gradio as gr
-import numpy as np
 from gradio.themes.ocean import Ocean
-from PIL import Image
-from qwen_vl_utils import process_vision_info
 from transformers import (
     AutoModelForCausalLM,
-    AutoProcessor,
     Qwen3VLForConditionalGeneration,
 )
-
 from spaces import GPU
-import supervision as sv

-model_qwen_id = "Qwen/Qwen3-VL-4B-Instruct"
-model_moondream_id = "moondream/moondream3-preview"

-model_qwen = Qwen3VLForConditionalGeneration.from_pretrained(
-    model_qwen_id, torch_dtype="auto", device_map="auto",
-)
-model_moondream = AutoModelForCausalLM.from_pretrained(
-    model_moondream_id,
     trust_remote_code=True,
-    device_map={"": "cuda"},
 )


-def extract_model_short_name(model_id):
-    return model_id.split("/")[-1].replace("-", " ").replace("_", " ")
-
-
-model_qwen_name = extract_model_short_name(model_qwen_id)
-model_moondream_name = extract_model_short_name(model_moondream_id)
-
-
-processor_qwen = AutoProcessor.from_pretrained(model_qwen_id)
-
-
-def create_annotated_image(image, json_data, height, width):
     try:
-        parsed_json_data = json_data.split("```json")[1].split("```")[0]
-        bbox_data = json.loads(parsed_json_data)
     except Exception:
-        return image

-    original_width, original_height = image.size
-    x_scale = original_width / width
-    y_scale = original_height / height
-
-    points = []
-    point_labels = []
-
-    for item in bbox_data:
-        label = item.get("label", "")
-        if "point_2d" in item:
-            x, y = item["point_2d"]
-            scaled_x = int(x * x_scale)
-            scaled_y = int(y * y_scale)
-            points.append([scaled_x, scaled_y])
-            point_labels.append(label)
-
-    annotated_image = np.array(image.convert("RGB"))
-
-    detections = sv.Detections.from_vlm(vlm = sv.VLM.QWEN_2_5_VL,
-                                        result=json_data,
-                                        input_wh=(original_width,
-                                                  original_height),
-                                        resolution_wh=(original_width,
-                                                       original_height))
-    bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
-    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
-
-    annotated_image = bounding_box_annotator.annotate(
-        scene=annotated_image, detections=detections
-    )
-    annotated_image = label_annotator.annotate(
-        scene=annotated_image, detections=detections
-    )

-    if points:
-        points_array = np.array(points).reshape(1, -1, 2)
-        key_points = sv.KeyPoints(xy=points_array)
-        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE)
-        # vertex_label_annotator = sv.VertexLabelAnnotator(text_scale=0.5, border_radius=2)

-        annotated_image = vertex_annotator.annotate(
-            scene=annotated_image, key_points=key_points
         )
-
-    # annotated_image = vertex_label_annotator.annotate(
-    #     scene=annotated_image,
-    #     key_points=key_points,
-    #     labels=point_labels
-    # )
-
-    return Image.fromarray(annotated_image)


-def create_annotated_image_normalized(image, json_data, label="object"):
-    if not isinstance(json_data, dict):
-        return image

     original_width, original_height = image.size
-    annotated_image = np.array(image.convert("RGB"))

-    points = []
-    if "points" in json_data:
-        for point in json_data.get("points", []):
             x = int(point["x"] * original_width)
             y = int(point["y"] * original_height)
-            points.append([x, y])

-    if "reasoning" in json_data:
-        for grounding in json_data["reasoning"].get("grounding", []):
-            for x_norm, y_norm in grounding.get("points", []):
-                x = int(x_norm * original_width)
-                y = int(y_norm * original_height)
-                points.append([x, y])

-    if points:
-        points_array = np.array(points).reshape(1, -1, 2)
         key_points = sv.KeyPoints(xy=points_array)
-        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
         annotated_image = vertex_annotator.annotate(
-            scene=annotated_image, key_points=key_points
-        )
-
-    if "objects" in json_data:
-        detections = sv.Detections.from_vlm(sv.VLM.MOONDREAM, json_data,
-                                            resolution_wh=(original_width,
-                                                           original_height))
-
-        bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
-        label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
-
-        labels = [label for _ in detections.xyxy]
-
-        annotated_image = bounding_box_annotator.annotate(
-            scene=annotated_image, detections=detections
         )
-        annotated_image = label_annotator.annotate(
-            scene=annotated_image, detections=detections, labels=labels
         )

-    return Image.fromarray(annotated_image)
-
-
-def parse_qwen3_json(json_output):
-    lines = json_output.splitlines()
-    for i, line in enumerate(lines):
-        if line == "```json":
-            json_output = "\n".join(lines[i+1:])
-            json_output = json_output.split("```")[0]
-            break
-
-    try:
-        boxes = json.loads(json_output)
-    except json.JSONDecodeError:
-        end_idx = json_output.rfind('"}') + len('"}')
-        truncated_text = json_output[:end_idx] + "]"
-        boxes = json.loads(truncated_text)
-
-    if not isinstance(boxes, list):
-        boxes = [boxes]
-
-    return boxes
-

-def create_annotated_image_qwen3(image, json_output):
-    try:
-        boxes = parse_qwen3_json(json_output)
-    except Exception as e:
-        print(f"Error parsing JSON: {e}")
-        return image
-
-    if not boxes:
-        return image
-
-    original_width, original_height = image.size
-    annotated_image = np.array(image.convert("RGB"))
-
-    xyxy = []
-    labels = []
-
-    for box in boxes:
-        if "bbox_2d" in box and "label" in box:
-            x1, y1, x2, y2 = box["bbox_2d"]
-            scale = 1000
-            x1 = max(0, min(scale, x1)) / scale * original_width
-            y1 = max(0, min(scale, y1)) / scale * original_height
-            x2 = max(0, min(scale, x2)) / scale * original_width
-            y2 = max(0, min(scale, y2)) / scale * original_height
-            # Ensure x1 <= x2 and y1 <= y2
-            if x1 > x2: x1, x2 = x2, x1
-            if y1 > y2: y1, y2 = y2, y1
-            xyxy.append([int(x1), int(y1), int(x2), int(y2)])
-            labels.append(box["label"])
-
-    if not xyxy:
-        return image
-
-    detections = sv.Detections(
-        xyxy=np.array(xyxy),
-        class_id=np.arange(len(xyxy))
-    )
-
-    bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
-    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
-
-    annotated_image = bounding_box_annotator.annotate(
-        scene=annotated_image, detections=detections
-    )
-    annotated_image = label_annotator.annotate(
-        scene=annotated_image, detections=detections, labels=labels
-    )
-
-    return Image.fromarray(annotated_image)


-@GPU
-def detect_qwen(image, prompt):
     messages = [
         {
             "role": "user",
@@ -231,75 +144,132 @@ def detect_qwen(image, prompt):
         ],
         }
     ]
-
-    t0 = time.perf_counter()
-    inputs = processor_qwen.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_dict=True,
-        return_tensors="pt"
-    ).to(model_qwen.device)

-    generated_ids = model_qwen.generate(**inputs, max_new_tokens=1024)
     generated_ids_trimmed = [
         out_ids[len(in_ids) :]
         for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
-    output_text = processor_qwen.batch_decode(
         generated_ids_trimmed,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
     )[0]
-    elapsed_ms = (time.perf_counter() - t0) * 1_000

-    annotated_image = create_annotated_image_qwen3(image, output_text)

-    time_taken = f"**Inference time ({model_qwen_name}):** {elapsed_ms:.0f} ms"
-
-    return annotated_image, output_text, time_taken


 @GPU
-def detect_moondream(image, prompt, category_input):
-    t0 = time.perf_counter()
-    if category_input in ["Object Detection", "Visual Grounding + Object Detection"]:
-        output_text = model_moondream.detect(image=image, object=prompt)
-    elif category_input == "Visual Grounding + Keypoint Detection":
-        output_text = model_moondream.point(image=image, object=prompt)
     else:
-        output_text = model_moondream.query(
-            image=image, question=prompt, reasoning=True
-        )
-    elapsed_ms = (time.perf_counter() - t0) * 1_000

-    annotated_image = create_annotated_image_normalized(
-        image=image, json_data=output_text, label="object"
-    )

-    time_taken = f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
-    return annotated_image, output_text, time_taken


-def detect(image, prompt_model_1, prompt_model_2, category_input):
-    STANDARD_SIZE = (1024, 1024)
-    image.thumbnail(STANDARD_SIZE)

-    annotated_image_model_1, output_text_model_1, timing_1 = detect_qwen(
-        image, prompt_model_1
-    )
-    annotated_image_model_2, output_text_model_2, timing_2 = detect_moondream(
-        image, prompt_model_2, category_input
-    )

-    return (
-        annotated_image_model_1,
-        output_text_model_1,
-        timing_1,
-        annotated_image_model_2,
-        output_text_model_2,
-        timing_2,
-    )


 css_hide_share = """
@@ -308,6 +278,7 @@ button#gradio-share-link-button-0 {
 }
 """

 with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
     gr.Markdown("# 👓 Object Understanding with Vision Language Models")
     gr.Markdown(
@@ -319,130 +290,72 @@ with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
     """)

     with gr.Row():
-        with gr.Column(scale=2):
-            image_input = gr.Image(label="Upload an image", type="pil", height=400)
-            prompt_input_model_1 = gr.Textbox(
-                label=f"Enter your prompt for {model_qwen_name}",
-                placeholder="e.g., Detect all red cars in the image",
             )
-
-            prompt_input_model_2 = gr.Textbox(
-                label=f"Enter your prompt for {model_moondream_name}",
-                placeholder="e.g., Detect all blue cars in the image",
             )
-
-            categories = [
-                "Object Detection",
-                "Object Counting",
-                "Visual Grounding + Keypoint Detection",
-                "Visual Grounding + Object Detection",
-                "General query",
-            ]
-
-            category_input = gr.Dropdown(
-                choices=categories, label="Category", interactive=True
             )
-            generate_btn = gr.Button(value="Generate")

-        with gr.Column(scale=1):
-            output_image_model_1 = gr.Image(
-                type="pil", label=f"Annotated image for {model_qwen_name}", height=400
-            )
-            output_textbox_model_1 = gr.Textbox(
-                label=f"Model response for {model_qwen_name}", lines=10
-            )
-            output_time_model_1 = gr.Markdown()

-        with gr.Column(scale=1):
-            output_image_model_2 = gr.Image(
-                type="pil",
-                label=f"Annotated image for {model_moondream_name}",
-                height=400,
-            )
-            output_textbox_model_2 = gr.Textbox(
-                label=f"Model response for {model_moondream_name}", lines=10
-            )
-            output_time_model_2 = gr.Markdown()
-
-    gr.Markdown("### Examples")
-    example_prompts = [
-        [
-            "examples/example_1.jpg",
-            "locate every instance in the image. Report bbox coordinates in JSON format.",
-            "objects",
-            "Object Detection",
-        ],
-        [
-            "examples/example_2.JPG",
-            'locate every instance that belongs to the following categories: "candy, hand". Report bbox coordinates in JSON format.',
-            "candies",
-            "Object Detection",
-        ],
-        [
-            "examples/example_1.jpg",
-            "Count the number of red cars in the image.",
-            "Count the number of red cars in the image.",
-            "Object Counting",
-        ],
-        [
-            "examples/example_2.JPG",
-            "Count the number of blue candies in the image.",
-            "Count the number of blue candies in the image.",
-            "Object Counting",
-        ],
-        [
-            "examples/example_1.jpg",
-            'locate every instance that belongs to the following categories: "red car". Report bbox coordinates in JSON format..',
-            "red cars",
-            "Visual Grounding + Keypoint Detection",
-        ],
-        [
-            "examples/example_2.JPG",
-            "Identify the blue candies in this image, detect their key points and return their positions in the form of points.",
-            "blue candies",
-            "Visual Grounding + Keypoint Detection",
-        ],
-        [
-            "examples/example_1.jpg",
-            'locate every instance that belongs to the following categories: "leading red car". Report bbox coordinates in JSON format..',
-            "leading red car",
-            "Visual Grounding + Object Detection",
-        ],
-        [
-            "examples/example_2.JPG",
-            'locate every instance that belongs to the following categories: "blue candy located at the top of the group". Report bbox coordinates in JSON format.',
-            "blue candy located at the top of the group",
-            "Visual Grounding + Object Detection",
-        ],
-    ]
     gr.Examples(
-        examples=example_prompts,
-        inputs=[
-            image_input,
-            prompt_input_model_1,
-            prompt_input_model_2,
-            category_input,
-        ],
-        label="Click an example to populate the input",
     )

-    generate_btn.click(
-        fn=detect,
-        inputs=[
-            image_input,
-            prompt_input_model_1,
-            prompt_input_model_2,
-            category_input,
-        ],
-        outputs=[
-            output_image_model_1,
-            output_textbox_model_1,
-            output_time_model_1,
-            output_image_model_2,
-            output_textbox_model_2,
-            output_time_model_2,
-        ],
     )

 if __name__ == "__main__":
-    demo.launch()
The hunks above show the left-hand (old) side of the diff: lines removed from app.py, prefixed with "-", plus unchanged context. The right-hand (new) side of app.py follows, with added lines prefixed with "+":

 import gradio as gr
 from gradio.themes.ocean import Ocean
+import torch
+import numpy as np
+import supervision as sv
 from transformers import (
     AutoModelForCausalLM,
     Qwen3VLForConditionalGeneration,
+    Qwen3VLProcessor,
 )
+import json
+import ast
+import re
+from PIL import Image
 from spaces import GPU

+# --- Constants and Configuration ---
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = "auto"

+CATEGORIES = ["Query", "Caption", "Point", "Detect"]
+PLACEHOLDERS = {
+    "Query": "What's in this image?",
+    "Caption": "Enter caption length: short, normal, or long",
+    "Point": "Select an object from suggestions or enter manually",
+    "Detect": "Select an object from suggestions or enter manually",
+}
+
+# --- Model Loading ---
+# Load Moondream
+moondream = AutoModelForCausalLM.from_pretrained(
+    "moondream/moondream3-preview",
     trust_remote_code=True,
+    dtype=DTYPE,
+    device_map=DEVICE,
+    revision="main",
+).eval()
+
+# Load Qwen3-VL
+qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen3-VL-4B-Instruct",
+    dtype=DTYPE,
+    device_map=DEVICE,
+).eval()
+qwen_processor = Qwen3VLProcessor.from_pretrained(
+    "Qwen/Qwen3-VL-4B-Instruct",
 )


+# --- Utility Functions ---
+def safe_parse_json(text: str):
+    text = text.strip()
+    text = re.sub(r"^```(json)?", "", text)
+    text = re.sub(r"```$", "", text)
+    text = text.strip()
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        pass
     try:
+        return ast.literal_eval(text)
     except Exception:
+        return {}


+@GPU
+def get_suggested_objects(image: Image.Image):
+    """Get suggested objects in the image using Moondream"""
+    if image is None:
+        return []

+    try:
+        result = moondream.query(
+            image=image,
+            question="What objects are in the image, provide the list.",
+            reasoning=False,
         )
+        suggested_objects = ast.literal_eval(result["answer"])
+        if isinstance(suggested_objects, list):
+            if len(suggested_objects) > 3:  # send not more than 3 suggestions
+                return suggested_objects[:3]
+            else:
+                suggested_objects
+        return []
+    except Exception as e:
+        print(f"Error getting suggestions: {e}")
+        return []


+def annotate_image(image: Image.Image, result: dict):
+    if not isinstance(image, Image.Image):
+        return image  # Return original if not a valid image
+    if not isinstance(result, dict):
+        return image  # Return original if result is not a dict

     original_width, original_height = image.size

+    # Handle Point annotations
+    if "points" in result and result["points"]:
+        points_list = []
+        for point in result.get("points", []):
             x = int(point["x"] * original_width)
             y = int(point["y"] * original_height)
+            points_list.append([x, y])

+        if not points_list:
+            return image

+        points_array = np.array(points_list).reshape(1, -1, 2)
         key_points = sv.KeyPoints(xy=points_array)
+        vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
         annotated_image = vertex_annotator.annotate(
+            scene=image.copy(), key_points=key_points
         )
+        return annotated_image
+
+    # Handle Detection annotations
+    if "objects" in result and result["objects"]:
+        detections = sv.Detections.from_vlm(
+            sv.VLM.MOONDREAM,
+            result,
+            resolution_wh=image.size,
         )
+        if len(detections) == 0:
+            return image

+        box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=5)
+        annotated_scene = box_annotator.annotate(
+            scene=image.copy(), detections=detections
+        )
+        return annotated_scene

+    return image


+# --- Inference Functions ---
+def run_qwen_inference(image: Image.Image, prompt: str):
     messages = [
         {
             "role": "user",

         ],
         }
     ]
+    inputs = qwen_processor.apply_chat_template(
         messages,
         tokenize=True,
         add_generation_prompt=True,
         return_dict=True,
+        return_tensors="pt",
+    ).to(DEVICE)
+
+    with torch.inference_mode():
+        generated_ids = qwen_model.generate(
+            **inputs,
+            max_new_tokens=512,
+        )

     generated_ids_trimmed = [
         out_ids[len(in_ids) :]
         for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
+    output_text = qwen_processor.batch_decode(
         generated_ids_trimmed,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
     )[0]
+    return output_text


+@GPU
+def process_qwen(image: Image.Image, category: str, prompt: str):
+    if category == "Query":
+        return run_qwen_inference(image, prompt), {}
+    elif category == "Caption":
+        full_prompt = f"Provide a {prompt} length caption for the image."
+        return run_qwen_inference(image, full_prompt), {}
+    elif category == "Point":
+        full_prompt = (
+            f"Provide 2d point coordinates for {prompt}. Report in JSON format."
+        )
+        output_text = run_qwen_inference(image, full_prompt)
+        parsed_json = safe_parse_json(output_text)
+        points_result = {"points": []}
+        if isinstance(parsed_json, list):
+            for item in parsed_json:
+                if "point_2d" in item and len(item["point_2d"]) == 2:
+                    x, y = item["point_2d"]
+                    points_result["points"].append({"x": x / 1000.0, "y": y / 1000.0})
+        return json.dumps(points_result, indent=2), points_result
+    elif category == "Detect":
+        full_prompt = (
+            f"Provide bounding box coordinates for {prompt}. Report in JSON format."
+        )
+        output_text = run_qwen_inference(image, full_prompt)
+        parsed_json = safe_parse_json(output_text)
+        objects_result = {"objects": []}
+        if isinstance(parsed_json, list):
+            for item in parsed_json:
+                if "bbox_2d" in item and len(item["bbox_2d"]) == 4:
+                    xmin, ymin, xmax, ymax = item["bbox_2d"]
+                    objects_result["objects"].append(
+                        {
+                            "x_min": xmin / 1000.0,
+                            "y_min": ymin / 1000.0,
+                            "x_max": xmax / 1000.0,
+                            "y_max": ymax / 1000.0,
+                        }
+                    )
+        return json.dumps(objects_result, indent=2), objects_result
+    return "Invalid category", {}


 @GPU
+def process_moondream(image: Image.Image, category: str, prompt: str):
+    if category == "Query":
+        result = moondream.query(image=image, question=prompt)
+        return result["answer"], {}
+    elif category == "Caption":
+        result = moondream.caption(image, length=prompt)
+        return result["caption"], {}
+    elif category == "Point":
+        result = moondream.point(image, prompt)
+        return json.dumps(result, indent=2), result
+    elif category == "Detect":
+        result = moondream.detect(image, prompt)
+        return json.dumps(result, indent=2), result
+    return "Invalid category", {}
+
+
+# --- Gradio Interface Logic ---
+def on_category_and_image_change(image, category):
+    """Generate suggestions when category changes to Point or Detect"""
+    text_box = gr.Textbox(value="", placeholder=PLACEHOLDERS.get(category, ""), interactive=True)
+
+    if image is None or category not in ["Point", "Detect", "Caption"]:
+        return gr.Radio(choices=[], visible=False), text_box
+
+    if category == "Caption":
+        return gr.Radio(choices=["short", "normal", "long"], visible=True), text_box
+
+    suggestions = get_suggested_objects(image)
+    if suggestions:
+        return gr.Radio(choices=suggestions, visible=True, interactive=True), text_box
     else:
+        return gr.Radio(choices=["no choice possible"], visible=True, interactive=True), text_box


+def update_prompt_from_radio(selected_object):
+    """Update prompt textbox when a radio option is selected"""
+    if selected_object:
+        return gr.Textbox(value=selected_object)
+    return gr.Textbox(value="")


+def process_inputs(image, category, prompt):
+    if image is None:
+        raise gr.Error("Please upload an image.")
+    if not prompt:
+        raise gr.Error("Please provide a prompt.")

+    # Process with Qwen
+    qwen_text, qwen_data = process_qwen(image, category, prompt)
+    qwen_annotated_image = annotate_image(image, qwen_data)

+    # Process with Moondream
+    moondream_text, moondream_data = process_moondream(image, category, prompt)
+    moondream_annotated_image = annotate_image(image, moondream_data)
+
+    return qwen_annotated_image, qwen_text, moondream_annotated_image, moondream_text


 css_hide_share = """

 }
 """

+# --- Gradio UI Layout ---
 with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
     gr.Markdown("# 👓 Object Understanding with Vision Language Models")
     gr.Markdown(

     """)

     with gr.Row():
+        with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="Input Image")
+            category_select = gr.Radio(
+                choices=CATEGORIES,
+                value=CATEGORIES[0],
+                label="Select Task Category",
+                interactive=True,
             )
+            # Suggested objects radio (hidden by default)
+            suggestions_radio = gr.Radio(
+                choices=[],
+                label="Suggestions",
+                visible=False,
+                interactive=True,
             )
+            prompt_input = gr.Textbox(
+                placeholder=PLACEHOLDERS[CATEGORIES[0]],
+                label="Prompt",
+                lines=2,
             )

+            submit_btn = gr.Button("Compare Models", variant="primary")
+
+        with gr.Column(scale=2):
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("### Qwen/Qwen3-VL-4B-Instruct")
+                    qwen_img_output = gr.Image(label="Annotated Image")
+                    qwen_text_output = gr.Textbox(
+                        label="Text Output", lines=8, interactive=False
+                    )
+                with gr.Column():
+                    gr.Markdown("### moondream/moondream3-preview")
+                    moon_img_output = gr.Image(label="Annotated Image")
+                    moon_text_output = gr.Textbox(
+                        label="Text Output", lines=8, interactive=False
+                    )

     gr.Examples(
+        examples=[
+            ["examples/example_1.jpg", "Query", "How many cars are in the image?"],
+            ["examples/example_1.jpg", "Caption", ""],
+            ["examples/example_2.JPG", "Point", ""],
+            ["examples/example_2.JPG", "Detect", ""],
+        ],
+        inputs=[image_input, category_select, prompt_input],
     )

+    # --- Event Listeners ---
+    category_select.change(
+        fn=on_category_and_image_change,
+        inputs=[image_input, category_select],
+        outputs=[suggestions_radio, prompt_input],
+    )
+
+    suggestions_radio.change(
+        fn=update_prompt_from_radio,
+        inputs=[suggestions_radio],
+        outputs=[prompt_input],
+    )
+
+    submit_btn.click(
+        fn=process_inputs,
+        inputs=[image_input, category_select, prompt_input],
+        outputs=[qwen_img_output, qwen_text_output, moon_img_output, moon_text_output],
     )

 if __name__ == "__main__":
+    demo.launch()
requirements.txt CHANGED
@@ -1,10 +1,7 @@
-torch
-transformers
-datasets
-Pillow
-gradio
-accelerate
-qwen-vl-utils
-torchvision
-matplotlib
-supervision
+torch==2.8.0
+transformers==4.57.0
+Pillow==11.3.0
+gradio==5.49.1
+accelerate==1.10.1
+torchvision==0.23.0
+supervision==0.26.1