Spaces:

maxiw
/

Qwen2-VL-Detection

Running on Zero

App Files Community

maxiw committed on Sep 2, 2024

Commit

64036af

1 Parent(s): 828c61d

WIP fix image loading

Browse files

Files changed (2) hide show

app.py +14 -5
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -1,13 +1,15 @@
 import gradio as gr
 import spaces
-from transformers import AutoModelForCausalLM, AutoProcessor
 import torch
 from PIL import Image
-import subprocess
 models = {
-    "Qwen/Qwen2-VL-7B-Instruct": AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto")
 }
 processors = {
@@ -17,6 +19,13 @@ processors = {
 DESCRIPTION = "# Qwen2-VL Object Localization Demo"
 @spaces.GPU
 def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
     model = models[model_id].eval().cuda()
@@ -26,7 +35,7 @@ def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
         {
             "role": "user",
             "content": [
-                {"type": "image", "image": image},
                 {"type": "text", "text": f"Give a bounding box for {text_input}"},
             ],
         }
@@ -67,7 +76,7 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab(label="Qwen2-VL Input"):
         with gr.Row():
             with gr.Column():
-                input_img = gr.Image(label="Input Picture")
                 model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
                 text_input = gr.Textbox(label="Description of Localization Target")
                 submit_btn = gr.Button(value="Submit")

 import gradio as gr
 import spaces
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
 import torch
+import base64
 from PIL import Image
+from io import BytesIO
 models = {
+    "Qwen/Qwen2-VL-7B-Instruct": Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") #, torch_dtype="auto", device_map="auto")
 }
 processors = {
 DESCRIPTION = "# Qwen2-VL Object Localization Demo"
+def image_to_base64(image):
+    buffered = BytesIO()
+    image.save(buffered, format="PNG")  # Save the image in memory as PNG
+    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")  # Encode image to base64
+    return img_str
 @spaces.GPU
 def run_example(image, text_input, model_id="Qwen/Qwen2-VL-7B-Instruct"):
     model = models[model_id].eval().cuda()
         {
             "role": "user",
             "content": [
+                {"type": "image", "image": f"data:image;base64,{image_to_base64(image)}"},
                 {"type": "text", "text": f"Give a bounding box for {text_input}"},
             ],
         }
     with gr.Tab(label="Qwen2-VL Input"):
         with gr.Row():
             with gr.Column():
+                input_img = gr.Image(label="Input Picture", type="pil")
                 model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="Qwen/Qwen2-VL-7B-Instruct")
                 text_input = gr.Textbox(label="Description of Localization Target")
                 submit_btn = gr.Button(value="Submit")

requirements.txt CHANGED Viewed

@@ -3,6 +3,6 @@ Pillow==10.3.0
 Requests==2.31.0
 torch
 torchvision
-transformers
 accelerate==0.30.0
 qwen-vl-utils

 Requests==2.31.0
 torch
 torchvision
+git+https://github.com/huggingface/transformers.git@main
 accelerate==0.30.0
 qwen-vl-utils