akhaliq (HF Staff) committed
Commit 51b8121 · verified · 1 Parent(s): d30dbca

Update Gradio app with multiple files

Files changed (2)
  1. app.py +118 -105
  2. requirements.txt +2 -0
app.py CHANGED
@@ -2,9 +2,6 @@ import gradio as gr
 from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
 import torch
 from PIL import Image
-import io
-import base64
-
 import spaces
 
 # Load model and processor
@@ -15,61 +12,71 @@ model = Qwen3VLForConditionalGeneration.from_pretrained(
 )
 processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-2B-Instruct")
 
-def process_image(image):
-    """Convert image to base64 string for processing"""
-    if isinstance(image, str):
-        return image
-    if isinstance(image, Image.Image):
-        buffered = io.BytesIO()
-        image.save(buffered, format="PNG")
-        img_str = base64.b64encode(buffered.getvalue()).decode()
-        return f"data:image/png;base64,{img_str}"
-    return image
-
 @spaces.GPU(duration=120)
-def qwen_chat(message, image, chat_history):
+def qwen_chat_fn(message, history):
     """
-    Process chat message with optional image input
+    Process chat messages with multimodal support
 
     Args:
-        message (str): User's text message
-        image: Optional image input
-        chat_history (list): Previous conversation history
+        message (dict): Contains 'text' and 'files' keys
+        history (list): Chat history in messages format
 
     Returns:
-        tuple: Updated chat history and empty message input
+        str: Assistant response
     """
-    if not message and image is None:
-        return chat_history, ""
+    # Extract text and files from the message
+    text = message.get("text", "")
+    files = message.get("files", [])
 
-    # Build messages list
+    # Build messages list for the model
     messages = []
 
     # Add previous chat history
-    for user_msg, assistant_msg in chat_history:
-        messages.append({"role": "user", "content": [{"type": "text", "text": user_msg}]})
-        messages.append({"role": "assistant", "content": [{"type": "text", "text": assistant_msg}]})
-
-    # Add current message with optional image
+    for hist_item in history:
+        if hist_item["role"] == "user":
+            messages.append({
+                "role": "user",
+                "content": [{"type": "text", "text": hist_item["content"]}]
+            })
+        elif hist_item["role"] == "assistant":
+            messages.append({
+                "role": "assistant",
+                "content": [{"type": "text", "text": hist_item["content"]}]
+            })
+
+    # Build current message content
     current_content = []
-    if image is not None:
-        current_content.append({
-            "type": "image",
-            "image": image
-        })
 
-    if message:
+    # Add images if provided
+    if files:
+        for file_path in files:
+            try:
+                image = Image.open(file_path)
+                current_content.append({
+                    "type": "image",
+                    "image": image
+                })
+            except Exception as e:
+                print(f"Error loading image {file_path}: {e}")
+
+    # Add text
+    if text:
         current_content.append({
             "type": "text",
-            "text": message
+            "text": text
         })
 
+    # If no content, return empty
+    if not current_content:
+        return ""
+
+    # Add current message
     messages.append({
         "role": "user",
        "content": current_content
     })
 
-    # Prepare inputs
+    # Prepare inputs for the model
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -81,7 +88,13 @@ def qwen_chat(message, image, chat_history):
 
     # Generate response
     with torch.no_grad():
-        generated_ids = model.generate(**inputs, max_new_tokens=256)
+        generated_ids = model.generate(
+            **inputs,
+            max_new_tokens=512,
+            temperature=0.7,
+            top_p=0.95,
+            do_sample=True
+        )
 
     # Decode output
     generated_ids_trimmed = [
@@ -93,80 +106,80 @@ def qwen_chat(message, image, chat_history):
         clean_up_tokenization_spaces=False
     )[0]
 
-    # Update chat history
-    chat_history.append((message if message else "[Image provided]", output_text))
-
-    return chat_history, ""
+    return output_text
+
+# Example messages for demonstration
+example_messages = [
+    {"text": "Hello! Can you describe what makes a good photograph?", "files": []},
+    {"text": "What's the weather like in this image?", "files": []},
+    {"text": "Can you analyze the composition of this picture?", "files": []},
+]
+
+# Create the ChatInterface
+demo = gr.ChatInterface(
+    fn=qwen_chat_fn,
+    type="messages",
+    multimodal=True,
+    title="🎨 Qwen3-VL Multimodal Chat",
+    description="""
+    Chat with Qwen3-VL-2B-Instruct - A powerful multimodal AI that understands both text and images!
+
+    **Features:**
+    - 📝 Text conversations
+    - 🖼️ Image understanding and analysis
+    - 🎯 Visual question answering
+    - 🔍 Detailed image descriptions
+
+    **How to use:**
+    - Type your message in the text box
+    - Click the attachment button to upload images
+    - Send your message to get AI responses
+
+    [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
+    """,
+    examples=[
+        {"text": "Hello! What can you help me with today?", "files": []},
+        {"text": "Can you explain what machine learning is?", "files": []},
+        {"text": "What are the key elements of good design?", "files": []},
+    ],
+    theme=gr.themes.Soft(),
+    autofocus=True,
+    submit_btn="Send",
+    stop_btn="Stop",
+    retry_btn="🔄 Retry",
+    undo_btn="↩️ Undo",
+    clear_btn="🗑️ Clear",
+    additional_inputs=None,
+    additional_inputs_accordion=None,
+    cache_examples=False,
+    analytics_enabled=False,
+    css="""
+    .contain { max-width: 1200px; margin: auto; }
+    .message { font-size: 14px; }
+    footer { display: none !important; }
+    """,
+    fill_height=True,
+    concurrency_limit=10
+)
 
-# Create Gradio interface
-with gr.Blocks(title="Qwen3-VL Chat") as demo:
+# Add additional information in a Markdown block
+with demo:
     gr.Markdown(
         """
-        # 🎨 Qwen3-VL Chat
-        Chat with Qwen3-VL-2B-Instruct - A multimodal AI that can understand both text and images!
+        ---
+        ### 💡 Tips for Best Results:
+        - **For images:** Upload clear, well-lit images for better analysis
+        - **For questions:** Be specific about what you want to know
+        - **Context matters:** Provide relevant context for more accurate responses
+        - **Multiple images:** You can upload multiple images in a single message
 
-        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
+        ### 🚀 Model Information:
+        - **Model:** Qwen3-VL-2B-Instruct
+        - **Parameters:** 2 Billion
+        - **Capabilities:** Image understanding, OCR, visual reasoning, general conversation
+        - **Powered by:** Hugging Face Spaces with ZeroGPU
         """
     )
-
-    with gr.Row():
-        with gr.Column(scale=3):
-            chatbot = gr.Chatbot(
-                label="Chat History",
-                type="messages",
-                height=600,
-                show_copy_button=True
-            )
-
-        with gr.Column(scale=1):
-            image_input = gr.Image(
-                label="Upload Image (Optional)",
-                type="pil",
-                sources=["upload", "clipboard"],
-                interactive=True
-            )
-
-    with gr.Row():
-        message_input = gr.Textbox(
-            label="Message",
-            placeholder="Type your message here...",
-            lines=2,
-            scale=4
-        )
-        send_btn = gr.Button("Send", scale=1, variant="primary")
-
-    with gr.Row():
-        clear_btn = gr.Button("Clear Chat", variant="secondary")
-
-    gr.Markdown(
-        """
-        ### Tips:
-        - Upload an image to ask questions about it
-        - Describe what you see or ask for analysis
-        - The model can answer questions about images and text
-        """
-    )
-
-    # Event handlers
-    def send_message(msg, img, history):
-        return qwen_chat(msg, img, history)
-
-    send_btn.click(
-        send_message,
-        inputs=[message_input, image_input, chatbot],
-        outputs=[chatbot, message_input]
-    )
-
-    message_input.submit(
-        send_message,
-        inputs=[message_input, image_input, chatbot],
-        outputs=[chatbot, message_input]
-    )
-
-    clear_btn.click(
-        lambda: ([], None, ""),
-        outputs=[chatbot, image_input, message_input]
-    )
 
 if __name__ == "__main__":
     demo.launch(share=False)
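
Note on the refactor above: with `multimodal=True` and `type="messages"`, `gr.ChatInterface` passes the current turn to the handler as a dict with `text` and `files` keys and the history as a list of role/content dicts, which is the shape `qwen_chat_fn` now expects. A minimal sketch of that calling convention (illustrative only; the echo handler and its names are not part of this commit):

```python
import gradio as gr

def echo_fn(message, history):
    # message: {"text": "...", "files": ["/path/to/uploaded/file.png", ...]}
    # history: prior turns as role/content dicts, e.g. {"role": "user", "content": "..."}
    text = message.get("text", "")
    files = message.get("files", [])
    return f"Received {len(files)} file(s); text was {text!r}; {len(history)} prior messages."

# Same ChatInterface flags as the updated app.py uses for its handler
demo = gr.ChatInterface(fn=echo_fn, type="messages", multimodal=True)

if __name__ == "__main__":
    demo.launch()
```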
requirements.txt CHANGED
@@ -5,3 +5,5 @@ torchvision
 pillow
 accelerate
 spaces
+sentencepiece
+qwen-vl-utils