import gradio as gr
from gradio import ChatMessage
from openai import OpenAI
import time
# Configure Lemonade Server connection
base_url = "http://localhost:8000/api/v1"
client = OpenAI(
base_url=base_url,
api_key="lemonade", # required, but unused in Lemonade
)
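# Note (illustrative, not part of the original app): if your Lemonade Server listens on a
# different host/port, the endpoint could be made configurable, e.g. via an environment
# variable of your choosing:
#   import os
#   base_url = os.environ.get("LEMONADE_BASE_URL", "http://localhost:8000/api/v1")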
def stream_chat_response(message: str, history: list, model_name: str, system_prompt: str):
"""
Stream responses from Lemonade Server and display thinking process separately.
"""
# Add user message to history
history.append(ChatMessage(role="user", content=message))
yield history
# Convert history to OpenAI format - only include actual conversation messages
messages = []
# Add system prompt if provided
if system_prompt and system_prompt.strip():
messages.append({"role": "system", "content": system_prompt})
# Convert history, skipping metadata-only messages
for msg in history:
if isinstance(msg, ChatMessage):
# Skip thinking/metadata messages when sending to API
if msg.metadata and msg.metadata.get("title"):
continue
messages.append({
"role": msg.role,
"content": msg.content
})
elif isinstance(msg, dict):
# Skip metadata messages
if msg.get("metadata"):
continue
messages.append({
"role": msg.get("role", "user"),
"content": msg.get("content", "")
})
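    # At this point `messages` is a plain OpenAI-style list, for example:
    #   [{"role": "system", "content": "You are a helpful assistant."},
    #    {"role": "user", "content": "What is 15 + 24?"}]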
try:
# Initialize response tracking
thinking_content = ""
response_content = ""
thinking_added = False
response_added = False
thinking_start_time = None
# Stream response from Lemonade Server
stream = client.chat.completions.create(
model=model_name,
messages=messages,
stream=True,
max_tokens=2048,
temperature=0.7,
)
for chunk in stream:
# Safety check for chunk structure
if not chunk.choices or len(chunk.choices) == 0:
continue
if not hasattr(chunk.choices[0], 'delta'):
continue
delta = chunk.choices[0].delta
# Check for reasoning_content (thinking process)
reasoning_content = getattr(delta, 'reasoning_content', None)
# Check for regular content (final answer)
content = getattr(delta, 'content', None)
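            # Shape of a typical streamed delta (illustrative; which field is populated
            # on a given chunk depends on the model and server):
            #   delta.reasoning_content -> chain-of-thought / "thinking" tokens
            #   delta.content           -> tokens of the final answer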
# Handle reasoning/thinking content
if reasoning_content:
if not thinking_added:
# Add thinking section
thinking_start_time = time.time()
history.append(ChatMessage(
role="assistant",
content="",
metadata={
"title": "🧠 Thought Process",
"status": "pending"
}
))
thinking_added = True
# Accumulate thinking content
thinking_content += reasoning_content
history[-1] = ChatMessage(
role="assistant",
content=thinking_content,
metadata={
"title": "🧠 Thought Process",
"status": "pending"
}
)
yield history
# Handle regular content (final answer)
elif content:
# Finalize thinking section if it exists
if thinking_added and thinking_start_time:
elapsed = time.time() - thinking_start_time
# Update the thinking message to "done" status
for i in range(len(history) - 1, -1, -1):
if isinstance(history[i], ChatMessage) and history[i].metadata and history[i].metadata.get("title") == "🧠 Thought Process":
history[i] = ChatMessage(
role="assistant",
content=thinking_content,
metadata={
"title": "🧠 Thought Process",
"status": "done",
"duration": elapsed
}
)
break
thinking_start_time = None
# Add or update response content
if not response_added:
history.append(ChatMessage(
role="assistant",
content=""
))
response_added = True
response_content += content
history[-1] = ChatMessage(
role="assistant",
content=response_content
)
yield history
# Final check: if thinking section exists but wasn't finalized
if thinking_added and thinking_start_time:
elapsed = time.time() - thinking_start_time
for i in range(len(history) - 1, -1, -1):
if isinstance(history[i], ChatMessage) and history[i].metadata and history[i].metadata.get("title") == "🧠 Thought Process":
history[i] = ChatMessage(
role="assistant",
content=thinking_content,
metadata={
"title": "🧠 Thought Process",
"status": "done",
"duration": elapsed
}
)
break
yield history
except Exception as e:
import traceback
error_msg = str(e)
error_trace = traceback.format_exc()
# Try to extract more details from the error
if "422" in error_msg:
error_details = f"""
⚠️ **Request Validation Error**
The server rejected the request. Possible issues:
- Model name might be incorrect (currently: `{model_name}`)
- Check that the model is loaded on the server
- Try simplifying the system prompt
**Error:** {error_msg}
"""
elif "list index out of range" in error_msg or "IndexError" in error_trace:
error_details = f"""
⚠️ **Streaming Response Error**
There was an issue processing the streaming response.
**Debug Info:**
- Model: `{model_name}`
- Base URL: `{base_url}`
- Error: {error_msg}
Try refreshing and sending another message.
"""
else:
error_details = f"""
⚠️ **Connection Error**
Error: {error_msg}
Make sure:
1. Lemonade Server is running at `{base_url}`
2. Model `{model_name}` is loaded
3. The server is accessible
**Debug trace:**
```
{error_trace[-500:]}
```
"""
history.append(ChatMessage(
role="assistant",
content=error_details,
metadata={
"title": "⚠️ Error Details"
}
))
yield history
def clear_chat():
"""Clear the chat history."""
return []
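# Note: clear_chat is not wired into the UI below. A minimal sketch of how it could be
# attached (hypothetical `clear_btn`, not part of the original layout):
#   clear_btn = gr.Button("Clear chat")
#   clear_btn.click(clear_chat, inputs=None, outputs=chatbot)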
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
# Define input textbox first so it can be referenced in Examples
msg = gr.Textbox(
placeholder="Type your message here and press Enter...",
show_label=False,
container=False,
render=False # Don't render yet, will be rendered in main area
)
# Sidebar for settings and information
with gr.Sidebar(position="left", open=True):
gr.Markdown("""
# πŸ‹ Lemonade Reasoning Chatbot
Chat with local LLMs running on AMD Lemonade Server. This interface beautifully displays the model's thinking process!
""")
gr.Markdown("### βš™οΈ Settings")
model_dropdown = gr.Dropdown(
choices=[
"Qwen3-0.6B-GGUF",
"Llama-3.1-8B-Instruct-Hybrid",
"Qwen2.5-7B-Instruct",
"Phi-3.5-mini-instruct",
"Meta-Llama-3-8B-Instruct"
],
value="Qwen3-0.6B-GGUF",
label="Model",
info="Select the LLM model to use",
allow_custom_value=True
)
system_prompt = gr.Textbox(
label="System Prompt (Optional)",
value="You are a helpful assistant.",
lines=3,
info="Customize the model's behavior",
placeholder="Leave empty to use model defaults"
)
# How Thinking Works Accordion
with gr.Accordion("πŸ’‘ How Thinking Works", open=False):
gr.Markdown("""
- Reasoning models output `reasoning_content` (thinking) and `content` (final answer) separately
- Thinking appears in a collapsible "🧠 Thought Process" section
- Duration of thinking is displayed automatically
- Works with models like: DeepSeek-R1, QwQ, and other reasoning models
""")
# Current Model Accordion
with gr.Accordion("πŸ“‹ Current Model", open=False):
gr.Markdown("""
Make sure your model supports reasoning output for thinking to be displayed.
""")
# Example Prompts Accordion
with gr.Accordion("πŸ“ Example Prompts", open=False):
gr.Markdown("""
- "Solve: If a train travels 120 km in 2 hours, what's its speed?"
- "Compare pros and cons of electric vs gas cars"
- "Explain step-by-step how to make coffee"
- "What's the difference between AI and ML?"
""")
# Add example interactions in sidebar
gr.Examples(
examples=[
"What is 15 + 24?",
"Write a short poem about AI",
"What is the capital of Japan?",
"Explain what machine learning is in simple terms"
],
inputs=msg,
label="Quick Examples"
)
# Main chat area - full screen
chatbot = gr.Chatbot(
type="messages",
label="Chat",
height="calc(100vh - 200px)",
avatar_images=(
"https://em-content.zobj.net/source/twitter/376/bust-in-silhouette_1f464.png",
"https://em-content.zobj.net/source/twitter/376/robot_1f916.png"
),
show_label=False,
        placeholder="""<div>
            <img src="/gradio_api/file=placeholder.png">
        </div>"""
)
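    # Note: the placeholder image is referenced via /gradio_api/file=..., which only
    # resolves because demo.launch() below adds the current directory to allowed_paths
    # (and assumes a placeholder.png sits next to app.py).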
# Render the input textbox in main area
msg.render()
# Event handlers - only submit event
    def submit_message(message, history, model, sys_prompt):
        """Wrapper to handle message submission."""
        if not message or message.strip() == "":
            # Nothing to send: leave the chat history unchanged
            yield history
            return
        yield from stream_chat_response(message, history, model, sys_prompt)
msg.submit(
submit_message,
inputs=[msg, chatbot, model_dropdown, system_prompt],
outputs=chatbot
).then(
lambda: "",
None,
msg
)
# Launch the app
if __name__ == "__main__":
demo.launch(allowed_paths=["."], ssr_mode=True)