import gradio as gr
from gradio import ChatMessage
from openai import OpenAI
import time
# Configure Lemonade Server connection
base_url = "http://localhost:8000/api/v1"
client = OpenAI(
base_url=base_url,
api_key="lemonade", # required, but unused in Lemonade
)
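# Note (illustrative, not part of the original app): if your Lemonade Server listens on a
# different host/port, the endpoint could be made configurable, e.g. via an environment
# variable of your choosing:
#   import os
#   base_url = os.environ.get("LEMONADE_BASE_URL", "http://localhost:8000/api/v1")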
def stream_chat_response(message: str, history: list, model_name: str, system_prompt: str):
"""
Stream responses from Lemonade Server and display thinking process separately.
"""
# Add user message to history
history.append(ChatMessage(role="user", content=message))
yield history
# Convert history to OpenAI format - only include actual conversation messages
messages = []
# Add system prompt if provided
if system_prompt and system_prompt.strip():
messages.append({"role": "system", "content": system_prompt})
# Convert history, skipping metadata-only messages
for msg in history:
if isinstance(msg, ChatMessage):
# Skip thinking/metadata messages when sending to API
if msg.metadata and msg.metadata.get("title"):
continue
messages.append({
"role": msg.role,
"content": msg.content
})
elif isinstance(msg, dict):
# Skip metadata messages
if msg.get("metadata"):
continue
messages.append({
"role": msg.get("role", "user"),
"content": msg.get("content", "")
})
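    # At this point `messages` is a plain OpenAI-style list, for example:
    #   [{"role": "system", "content": "You are a helpful assistant."},
    #    {"role": "user", "content": "What is 15 + 24?"}]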
try:
# Initialize response tracking
thinking_content = ""
response_content = ""
thinking_added = False
response_added = False
thinking_start_time = None
# Stream response from Lemonade Server
stream = client.chat.completions.create(
model=model_name,
messages=messages,
stream=True,
max_tokens=2048,
temperature=0.7,
)
for chunk in stream:
# Safety check for chunk structure
if not chunk.choices or len(chunk.choices) == 0:
continue
if not hasattr(chunk.choices[0], 'delta'):
continue
delta = chunk.choices[0].delta
# Check for reasoning_content (thinking process)
reasoning_content = getattr(delta, 'reasoning_content', None)
# Check for regular content (final answer)
content = getattr(delta, 'content', None)
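            # Shape of a typical streamed delta (illustrative; which field is populated
            # on a given chunk depends on the model and server):
            #   delta.reasoning_content -> chain-of-thought / "thinking" tokens
            #   delta.content           -> tokens of the final answer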
# Handle reasoning/thinking content
if reasoning_content:
if not thinking_added:
# Add thinking section
thinking_start_time = time.time()
history.append(ChatMessage(
role="assistant",
content="",
metadata={
"title": "🧠 Thought Process",
"status": "pending"
}
))
thinking_added = True
# Accumulate thinking content
thinking_content += reasoning_content
history[-1] = ChatMessage(
role="assistant",
content=thinking_content,
metadata={
"title": "🧠 Thought Process",
"status": "pending"
}
)
yield history
# Handle regular content (final answer)
elif content:
# Finalize thinking section if it exists
if thinking_added and thinking_start_time:
elapsed = time.time() - thinking_start_time
# Update the thinking message to "done" status
for i in range(len(history) - 1, -1, -1):
if isinstance(history[i], ChatMessage) and history[i].metadata and history[i].metadata.get("title") == "🧠 Thought Process":
history[i] = ChatMessage(
role="assistant",
content=thinking_content,
metadata={
"title": "🧠 Thought Process",
"status": "done",
"duration": elapsed
}
)
break
thinking_start_time = None
# Add or update response content
if not response_added:
history.append(ChatMessage(
role="assistant",
content=""
))
response_added = True
response_content += content
history[-1] = ChatMessage(
role="assistant",
content=response_content
)
yield history
# Final check: if thinking section exists but wasn't finalized
if thinking_added and thinking_start_time:
elapsed = time.time() - thinking_start_time
for i in range(len(history) - 1, -1, -1):
if isinstance(history[i], ChatMessage) and history[i].metadata and history[i].metadata.get("title") == "🧠 Thought Process":
history[i] = ChatMessage(
role="assistant",
content=thinking_content,
metadata={
"title": "🧠 Thought Process",
"status": "done",
"duration": elapsed
}
)
break
yield history
except Exception as e:
import traceback
error_msg = str(e)
error_trace = traceback.format_exc()
# Try to extract more details from the error
if "422" in error_msg:
error_details = f"""
⚠️ **Request Validation Error**
The server rejected the request. Possible issues:
- Model name might be incorrect (currently: `{model_name}`)
- Check that the model is loaded on the server
- Try simplifying the system prompt
**Error:** {error_msg}
"""
elif "list index out of range" in error_msg or "IndexError" in error_trace:
error_details = f"""
⚠️ **Streaming Response Error**
There was an issue processing the streaming response.
**Debug Info:**
- Model: `{model_name}`
- Base URL: `{base_url}`
- Error: {error_msg}
Try refreshing and sending another message.
"""
else:
error_details = f"""
⚠️ **Connection Error**
Error: {error_msg}
Make sure:
1. Lemonade Server is running at `{base_url}`
2. Model `{model_name}` is loaded
3. The server is accessible
**Debug trace:**
```
{error_trace[-500:]}
```
"""
history.append(ChatMessage(
role="assistant",
content=error_details,
metadata={
"title": "⚠️ Error Details"
}
))
yield history
def clear_chat():
"""Clear the chat history."""
return []
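# Note: clear_chat is not wired into the UI below. A minimal sketch of how it could be
# attached (hypothetical `clear_btn`, not part of the original layout):
#   clear_btn = gr.Button("Clear chat")
#   clear_btn.click(clear_chat, inputs=None, outputs=chatbot)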
# Build the Gradio interface
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
# Define input textbox first so it can be referenced in Examples
msg = gr.Textbox(
placeholder="Type your message here and press Enter...",
show_label=False,
container=False,
render=False # Don't render yet, will be rendered in main area
)
# Sidebar for settings and information
with gr.Sidebar(position="left", open=True):
gr.Markdown("""
# πŸ‹ Lemonade Reasoning Chatbot
Chat with local LLMs running on AMD Lemonade Server. This interface beautifully displays the model's thinking process!
""")
gr.Markdown("### βš™οΈ Settings")
model_dropdown = gr.Dropdown(
choices=[
"Qwen3-0.6B-GGUF",
"Llama-3.1-8B-Instruct-Hybrid",
"Qwen2.5-7B-Instruct",
"Phi-3.5-mini-instruct",
"Meta-Llama-3-8B-Instruct"
],
value="Qwen3-0.6B-GGUF",
label="Model",
info="Select the LLM model to use",
allow_custom_value=True
)
system_prompt = gr.Textbox(
label="System Prompt (Optional)",
value="You are a helpful assistant.",
lines=3,
info="Customize the model's behavior",
placeholder="Leave empty to use model defaults"
)
# How Thinking Works Accordion
with gr.Accordion("πŸ’‘ How Thinking Works", open=False):
gr.Markdown("""
- Reasoning models output `reasoning_content` (thinking) and `content` (final answer) separately
- Thinking appears in a collapsible "🧠 Thought Process" section
- Duration of thinking is displayed automatically
- Works with models like: DeepSeek-R1, QwQ, and other reasoning models
""")
# Current Model Accordion
with gr.Accordion("πŸ“‹ Current Model", open=False):
gr.Markdown("""
Make sure your model supports reasoning output for thinking to be displayed.
""")
# Example Prompts Accordion
with gr.Accordion("πŸ“ Example Prompts", open=False):
gr.Markdown("""
- "Solve: If a train travels 120 km in 2 hours, what's its speed?"
- "Compare pros and cons of electric vs gas cars"
- "Explain step-by-step how to make coffee"
- "What's the difference between AI and ML?"
""")
# Add example interactions in sidebar
gr.Examples(
examples=[
"What is 15 + 24?",
"Write a short poem about AI",
"What is the capital of Japan?",
"Explain what machine learning is in simple terms"
],
inputs=msg,
label="Quick Examples"
)
# Main chat area - full screen
chatbot = gr.Chatbot(
type="messages",
label="Chat",
height="calc(100vh - 200px)",
avatar_images=(
"https://em-content.zobj.net/source/twitter/376/bust-in-silhouette_1f464.png",
"https://em-content.zobj.net/source/twitter/376/robot_1f916.png"
),
show_label=False,
        placeholder="""<div>
            <img src="/gradio_api/file=placeholder.png">
        </div>"""
)
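    # Note: the placeholder image is referenced via /gradio_api/file=..., which only
    # resolves because demo.launch() below adds the current directory to allowed_paths
    # (and assumes a placeholder.png sits next to app.py).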
# Render the input textbox in main area
msg.render()
# Event handlers - only submit event
    def submit_message(message, history, model, sys_prompt):
        """Wrapper to handle message submission."""
        if not message or message.strip() == "":
            # Nothing to send: leave the chat history unchanged
            yield history
            return
        yield from stream_chat_response(message, history, model, sys_prompt)
msg.submit(
submit_message,
inputs=[msg, chatbot, model_dropdown, system_prompt],
outputs=chatbot
).then(
lambda: "",
None,
msg
)
# Launch the app
if __name__ == "__main__":
demo.launch(allowed_paths=["."], ssr_mode=True)