import gradio as gr from gradio import ChatMessage from openai import OpenAI import time # Configure Lemonade Server connection base_url = "http://localhost:8000/api/v1" client = OpenAI( base_url=base_url, api_key="lemonade", # required, but unused in Lemonade ) def stream_chat_response(message: str, history: list, model_name: str, system_prompt: str): """ Stream responses from Lemonade Server and display thinking process separately. """ # Add user message to history history.append(ChatMessage(role="user", content=message)) yield history # Convert history to OpenAI format - only include actual conversation messages messages = [] # Add system prompt if provided if system_prompt and system_prompt.strip(): messages.append({"role": "system", "content": system_prompt}) # Convert history, skipping metadata-only messages for msg in history: if isinstance(msg, ChatMessage): # Skip thinking/metadata messages when sending to API if msg.metadata and msg.metadata.get("title"): continue messages.append({ "role": msg.role, "content": msg.content }) elif isinstance(msg, dict): # Skip metadata messages if msg.get("metadata"): continue messages.append({ "role": msg.get("role", "user"), "content": msg.get("content", "") }) try: # Initialize response tracking thinking_content = "" response_content = "" thinking_added = False response_added = False thinking_start_time = None # Stream response from Lemonade Server stream = client.chat.completions.create( model=model_name, messages=messages, stream=True, max_tokens=2048, temperature=0.7, ) for chunk in stream: # Safety check for chunk structure if not chunk.choices or len(chunk.choices) == 0: continue if not hasattr(chunk.choices[0], 'delta'): continue delta = chunk.choices[0].delta # Check for reasoning_content (thinking process) reasoning_content = getattr(delta, 'reasoning_content', None) # Check for regular content (final answer) content = getattr(delta, 'content', None) # Handle reasoning/thinking content if reasoning_content: if not thinking_added: # Add thinking section thinking_start_time = time.time() history.append(ChatMessage( role="assistant", content="", metadata={ "title": "🧠 Thought Process", "status": "pending" } )) thinking_added = True # Accumulate thinking content thinking_content += reasoning_content history[-1] = ChatMessage( role="assistant", content=thinking_content, metadata={ "title": "🧠 Thought Process", "status": "pending" } ) yield history # Handle regular content (final answer) elif content: # Finalize thinking section if it exists if thinking_added and thinking_start_time: elapsed = time.time() - thinking_start_time # Update the thinking message to "done" status for i in range(len(history) - 1, -1, -1): if isinstance(history[i], ChatMessage) and history[i].metadata and history[i].metadata.get("title") == "🧠 Thought Process": history[i] = ChatMessage( role="assistant", content=thinking_content, metadata={ "title": "🧠 Thought Process", "status": "done", "duration": elapsed } ) break thinking_start_time = None # Add or update response content if not response_added: history.append(ChatMessage( role="assistant", content="" )) response_added = True response_content += content history[-1] = ChatMessage( role="assistant", content=response_content ) yield history # Final check: if thinking section exists but wasn't finalized if thinking_added and thinking_start_time: elapsed = time.time() - thinking_start_time for i in range(len(history) - 1, -1, -1): if isinstance(history[i], ChatMessage) and history[i].metadata and history[i].metadata.get("title") == "🧠 Thought Process": history[i] = ChatMessage( role="assistant", content=thinking_content, metadata={ "title": "🧠 Thought Process", "status": "done", "duration": elapsed } ) break yield history except Exception as e: import traceback error_msg = str(e) error_trace = traceback.format_exc() # Try to extract more details from the error if "422" in error_msg: error_details = f""" ⚠️ **Request Validation Error** The server rejected the request. Possible issues: - Model name might be incorrect (currently: `{model_name}`) - Check that the model is loaded on the server - Try simplifying the system prompt **Error:** {error_msg} """ elif "list index out of range" in error_msg or "IndexError" in error_trace: error_details = f""" ⚠️ **Streaming Response Error** There was an issue processing the streaming response. **Debug Info:** - Model: `{model_name}` - Base URL: `{base_url}` - Error: {error_msg} Try refreshing and sending another message. """ else: error_details = f""" ⚠️ **Connection Error** Error: {error_msg} Make sure: 1. Lemonade Server is running at `{base_url}` 2. Model `{model_name}` is loaded 3. The server is accessible **Debug trace:** ``` {error_trace[-500:]} ``` """ history.append(ChatMessage( role="assistant", content=error_details, metadata={ "title": "⚠️ Error Details" } )) yield history def clear_chat(): """Clear the chat history.""" return [] # Build the Gradio interface with gr.Blocks(theme=gr.themes.Ocean()) as demo: # Define input textbox first so it can be referenced in Examples msg = gr.Textbox( placeholder="Type your message here and press Enter...", show_label=False, container=False, render=False # Don't render yet, will be rendered in main area ) # Sidebar for settings and information with gr.Sidebar(position="left", open=True): gr.Markdown(""" # 🍋 Lemonade Reasoning Chatbot Chat with local LLMs running on AMD Lemonade Server. This interface beautifully displays the model's thinking process! """) gr.Markdown("### ⚙️ Settings") model_dropdown = gr.Dropdown( choices=[ "Qwen3-0.6B-GGUF", "Llama-3.1-8B-Instruct-Hybrid", "Qwen2.5-7B-Instruct", "Phi-3.5-mini-instruct", "Meta-Llama-3-8B-Instruct" ], value="Qwen3-0.6B-GGUF", label="Model", info="Select the LLM model to use", allow_custom_value=True ) system_prompt = gr.Textbox( label="System Prompt (Optional)", value="You are a helpful assistant.", lines=3, info="Customize the model's behavior", placeholder="Leave empty to use model defaults" ) # How Thinking Works Accordion with gr.Accordion("💡 How Thinking Works", open=False): gr.Markdown(""" - Reasoning models output `reasoning_content` (thinking) and `content` (final answer) separately - Thinking appears in a collapsible "🧠 Thought Process" section - Duration of thinking is displayed automatically - Works with models like: DeepSeek-R1, QwQ, and other reasoning models """) # Current Model Accordion with gr.Accordion("📋 Current Model", open=False): gr.Markdown(""" Make sure your model supports reasoning output for thinking to be displayed. """) # Example Prompts Accordion with gr.Accordion("📝 Example Prompts", open=False): gr.Markdown(""" - "Solve: If a train travels 120 km in 2 hours, what's its speed?" - "Compare pros and cons of electric vs gas cars" - "Explain step-by-step how to make coffee" - "What's the difference between AI and ML?" """) # Add example interactions in sidebar gr.Examples( examples=[ "What is 15 + 24?", "Write a short poem about AI", "What is the capital of Japan?", "Explain what machine learning is in simple terms" ], inputs=msg, label="Quick Examples" ) # Main chat area - full screen chatbot = gr.Chatbot( type="messages", label="Chat", height="calc(100vh - 200px)", avatar_images=( "https://em-content.zobj.net/source/twitter/376/bust-in-silhouette_1f464.png", "https://em-content.zobj.net/source/twitter/376/robot_1f916.png" ), show_label=False, #placeholder="C:\Users\Yuvi\dev\testing\placeholder.png" placeholder= #""" #
# Placeholder #
#""" """
""" ) # Render the input textbox in main area msg.render() # Event handlers - only submit event def submit_message(message, history, model, sys_prompt): """Wrapper to handle message submission""" if not message or message.strip() == "": return history, "" yield from stream_chat_response(message, history, model, sys_prompt) msg.submit( submit_message, inputs=[msg, chatbot, model_dropdown, system_prompt], outputs=chatbot ).then( lambda: "", None, msg ) # Launch the app if __name__ == "__main__": demo.launch(allowed_paths=["."], ssr_mode=True)