# models/llm_chat.py

from __future__ import annotations

from typing import List, Dict, Any, Tuple
import os

from utils.config import get_settings

# --- Small, readable menu JSON kept in the system prompt for now ---
MENU_JSON = """
{
  "pizzas": [
    {"name": "Margherita Pizza", "sizes": ["small", "medium", "large"], "price": {"small": 8.5, "medium": 11.0, "large": 13.5}},
    {"name": "Pepperoni Pizza", "sizes": ["small", "medium", "large"], "price": {"small": 9.5, "medium": 12.0, "large": 14.5}}
  ],
  "salads": [
    {"name": "House Salad", "sizes": ["regular"], "price": {"regular": 6.0}}
  ],
  "drinks": [
    {"name": "Cola", "sizes": ["can"], "price": {"can": 2.0}}
  ],
  "hours": "11:00–22:00 daily",
  "address": "123 Main St",
  "phone": "+1 (555) 010-0000"
}
"""
| SYSTEM_PROMPT = f"""You are Marta, the AI call/SMS assistant for FutureCafe. | |
| You talk naturally and help with: | |
| - Menu questions, placing orders, hours/location, and reservations (lightweight). | |
| - If the user asks for pizza/order: list choices from the MENU and ask for missing details (size, quantity, etc.). | |
| - If user provides all details, confirm the order in words (no need to return JSON), include a brief total using MENU prices. | |
| - For hours/location, reply from MENU. | |
| - For unrelated topics, gently steer back to FutureCafe; if the user remains off-topic for 3 turns total, politely end. | |
| - Keep replies concise and friendly. No long explanations. | |
| MENU (JSON you can read from for options & prices): | |
| {MENU_JSON} | |
| """ | |
# ---------------- llama.cpp singleton ----------------
_llm = None


def _get_local_llm():
    """Singleton llama.cpp model loader (GGUF)."""
    global _llm
    if _llm is not None:
        return _llm

    from llama_cpp import Llama

    s = get_settings()
    model_path = os.getenv("LLAMACPP_MODEL_PATH", getattr(s, "LLAMACPP_MODEL_PATH", None))
    if not model_path or not os.path.exists(model_path):
        raise RuntimeError(f"LLAMACPP_MODEL_PATH not found: {model_path}")

    _llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,
        n_gpu_layers=0,  # CPU by default
        verbose=False,
    )
    return _llm
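# Configuration note (assumption about deployment, not part of the loader itself): the
# model path can come either from the environment, e.g. LLAMACPP_MODEL_PATH=/path/to/model.gguf,
# or from a LLAMACPP_MODEL_PATH attribute on the settings object returned by
# utils.config.get_settings(); the environment variable wins when both are set.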
def _apply_chat_template(messages: List[Dict[str, str]]) -> str:
    """Render chat messages into a plain-text prompt using generic <|role|> markers."""
    parts = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        if role == "system":
            parts.append(f"<|system|>\n{content}\n")
        elif role == "user":
            parts.append(f"<|user|>\n{content}\n")
        else:
            parts.append(f"<|assistant|>\n{content}\n")
    # Trailing assistant marker cues the model to produce the next reply.
    parts.append("<|assistant|>\n")
    return "\n".join(parts)
def _generate(messages: List[Dict[str, str]], temperature: float = 0.3, max_tokens: int = 320) -> str:
    llm = _get_local_llm()
    prompt = _apply_chat_template(messages)
    out = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=["<|user|>", "<|system|>", "<|assistant|>"],
    )
    return (out["choices"][0]["text"] or "").strip()
def respond_chat(
    history: List[Dict[str, str]],
    user_text: str,
    guard_state: Dict[str, Any] | None,
) -> Tuple[str, Dict[str, Any], Dict[str, Any]]:
    """
    LLM-only conversational brain.

    Returns: (assistant_text, new_guard_state, diag)
    guard_state: {"unrelated": int, "ended": int, "limit": int}
    """
    guard = dict(guard_state or {"unrelated": 0, "ended": 0, "limit": 3})
    if guard.get("ended"):
        return "(Conversation ended. Start a new chat for FutureCafe.)", guard, {}

    msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history:
        msgs.extend(history[-10:])
    msgs.append({"role": "user", "content": user_text})

    reply = _generate(msgs)

    # Super-light off-topic guard with no keyword matching: if the model signals that it
    # is ending the conversation, we respect it; otherwise the conversation keeps flowing.
    # No keywords or intents are hard-coded here. (The 'unrelated' counter is still carried
    # in guard_state in case you later want to nudge based on other signals.)
    if "Let’s end" in reply or "Let's end" in reply:
        guard["ended"] = 1

    return reply, guard, {}  # no tool_result/diagnostics needed for this simpler flow
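

# --- Minimal usage sketch (assumes LLAMACPP_MODEL_PATH points at a valid local GGUF
# file; the two sample turns below are illustrative, not fixtures from the project) ---
if __name__ == "__main__":
    history: List[Dict[str, str]] = []
    guard: Dict[str, Any] | None = None
    for turn in ["Hi! What pizzas do you have?", "A large Pepperoni, please."]:
        reply, guard, _diag = respond_chat(history, turn, guard)
        print(f"user:  {turn}\nmarta: {reply}\n")
        # Keep the rolling history in the same role/content shape respond_chat expects.
        history.append({"role": "user", "content": turn})
        history.append({"role": "assistant", "content": reply})
        if guard.get("ended"):
            break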