# models/llm_chat.py
from __future__ import annotations

from typing import List, Dict, Any, Tuple
import os

from utils.config import get_settings

# --- Small, readable menu JSON kept in the system prompt for now ---
MENU_JSON = """
{
  "pizzas": [
    {"name": "Margherita Pizza", "sizes": ["small", "medium", "large"],
     "price": {"small": 8.5, "medium": 11.0, "large": 13.5}},
    {"name": "Pepperoni Pizza", "sizes": ["small", "medium", "large"],
     "price": {"small": 9.5, "medium": 12.0, "large": 14.5}}
  ],
  "salads": [
    {"name": "House Salad", "sizes": ["regular"], "price": {"regular": 6.0}}
  ],
  "drinks": [
    {"name": "Cola", "sizes": ["can"], "price": {"can": 2.0}}
  ],
  "hours": "11:00–22:00 daily",
  "address": "123 Main St",
  "phone": "+1 (555) 010-0000"
}
"""

SYSTEM_PROMPT = f"""You are Marta, the AI call/SMS assistant for FutureCafe.

You talk naturally and help with:
- Menu questions, placing orders, hours/location, and reservations (lightweight).
- If the user asks for pizza/order: list choices from the MENU and ask for missing details (size, quantity, etc.).
- If the user provides all details, confirm the order in words (no need to return JSON) and include a brief total using MENU prices.
- For hours/location, reply from the MENU.
- For unrelated topics, gently steer back to FutureCafe; if the user remains off-topic for 3 turns total, politely end.
- Keep replies concise and friendly. No long explanations.

MENU (JSON you can read from for options & prices):
{MENU_JSON}
"""

# ---------------- llama.cpp singleton ----------------
_llm = None


def _get_local_llm():
    """Singleton llama.cpp model loader (GGUF)."""
    global _llm
    if _llm is not None:
        return _llm

    from llama_cpp import Llama

    s = get_settings()
    model_path = os.getenv("LLAMACPP_MODEL_PATH", getattr(s, "LLAMACPP_MODEL_PATH", None))
    if not model_path or not os.path.exists(model_path):
        raise RuntimeError(f"LLAMACPP_MODEL_PATH not found: {model_path}")

    _llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,
        n_gpu_layers=0,  # CPU by default
        verbose=False,
    )
    return _llm


def _apply_chat_template(messages: List[Dict[str, str]]) -> str:
    """Render role-tagged messages into a single prompt string, ending with an open assistant turn."""
    parts = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        if role == "system":
            parts.append(f"<|system|>\n{content}\n")
        elif role == "user":
            parts.append(f"<|user|>\n{content}\n")
        else:
            parts.append(f"<|assistant|>\n{content}\n")
    parts.append("<|assistant|>\n")
    return "\n".join(parts)


def _generate(messages: List[Dict[str, str]], temperature=0.3, max_tokens=320) -> str:
    llm = _get_local_llm()
    prompt = _apply_chat_template(messages)
    out = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=["<|user|>", "<|system|>", "<|assistant|>"],
    )
    return (out["choices"][0]["text"] or "").strip()


def respond_chat(
    history: List[Dict[str, str]],
    user_text: str,
    guard_state: Dict[str, Any] | None,
) -> Tuple[str, Dict[str, Any], Dict[str, Any]]:
    """
    LLM-only conversational brain.

    Returns: (assistant_text, new_guard_state, diag)
    guard_state: {"unrelated": int, "ended": int, "limit": int}
    """
    guard = dict(guard_state or {"unrelated": 0, "ended": 0, "limit": 3})
    if guard.get("ended"):
        return "(Conversation ended. Start a new chat for FutureCafe.)", guard, {}

    msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history:
        msgs.extend(history[-10:])
    msgs.append({"role": "user", "content": user_text})

    reply = _generate(msgs)

    # A super-light off-topic guard without keywords: if the model signals ending, we respect it.
    # Otherwise, keep the conversation flowing; we do not hard-code keywords or intents here.
    # (We still carry the 'unrelated' counter so a later version can nudge based on signals.)
    if "Let’s end" in reply or "Let's end" in reply:
        guard["ended"] = 1

    return reply, guard, {}  # no tool_result/diagnostics needed for this simpler flow
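# --- Usage sketch (illustrative addition, not part of the original module) -------------
# A minimal example of driving respond_chat() from a script or REPL, assuming
# LLAMACPP_MODEL_PATH points at a valid GGUF model and utils.config.get_settings()
# is importable in the current environment. The sample turns below are hypothetical.
if __name__ == "__main__":
    guard: Dict[str, Any] | None = None   # respond_chat() seeds default counters on the first call
    history: List[Dict[str, str]] = []    # running chat history of {"role", "content"} dicts

    for turn in ["What pizzas do you have?", "A large Pepperoni, please."]:
        reply, guard, _diag = respond_chat(history, turn, guard)
        print(f"user: {turn}\nmarta: {reply}\n")
        # Keep the history in the same role/content shape the chat template expects.
        history.append({"role": "user", "content": turn})
        history.append({"role": "assistant", "content": reply})
        if guard.get("ended"):
            break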