# models/llm_chat.py
from __future__ import annotations
from typing import List, Dict, Any, Tuple
import os
from utils.config import get_settings
# --- Small, readable menu JSON kept in the system prompt for now ---
MENU_JSON = """
{
  "pizzas": [
    {"name": "Margherita Pizza", "sizes": ["small", "medium", "large"], "price": {"small": 8.5, "medium": 11.0, "large": 13.5}},
    {"name": "Pepperoni Pizza", "sizes": ["small", "medium", "large"], "price": {"small": 9.5, "medium": 12.0, "large": 14.5}}
  ],
  "salads": [
    {"name": "House Salad", "sizes": ["regular"], "price": {"regular": 6.0}}
  ],
  "drinks": [
    {"name": "Cola", "sizes": ["can"], "price": {"can": 2.0}}
  ],
  "hours": "11:00–22:00 daily",
  "address": "123 Main St",
  "phone": "+1 (555) 010-0000"
}
"""
SYSTEM_PROMPT = f"""You are Marta, the AI call/SMS assistant for FutureCafe.
You talk naturally and help with:
- Menu questions, placing orders, hours/location, and lightweight reservations.
- If the user asks about pizza or wants to order: list choices from the MENU and ask for any missing details (size, quantity, etc.).
- If the user provides all details, confirm the order in words (no need to return JSON) and include a brief total using MENU prices.
- For hours/location, reply from the MENU.
- For unrelated topics, gently steer back to FutureCafe; if the user stays off-topic for 3 turns total, politely end the conversation.
- Keep replies concise and friendly. No long explanations.
MENU (JSON you can read from for options & prices):
{MENU_JSON}
"""
# ---------------- llama.cpp singleton ----------------
_llm = None
def _get_local_llm():
    """Singleton llama.cpp model loader (GGUF)."""
    global _llm
    if _llm is not None:
        return _llm
    from llama_cpp import Llama
    s = get_settings()
    model_path = os.getenv("LLAMACPP_MODEL_PATH", getattr(s, "LLAMACPP_MODEL_PATH", None))
    if not model_path or not os.path.exists(model_path):
        raise RuntimeError(f"LLAMACPP_MODEL_PATH not found: {model_path}")
    _llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,
        n_gpu_layers=0,  # CPU by default
        verbose=False,
    )
    return _llm
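# Example (deployment-specific, shown for illustration only) of pointing the loader
# above at a local GGUF file before starting the app:
#
#   export LLAMACPP_MODEL_PATH=/path/to/model.gguf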
def _apply_chat_template(messages: List[Dict[str, str]]) -> str:
    parts = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        if role == "system":
            parts.append(f"<|system|>\n{content}\n")
        elif role == "user":
            parts.append(f"<|user|>\n{content}\n")
        else:
            parts.append(f"<|assistant|>\n{content}\n")
    parts.append("<|assistant|>\n")
    return "\n".join(parts)
def _generate(messages: List[Dict[str, str]], temperature=0.3, max_tokens=320) -> str:
    llm = _get_local_llm()
    prompt = _apply_chat_template(messages)
    out = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=["<|user|>", "<|system|>", "<|assistant|>"],
    )
    return (out["choices"][0]["text"] or "").strip()
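# Direct use (illustrative): _generate expects OpenAI-style role/content dicts, e.g.
#
#   _generate([{"role": "system", "content": SYSTEM_PROMPT},
#              {"role": "user", "content": "Are you open right now?"}])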
def respond_chat(
    history: List[Dict[str, str]],
    user_text: str,
    guard_state: Dict[str, Any] | None,
) -> Tuple[str, Dict[str, Any], Dict[str, Any]]:
    """
    LLM-only conversational brain.
    Returns: (assistant_text, new_guard_state, diag)
    guard_state: {"unrelated": int, "ended": int, "limit": int}
    """
    guard = dict(guard_state or {"unrelated": 0, "ended": 0, "limit": 3})
    if guard.get("ended"):
        return "(Conversation ended. Start a new chat for FutureCafe.)", guard, {}
    msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history:
        msgs.extend(history[-10:])
    msgs.append({"role": "user", "content": user_text})
    reply = _generate(msgs)
    # A lightweight off-topic guard without keywords: if the model signals that the
    # conversation should end, we respect it; otherwise the conversation keeps flowing.
    # The 'unrelated' counter stays in guard_state in case you later want to nudge
    # the user based on such signals.
    if "Let’s end" in reply or "Let's end" in reply:
        guard["ended"] = 1
    return reply, guard, {}  # no tool_result/diagnostics needed for this simpler flow
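# Minimal local smoke test (an illustrative sketch, not part of the service flow);
# it assumes a valid GGUF model is reachable via LLAMACPP_MODEL_PATH.
if __name__ == "__main__":
    state = None
    history = []
    for turn in ["What pizzas do you have?", "One large Pepperoni, please."]:
        reply, state, _diag = respond_chat(history, turn, state)
        history.append({"role": "user", "content": turn})
        history.append({"role": "assistant", "content": reply})
        print(f"user:  {turn}\nmarta: {reply}\n")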