# models/llm_chat.py
from __future__ import annotations
from typing import List, Dict, Any, Tuple
import os
from utils.config import get_settings
# --- Small, readable menu JSON kept in the system prompt for now ---
MENU_JSON = """
{
  "pizzas": [
    {"name": "Margherita Pizza", "sizes": ["small", "medium", "large"], "price": {"small": 8.5, "medium": 11.0, "large": 13.5}},
    {"name": "Pepperoni Pizza", "sizes": ["small", "medium", "large"], "price": {"small": 9.5, "medium": 12.0, "large": 14.5}}
  ],
  "salads": [
    {"name": "House Salad", "sizes": ["regular"], "price": {"regular": 6.0}}
  ],
  "drinks": [
    {"name": "Cola", "sizes": ["can"], "price": {"can": 2.0}}
  ],
  "hours": "11:00–22:00 daily",
  "address": "123 Main St",
  "phone": "+1 (555) 010-0000"
}
"""
SYSTEM_PROMPT = f"""You are Marta, the AI call/SMS assistant for FutureCafe.
You talk naturally and help with:
- Menu questions, placing orders, hours/location, and lightweight reservations.
- If the user asks about pizza or wants to order: list choices from the MENU and ask for any missing details (size, quantity, etc.).
- If the user provides all details, confirm the order in words (no need to return JSON) and include a brief total using MENU prices.
- For hours/location, reply from the MENU.
- For unrelated topics, gently steer back to FutureCafe; if the user stays off-topic for 3 turns total, politely end the conversation.
- Keep replies concise and friendly. No long explanations.
MENU (JSON you can read from for options & prices):
{MENU_JSON}
"""
# ---------------- llama.cpp singleton ----------------
_llm = None
def _get_local_llm():
    """Singleton llama.cpp model loader (GGUF)."""
    global _llm
    if _llm is not None:
        return _llm
    from llama_cpp import Llama
    s = get_settings()
    model_path = os.getenv("LLAMACPP_MODEL_PATH", getattr(s, "LLAMACPP_MODEL_PATH", None))
    if not model_path or not os.path.exists(model_path):
        raise RuntimeError(f"LLAMACPP_MODEL_PATH not found: {model_path}")
    _llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,
        n_gpu_layers=0,  # CPU by default
        verbose=False,
    )
    return _llm
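# Example (deployment-specific, shown for illustration only) of pointing the loader
# above at a local GGUF file before starting the app:
#
#   export LLAMACPP_MODEL_PATH=/path/to/model.gguf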
def _apply_chat_template(messages: List[Dict[str, str]]) -> str:
    parts = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        if role == "system":
            parts.append(f"<|system|>\n{content}\n")
        elif role == "user":
            parts.append(f"<|user|>\n{content}\n")
        else:
            parts.append(f"<|assistant|>\n{content}\n")
    parts.append("<|assistant|>\n")
    return "\n".join(parts)
def _generate(messages: List[Dict[str, str]], temperature=0.3, max_tokens=320) -> str:
    llm = _get_local_llm()
    prompt = _apply_chat_template(messages)
    out = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=["<|user|>", "<|system|>", "<|assistant|>"],
    )
    return (out["choices"][0]["text"] or "").strip()
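# Direct use (illustrative): _generate expects OpenAI-style role/content dicts, e.g.
#
#   _generate([{"role": "system", "content": SYSTEM_PROMPT},
#              {"role": "user", "content": "Are you open right now?"}])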
def respond_chat(
    history: List[Dict[str, str]],
    user_text: str,
    guard_state: Dict[str, Any] | None,
) -> Tuple[str, Dict[str, Any], Dict[str, Any]]:
    """
    LLM-only conversational brain.
    Returns: (assistant_text, new_guard_state, diag)
    guard_state: {"unrelated": int, "ended": int, "limit": int}
    """
    guard = dict(guard_state or {"unrelated": 0, "ended": 0, "limit": 3})
    if guard.get("ended"):
        return "(Conversation ended. Start a new chat for FutureCafe.)", guard, {}
    msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history:
        msgs.extend(history[-10:])
    msgs.append({"role": "user", "content": user_text})
    reply = _generate(msgs)
    # A lightweight off-topic guard without keywords: if the model signals that the
    # conversation should end, we respect it; otherwise the conversation keeps flowing.
    # The 'unrelated' counter stays in guard_state in case you later want to nudge
    # the user based on such signals.
    if "Let’s end" in reply or "Let's end" in reply:
        guard["ended"] = 1
    return reply, guard, {}  # no tool_result/diagnostics needed for this simpler flow
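# Minimal local smoke test (an illustrative sketch, not part of the service flow);
# it assumes a valid GGUF model is reachable via LLAMACPP_MODEL_PATH.
if __name__ == "__main__":
    state = None
    history = []
    for turn in ["What pizzas do you have?", "One large Pepperoni, please."]:
        reply, state, _diag = respond_chat(history, turn, state)
        history.append({"role": "user", "content": turn})
        history.append({"role": "assistant", "content": reply})
        print(f"user:  {turn}\nmarta: {reply}\n")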