# models/llm_chat.py
from __future__ import annotations

from typing import List, Dict, Any, Tuple
import os

from utils.config import get_settings

# --- Small, readable menu JSON kept in the system prompt for now ---
MENU_JSON = """
{
  "pizzas": [
    {"name": "Margherita Pizza", "sizes": ["small", "medium", "large"],
     "price": {"small": 8.5, "medium": 11.0, "large": 13.5}},
    {"name": "Pepperoni Pizza", "sizes": ["small", "medium", "large"],
     "price": {"small": 9.5, "medium": 12.0, "large": 14.5}}
  ],
  "salads": [
    {"name": "House Salad", "sizes": ["regular"], "price": {"regular": 6.0}}
  ],
  "drinks": [
    {"name": "Cola", "sizes": ["can"], "price": {"can": 2.0}}
  ],
  "hours": "11:00–22:00 daily",
  "address": "123 Main St",
  "phone": "+1 (555) 010-0000"
}
"""

SYSTEM_PROMPT = f"""You are Marta, the AI call/SMS assistant for FutureCafe.

You talk naturally and help with:
- Menu questions, placing orders, hours/location, and reservations (lightweight).
- If the user asks for pizza/order: list choices from the MENU and ask for missing details (size, quantity, etc.).
- If the user provides all details, confirm the order in words (no need to return JSON) and include a brief total using MENU prices.
- For hours/location, reply from the MENU.
- For unrelated topics, gently steer back to FutureCafe; if the user remains off-topic for 3 turns total, politely end.
- Keep replies concise and friendly. No long explanations.

MENU (JSON you can read from for options & prices):
{MENU_JSON}
"""

# ---------------- llama.cpp singleton ----------------
_llm = None


def _get_local_llm():
    """Singleton llama.cpp model loader (GGUF)."""
    global _llm
    if _llm is not None:
        return _llm

    from llama_cpp import Llama

    s = get_settings()
    model_path = os.getenv("LLAMACPP_MODEL_PATH", getattr(s, "LLAMACPP_MODEL_PATH", None))
    if not model_path or not os.path.exists(model_path):
        raise RuntimeError(f"LLAMACPP_MODEL_PATH not found: {model_path}")

    _llm = Llama(
        model_path=model_path,
        n_ctx=2048,
        n_threads=os.cpu_count() or 4,
        n_gpu_layers=0,  # CPU by default
        verbose=False,
    )
    return _llm


def _apply_chat_template(messages: List[Dict[str, str]]) -> str:
    """Render role-tagged messages into a single prompt string, ending with an open assistant turn."""
    parts = []
    for m in messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        if role == "system":
            parts.append(f"<|system|>\n{content}\n")
        elif role == "user":
            parts.append(f"<|user|>\n{content}\n")
        else:
            parts.append(f"<|assistant|>\n{content}\n")
    parts.append("<|assistant|>\n")
    return "\n".join(parts)


def _generate(messages: List[Dict[str, str]], temperature=0.3, max_tokens=320) -> str:
    llm = _get_local_llm()
    prompt = _apply_chat_template(messages)
    out = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=0.9,
        repeat_penalty=1.1,
        stop=["<|user|>", "<|system|>", "<|assistant|>"],
    )
    return (out["choices"][0]["text"] or "").strip()


def respond_chat(
    history: List[Dict[str, str]],
    user_text: str,
    guard_state: Dict[str, Any] | None,
) -> Tuple[str, Dict[str, Any], Dict[str, Any]]:
    """
    LLM-only conversational brain.

    Returns: (assistant_text, new_guard_state, diag)
    guard_state: {"unrelated": int, "ended": int, "limit": int}
    """
    guard = dict(guard_state or {"unrelated": 0, "ended": 0, "limit": 3})
    if guard.get("ended"):
        return "(Conversation ended. Start a new chat for FutureCafe.)", guard, {}

    msgs: List[Dict[str, str]] = [{"role": "system", "content": SYSTEM_PROMPT}]
    if history:
        msgs.extend(history[-10:])
    msgs.append({"role": "user", "content": user_text})

    reply = _generate(msgs)

    # A super-light off-topic guard without keywords: if the model signals ending, we respect it.
    # Otherwise, keep the conversation flowing; we do not hard-code keywords or intents here.
    # (We still carry the 'unrelated' counter so a later version can nudge based on signals.)
    if "Let’s end" in reply or "Let's end" in reply:
        guard["ended"] = 1

    return reply, guard, {}  # no tool_result/diagnostics needed for this simpler flow
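# --- Usage sketch (illustrative addition, not part of the original module) -------------
# A minimal example of driving respond_chat() from a script or REPL, assuming
# LLAMACPP_MODEL_PATH points at a valid GGUF model and utils.config.get_settings()
# is importable in the current environment. The sample turns below are hypothetical.
if __name__ == "__main__":
    guard: Dict[str, Any] | None = None   # respond_chat() seeds default counters on the first call
    history: List[Dict[str, str]] = []    # running chat history of {"role", "content"} dicts

    for turn in ["What pizzas do you have?", "A large Pepperoni, please."]:
        reply, guard, _diag = respond_chat(history, turn, guard)
        print(f"user: {turn}\nmarta: {reply}\n")
        # Keep the history in the same role/content shape the chat template expects.
        history.append({"role": "user", "content": turn})
        history.append({"role": "assistant", "content": reply})
        if guard.get("ended"):
            break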