# app/gradio_app.py
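"""
FutureCafe call/SMS agent (MVP): a two-lane Gradio UI.

The left lane handles voice turns (mic recording -> ASR -> LLM -> TTS with
autoplay); the right lane handles text turns (textbox -> LLM). Each lane keeps
its chat history in gr.State and reports per-turn diagnostics as JSON.
"""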
from __future__ import annotations

import os
import time
from typing import List, Dict, Any, Tuple

import gradio as gr

# ---- External modules we rely on (light, stable) ----
# - ASR: the existing faster-whisper wrapper
# - TTS: local Piper (or the system `say` fallback) via models/tts_router.py
# - LLM: optional local model; if it's missing, we fall back to a safe canned reply
try:
    from models.asr_whisper import get_asr
except Exception:
    get_asr = None

try:
    from models.llm_chat import respond_chat as llm_respond_chat
except Exception:
    llm_respond_chat = None

from models.tts_router import tts_synthesize, ensure_runtime_audio_dir


# =============================================================================
# Helpers (pure, modular)
# =============================================================================

def _safe_llm_reply(history: List[Dict[str, str]], user_text: str) -> str:
    """
    Ask the chat LLM for a response. If it's not available, use a reasonable fallback.
    """
    if llm_respond_chat is not None:
        try:
            # policy guard is optional; pass an empty dict
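            # assumed return contract: (reply_text, guard_flags, diagnostics)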
            bot_text, _guard, _diag = llm_respond_chat(history or [], user_text, {})
            if isinstance(bot_text, str) and bot_text.strip():
                return bot_text.strip()
        except Exception as e:
            print("[LLM] fallback due to error:", e)
    # Fallback (LLM unavailable or failed)
    return "Hello! How can I assist you today? Would you like to place an order or inquire about the menu?"


def _asr_transcribe(aud_path: str) -> str:
    """
    Transcribe audio to text. If ASR is unavailable, return a safe message.
    """
    if not aud_path:
        return "(no audio)"
    if get_asr is None:
        return "(ASR unavailable)"
    try:
        asr = get_asr()
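        # transcribe() is assumed to return a dict with a "text" key
        # (consistent with a thin faster-whisper wrapper)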
        out = asr.transcribe(aud_path)
        return (out.get("text") or "").strip() or "(no speech detected)"
    except Exception as e:
        print("[ASR] error:", e)
        return "(transcription failed)"


def _tts_from_text(text: str) -> str | None:
    """
    Synthesize assistant text to a WAV in runtime/audio.
    Returns a file path or None.
    """
    if not (text and text.strip()):
        return None
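    # tts_synthesize is expected to return a filesystem path to the rendered
    # WAV (falsy on failure), so verify the file actually exists before use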
    path = tts_synthesize(text.strip())
    if path and os.path.exists(path):
        return path
    # Last-ditch fallback: retry once with a minimal prompt rather than
    # returning a path to a file that was never written
    return tts_synthesize("How can I help with FutureCafe?")


def _append_chat(history: List[Dict[str, str]] | None,
                 role: str, content: str) -> List[Dict[str, str]]:
    hist = list(history or [])
    hist.append({"role": role, "content": content})
    return hist


def _startup_clean_runtime_audio():
    """
    On app start, clean previous session audio artifacts.
    """
    audio_dir = ensure_runtime_audio_dir()
    try:
        for name in os.listdir(audio_dir):
            p = os.path.join(audio_dir, name)
            if os.path.isfile(p):
                os.remove(p)
    except Exception as e:
        print("[RUNTIME] Cannot clean runtime/audio:", e)


# =============================================================================
# Voice handlers (modular)
# =============================================================================

def handle_voice_turn(
    user_audio_path: str,
    voice_history: List[Dict[str, str]] | None
) -> Tuple[List[Dict[str, str]], str | None, Dict[str, Any]]:
    """
    Single voice turn:
      1) Transcribe user audio
      2) Ask LLM for a reply (text)
      3) TTS the reply to a WAV
      4) Append both transcript and assistant text to the voice chat history

    Returns: (new_voice_history, assistant_audio_path, diag_json)
    """
    t0 = time.time()

    transcript = _asr_transcribe(user_audio_path)
    hist1 = _append_chat(voice_history, "user", transcript)

    bot_text = _safe_llm_reply(hist1, transcript)
    hist2 = _append_chat(hist1, "assistant", bot_text)

    tts_path = _tts_from_text(bot_text)

    diag = {
        "intent": None,
        "slots": {},
        "tool_selected": None,
        "tool_result": {
            "transcript": transcript,
            "llm_response": bot_text
        },
        "latency_ms": int((time.time() - t0) * 1000),
    }

    return hist2, tts_path, diag


# =============================================================================
# Text handlers (modular)
# =============================================================================

def handle_text_turn(
    user_text: str,
    chat_history: List[Dict[str, str]] | None
) -> Tuple[List[Dict[str, str]], Dict[str, Any], str]:
    """
    Single text turn:
      1) Append user text
      2) Ask LLM for a reply
      3) Append assistant text
      4) Prepare diagnostics
    Returns: (new_chat_history, diag_json, clear_text_value)
    """
    t0 = time.time()
    user_text = (user_text or "").strip()
    if not user_text:
        empty_diag = {"intent": None, "slots": {}, "tool_selected": None,
                      "tool_result": None, "latency_ms": 0}
        return (chat_history or []), empty_diag, ""

    hist1 = _append_chat(chat_history, "user", user_text)
    bot_text = _safe_llm_reply(hist1, user_text)
    hist2 = _append_chat(hist1, "assistant", bot_text)

    diag = {
        "intent": None,
        "slots": {},
        "tool_selected": None,
        "tool_result": {"user": user_text, "llm_response": bot_text},
        "latency_ms": int((time.time() - t0) * 1000),
    }

    return hist2, diag, ""


# =============================================================================
# Fixed UI (as requested) + wiring
# =============================================================================

def build_demo():
    """
    Fixed UI layout:
      LEFT (Voice Call):
        - voice_in (mic recorder)
        - assistant_audio (autoplay)
        - voice_chat (transcript chat)
        - call_diag (JSON)
      RIGHT (SMS/Chat):
        - chat_box
        - text_in (enter to send)
        - chat_diag (JSON)
    """
    _startup_clean_runtime_audio()

    with gr.Blocks(title="FutureCafe Call/SMS Agent (MVP)") as demo:
        gr.Markdown("### ☎️ FutureCafe AI Agent (MVP)\n**Call (voice)** on the left · **SMS/Chat** on the right")

        # States
        voice_state = gr.State([])   # list of {"role","content"} for voice transcript chat
        chat_state  = gr.State([])   # list of {"role","content"} for SMS chat

        with gr.Row():
            # ---------------- LEFT: VOICE ----------------
            with gr.Column(scale=1, min_width=430):
                gr.Markdown("#### 📞 Voice Call")
                voice_in = gr.Audio(
                    label="Press Record → Speak → Stop (auto-sends)",
                    sources=["microphone"],
                    type="filepath",
                    format="wav",
                    interactive=True,
                    editable=False,
                    waveform_options={"show_recording_waveform": True},
                )

                assistant_audio = gr.Audio(
                    label="Assistant Response (auto-play)",
                    autoplay=True,
                    type="filepath",
                    interactive=False
                )

                voice_chat = gr.Chatbot(value=[], type="messages", height=220, label="Voice Chat (transcripts)")

                call_diag = gr.JSON(
                    value={"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0},
                    label="Voice Diagnostics"
                )

            # ---------------- RIGHT: SMS / CHAT ----------------
            with gr.Column(scale=1, min_width=430):
                gr.Markdown("#### 💬 SMS / Chat")
                chat_box = gr.Chatbot(value=[], type="messages", height=360, label=None)
                text_in = gr.Textbox(
                    placeholder="Type here… e.g., “Any vegan pizzas?”, “Book a table for 2 at 7.” (Enter to send)",
                    label=None, lines=1
                )
                chat_diag = gr.JSON(
                    value={"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0},
                    label="Chat Diagnostics"
                )

        # ---------- Handlers (thin wrappers that call modular functions) ----------
        def _clear_recorder():
            # Only clears the recorder input; leaves assistant audio + transcripts intact
            return gr.update(value=None, interactive=True)

        def on_voice_change(aud_path: str | None, vhist: List[Dict[str, str]]):
            if not aud_path:
                # no audio; keep everything as-is
                return vhist or [], None, {"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0}

            new_vhist, tts_path, diag = handle_voice_turn(aud_path, vhist or [])
            return new_vhist, tts_path, diag

        def on_text_send(txt: str, hist: List[Dict[str, str]]):
            new_hist, diag, clear_text = handle_text_turn(txt, hist or [])
            return new_hist, diag, clear_text

        # ---------- Wiring ----------
        # Voice lane: update (voice_chat, assistant_audio, call_diag), then clear the
        # recorder in a follow-up step so the response's autoplay isn't interrupted
        # Try to fire on explicit Stop; fall back to generic change if not supported
        rec_event = getattr(voice_in, "stop_recording", None)
        if callable(rec_event):
            rec_event(
                on_voice_change,
                inputs=[voice_in, voice_state],
                outputs=[voice_chat, assistant_audio, call_diag],
            ).then(
                _clear_recorder,          # runs AFTER outputs are set → autoplay isn’t interrupted
                inputs=None,
                outputs=[voice_in],
            )
        else:
            voice_in.change(
                on_voice_change,
                inputs=[voice_in, voice_state],
                outputs=[voice_chat, assistant_audio, call_diag],
            ).then(
                _clear_recorder,
                inputs=None,
                outputs=[voice_in],
            )

        # Keep voice_state in sync with what's shown in voice_chat (unchanged)
        voice_chat.change(lambda x: x, inputs=[voice_chat], outputs=[voice_state])

        # Text lane: Enter to send
        text_in.submit(
            on_text_send,
            inputs=[text_in, chat_state],
            outputs=[chat_box, chat_diag, text_in],
        )
        # Keep chat_state in sync with what's shown in chat_box
        chat_box.change(lambda x: x, inputs=[chat_box], outputs=[chat_state])

    return demo
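

# =============================================================================
# Local entrypoint
# =============================================================================

# Minimal launch sketch; an assumption, since this module may instead be
# imported and served by a wrapper. Uses Gradio's default host/port.
if __name__ == "__main__":
    build_demo().launch()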