# app/gradio_app.py
from __future__ import annotations
from typing import List, Dict, Any, Tuple

import os, time, shutil, uuid
import gradio as gr

try:
    from models.asr_whisper import get_asr
except Exception:
    get_asr = None

try:
    from models.llm_chat import respond_chat as llm_respond_chat
except Exception:
    llm_respond_chat = None
from models.tts_router import (
    tts_synthesize,
    ensure_runtime_audio_dir,
    cleanup_old_audio,
)

# =============================================================================
# Helpers (pure, modular)
# =============================================================================

_FALLBACK_REPLY = (
    "Hello! How can I assist you today? "
    "Would you like to place an order or inquire about the menu?"
)


def _safe_llm_reply(history: List[Dict[str, str]], user_text: str) -> str:
    """
    Try the local LLM. If it is missing or errors, log loudly and return a safe fallback.
    """
    if llm_respond_chat is None:
        print("[LLM] respond_chat not imported; using fallback.")
        return _FALLBACK_REPLY

    try:
        bot_text, _guard, _diag = llm_respond_chat(history or [], user_text, {})
        if isinstance(bot_text, str) and bot_text.strip():
            print("[LLM] returned:", bot_text[:120].replace("\n", " "))
            return bot_text.strip()
        print("[LLM] empty/invalid response; using fallback.")
    except Exception as e:
        import traceback
        print("[LLM] error -> fallback:", repr(e))
        traceback.print_exc()

    return _FALLBACK_REPLY


def _persist_copy(src_path: str) -> str | None:
    """
    Copy a mic recording into runtime/audio with a stable filename.
    Returns the new path, or None if the source file is missing.
    """
    if not (src_path and os.path.exists(src_path)):
        return None
    audio_dir = ensure_runtime_audio_dir()  # tts_router owns the runtime/audio dir
    dst = os.path.join(audio_dir, f"user_{uuid.uuid4().hex}.wav")
    shutil.copyfile(src_path, dst)
    return dst


def _asr_transcribe(aud_path: str) -> str:
    """
    Transcribe audio to text. If ASR is unavailable, return a safe message.
    """
    if not aud_path:
        return "(no audio)"
    if get_asr is None:
        return "(ASR unavailable)"
    try:
        asr = get_asr()
        out = asr.transcribe(aud_path)
        return (out.get("text") or "").strip() or "(no speech detected)"
    except Exception as e:
        print("[ASR] error:", e)
        return "(transcription failed)"


def _tts_from_text(text: str) -> str | None:
    """
    Synthesize assistant text to a WAV in runtime/audio.
    Returns a file path or None.
    """
    if not (text and text.strip()):
        return None
    path = tts_synthesize(text.strip())
    if path and os.path.exists(path):
        return path
    # One minimal fallback attempt so the caller never receives an empty path
    return tts_synthesize("How can I help with FutureCafe?")


def _append_chat(history: List[Dict[str, str]] | None,
                 role: str, content: str) -> List[Dict[str, str]]:
    hist = list(history or [])
    hist.append({"role": role, "content": content})
    return hist


def _startup_clean_runtime_audio():
    """
    On app start, clean previous session audio artifacts.
    """
    audio_dir = ensure_runtime_audio_dir()
    try:
        for name in os.listdir(audio_dir):
            p = os.path.join(audio_dir, name)
            if os.path.isfile(p):
                os.remove(p)
    except Exception as e:
        print("[RUNTIME] Cannot clean runtime/audio:", e)


# =============================================================================
# Voice handlers (modular)
# =============================================================================

def handle_voice_turn(
    user_audio_path: str,
    voice_history: List[Dict[str, str]] | None
) -> Tuple[List[Dict[str, str]], str | None, Dict[str, Any]]:
    """
    Single voice turn:
      1) Transcribe user audio
      2) Ask LLM for a reply (text)
      3) TTS the reply to a WAV
      4) Append both transcript and assistant text to the voice chat history

    Returns: (new_voice_history, assistant_audio_path, diag_json)
    """
    t0 = time.time()

    transcript = _asr_transcribe(user_audio_path)
    hist1 = _append_chat(voice_history, "user", transcript)

    bot_text = _safe_llm_reply(hist1, transcript)
    hist2 = _append_chat(hist1, "assistant", bot_text)

    tts_path = _tts_from_text(bot_text)

    diag = {
        "intent": None,
        "slots": {},
        "tool_selected": None,
        "tool_result": {
            "transcript": transcript,
            "llm_response": bot_text
        },
        "latency_ms": int((time.time() - t0) * 1000),
    }

    return hist2, tts_path, diag
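
# Usage sketch for a single programmatic voice turn (the clip path below is
# hypothetical; any readable WAV works):
#   new_hist, wav_out, diag = handle_voice_turn("runtime/audio/user_clip.wav", [])
#   diag["tool_result"]["transcript"] holds the ASR text for the turn.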


# =============================================================================
# Text handlers (modular)
# =============================================================================

def handle_text_turn(
    user_text: str,
    chat_history: List[Dict[str, str]] | None
) -> Tuple[List[Dict[str, str]], Dict[str, Any], str]:
    """
    Single text turn:
      1) Append user text
      2) Ask LLM for a reply
      3) Append assistant text
      4) Prepare diagnostics
    Returns: (new_chat_history, diag_json, clear_text_value)
    """
    t0 = time.time()
    user_text = (user_text or "").strip()
    if not user_text:
        return (chat_history or []), {"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0}, ""

    hist1 = _append_chat(chat_history, "user", user_text)
    bot_text = _safe_llm_reply(hist1, user_text)
    hist2 = _append_chat(hist1, "assistant", bot_text)

    diag = {
        "intent": None,
        "slots": {},
        "tool_selected": None,
        "tool_result": {"user": user_text, "llm_response": bot_text},
        "latency_ms": int((time.time() - t0) * 1000),
    }

    return hist2, diag, ""
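
# Usage sketch: a text turn needs no audio or UI, so it doubles as a quick
# smoke test (the prompt below is illustrative):
#   hist, diag, cleared = handle_text_turn("Any vegan pizzas?", [])
#   cleared == "" tells the UI to empty the textbox after sending.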


# =============================================================================
# Fixed UI (as requested) + wiring
# =============================================================================

def build_demo():
    """
    Fixed UI layout:
      LEFT (Voice Call):
        - voice_in (mic recorder)
        - assistant_audio (autoplay)
        - voice_chat (transcript chat)
        - call_diag (JSON)
      RIGHT (SMS/Chat):
        - chat_box
        - text_in (enter to send)
        - chat_diag (JSON)
    """
    _startup_clean_runtime_audio()

    with gr.Blocks(title="FutureCafe Call/SMS Agent (MVP)") as demo:
        gr.Markdown("### ☎️ FutureCafe AI Agent (MVP)\n**Call (voice)** on the left · **SMS/Chat** on the right")

        # States
        voice_state = gr.State([])   # list of {"role","content"} for voice transcript chat
        chat_state  = gr.State([])   # list of {"role","content"} for SMS chat

        with gr.Row():
            # ---------------- LEFT: VOICE ----------------
            with gr.Column(scale=1, min_width=430):
                gr.Markdown("#### 📞 Voice Call")
                voice_in = gr.Audio(
                    label="Press Record → Speak → Stop (auto-sends)",
                    sources=["microphone"],
                    type="filepath",
                    format="wav",
                    interactive=True,
                    editable=False,
                    waveform_options={"show_recording_waveform": True},
                )

                assistant_audio = gr.Audio(
                    label="Assistant Response (auto-play)",
                    autoplay=True,
                    type="filepath",
                    interactive=False
                )

                voice_chat = gr.Chatbot(value=[], type="messages", height=220, label="Voice Chat (transcripts)")

                call_diag = gr.JSON(
                    value={"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0},
                    label="Voice Diagnostics"
                )

            # ---------------- RIGHT: SMS / CHAT ----------------
            with gr.Column(scale=1, min_width=430):
                gr.Markdown("#### 💬 SMS / Chat")
                chat_box = gr.Chatbot(value=[], type="messages", height=360, label=None)
                text_in = gr.Textbox(
                    placeholder="Type here… e.g., “Any vegan pizzas?”, “Book a table for 2 at 7.” (Enter to send)",
                    label=None, lines=1
                )
                chat_diag = gr.JSON(
                    value={"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0},
                    label="Chat Diagnostics"
                )

        # ---------- Handlers (thin wrappers that call modular functions) ----------
        def _clear_recorder():
            # Only clears the recorder input; leaves assistant audio + transcripts intact
            return gr.update(value=None, interactive=True)

        def on_voice_change(aud_path: str | None, voice_hist: list[dict] | None):
            import sys

            def log(*a):
                print("[VOICE]", *a, file=sys.stderr, flush=True)

            empty_diag = {"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0}
            if not aud_path:
                # Keep output shapes correct even when the recorder fires empty
                return (voice_hist or []), None, empty_diag

            t0 = time.time()
            # 1) Use the mic clip in place; copying it into runtime/audio via
            #    _persist_copy is optional, since ASR can read the original path.
            stable_user = aud_path
            log("input:", stable_user)

            # 2) ASR (returns a safe placeholder if the model is unavailable)
            transcript = _asr_transcribe(stable_user)

            # 3) LLM: prefer the voice-tuned entry point, fall back to the text one
            try:
                from models.llm_chat import respond_chat_voice
            except Exception:
                from models.llm_chat import respond_chat as respond_chat_voice

            bot_text, _new_policy, policy_diag = respond_chat_voice(voice_hist or [], transcript, {})
            log("transcript:", transcript)
            log("bot_text:", bot_text)

            # 4) TTS
            new_tts = tts_synthesize(bot_text)
            log("tts_out:", new_tts, os.path.exists(new_tts) if new_tts else None)

            # 5) Append both sides of the turn to the voice transcript chat
            new_hist = (voice_hist or []) + [
                {"role": "user", "content": transcript},
                {"role": "assistant", "content": bot_text},
            ]

            diag = {
                "intent": policy_diag.get("policy") if isinstance(policy_diag, dict) else None,
                "slots": {},
                "tool_selected": None,
                "tool_result": {"transcript": transcript, "llm_response": bot_text, "policy": policy_diag},
                "latency_ms": int((time.time() - t0) * 1000),
            }

            # Return exactly what the wired outputs expect (don't clear assistant audio here)
            return new_hist, new_tts, diag

        def on_text_send(txt: str, hist: List[Dict[str, str]]):
            new_hist, diag, clear_text = handle_text_turn(txt, hist or [])
            return new_hist, diag, clear_text

        # ---------- Wiring ----------
        # Voice lane: update (voice_chat, assistant_audio, call_diag); the recorder
        # is cleared afterwards in a .then() step so autoplay is not interrupted.
        # Try to fire on explicit Stop; fall back to generic change if not supported
        rec_event = getattr(voice_in, "stop_recording", None)
        if callable(rec_event):
            rec_event(
                on_voice_change,
                inputs=[voice_in, voice_state],
                outputs=[voice_chat, assistant_audio, call_diag],
                api_name="chat_voice",  # <— add this
            ).then(
                _clear_recorder,          # runs AFTER outputs are set → autoplay isn’t interrupted
                inputs=None,
                outputs=[voice_in],
            )
        else:
            voice_in.change(
                on_voice_change,
                inputs=[voice_in, voice_state],
                outputs=[voice_chat, assistant_audio, call_diag],
                api_name="chat_voice",  # <— add this
            ).then(
                _clear_recorder,
                inputs=None,
                outputs=[voice_in],
            )

        # Keep voice_state in sync with what's shown in voice_chat (unchanged)
        voice_chat.change(lambda x: x, inputs=[voice_chat], outputs=[voice_state])

        # Text lane: Enter to send
        text_in.submit(
            on_text_send,
            inputs=[text_in, chat_state],
            outputs=[chat_box, chat_diag, text_in],
            api_name="chat_text",  # <— add this 
        )
        # Keep chat_state in sync with what's shown in chat_box
        chat_box.change(lambda x: x, inputs=[chat_box], outputs=[chat_state])

    return demo
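

# =============================================================================
# Entry point: minimal launch sketch. Assumption: the module is run directly
# (e.g. `python app/gradio_app.py`); host/port are Gradio defaults.
# =============================================================================
if __name__ == "__main__":
    build_demo().launch()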