Update app/gradio_app.py

app/gradio_app.py  +31 -65
@@ -260,90 +260,56 @@ def build_demo():
         return gr.update(value=None, interactive=True)

     def on_voice_change(aud_path: str | None, voice_hist: list[dict] | None):
-        """
-
-
-
-        - TTS -> assistant wav in runtime/audio
-        - Cleanup:
-          * delete the user clip immediately after ASR
-          * delete all older TTS, keep only latest one
-        - Append transcript pairs to voice chat state
-        Returns exactly: (voice_chat_messages, assistant_audio_path, diag_json)
-        """
-        empty_diag = {
-            "intent": None,
-            "slots": {},
-            "tool_selected": None,
-            "tool_result": None,
-            "latency_ms": 0,
-        }
+        import time, sys
+        def log(*a): print("[VOICE]", *a, file=sys.stderr, flush=True)
+
+        empty_diag = {"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0}
         if not aud_path:
-            return (voice_hist or []), None, empty_diag
-
+            return (voice_hist or []), None, empty_diag  # <= keep shapes correct
+
         t0 = time.time()
-
-        #
-        stable_user
-
-        # 2)
+        # 1) Copy mic clip into runtime/audio (stable path) – optional now; your ASR can read original path
+        stable_user = aud_path  # you can keep original if you like; not required for ASR
+        log("input:", stable_user)
+
+        # 2) ASR
         transcript = "(transcription failed)"
         try:
-
-
-
-            asr = get_asr()
-            asr_out = asr.transcribe(stable_user)
-            transcript = (asr_out.get("text") or "").strip() or "(no speech detected)"
+            asr = get_asr()
+            out = asr.transcribe(stable_user)
+            transcript = (out.get("text") or "").strip() or "(no speech detected)"
         except Exception as e:
-
-
-
-        if stable_user and os.path.exists(stable_user):
-            try:
-                os.remove(stable_user)
-            except Exception as e:
-                print("[CLEANUP] Could not delete user clip:", e)
-
-        # 3) Get bot reply (LLM response) – env/model-path handled inside models/llm_chat
+            log("ASR error:", e)
+
+        # 3) LLM
         try:
             from models.llm_chat import respond_chat_voice
         except Exception:
             from models.llm_chat import respond_chat as respond_chat_voice
-
-
-
-
-
-
-
-
-
-
-
-        # 4) TTS the bot reply into runtime/audio (ENV-driven path via tts_router)
-        new_tts = tts_synthesize(bot_text)  # path in VOICE_AUDIO_DIR
-        cleanup_old_audio(keep_latest=new_tts)
-
-        # 5) Append to voice chat state (text transcripts)
+
+        bot_text, new_policy, policy_diag = respond_chat_voice(voice_hist or [], transcript, {})
+        log("transcript:", transcript)
+        log("bot_text:", bot_text)
+
+        # 4) TTS
+        new_tts = tts_synthesize(bot_text)
+        log("tts_out:", new_tts, os.path.exists(new_tts) if new_tts else None)
+
+        # 5) Update voice transcript chat (text)
         new_hist = (voice_hist or []) + [
             {"role": "user", "content": transcript},
             {"role": "assistant", "content": bot_text},
         ]
-
+
         diag = {
             "intent": policy_diag.get("policy") if isinstance(policy_diag, dict) else None,
             "slots": {},
             "tool_selected": None,
-            "tool_result": {
-                "transcript": transcript,
-                "llm_response": bot_text,
-                "policy": policy_diag,
-            },
+            "tool_result": {"transcript": transcript, "llm_response": bot_text, "policy": policy_diag},
             "latency_ms": int((time.time() - t0) * 1000),
         }
-
-        # Return
+
+        # Return exactly what your outputs expect (don’t clear assistant audio here)
         return new_hist, new_tts, diag

     def on_text_send(txt: str, hist: List[Dict[str, str]]):
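
The handler's three return values have to line up one-for-one with the event wiring, which is why the new code always returns (messages, audio_path, diag_dict), even on the early-exit path. A minimal sketch of that wiring, assuming hypothetical component names (mic_in, voice_chat, assistant_audio, diag_json) and that on_voice_change from the diff above is in scope; the actual layout in app/gradio_app.py may differ:

# Hypothetical wiring sketch -- component names are assumptions, not the
# real ones in app/gradio_app.py.
import gradio as gr

with gr.Blocks() as demo:
    voice_chat = gr.Chatbot(type="messages", label="Voice transcript")
    mic_in = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
    assistant_audio = gr.Audio(label="Assistant reply", autoplay=True)
    diag_json = gr.JSON(label="Diagnostics")

    # on_voice_change returns exactly (messages, audio_path, diag_dict),
    # so outputs must list these three components in the same order.
    mic_in.change(
        on_voice_change,
        inputs=[mic_in, voice_chat],
        outputs=[voice_chat, assistant_audio, diag_json],
    )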
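The commit also drops the cleanup_old_audio(keep_latest=new_tts) call, so older TTS wavs now accumulate until something else prunes them. If the old "keep only the latest" behavior is wanted back, a minimal sketch of such a helper, assuming generated files land in a flat directory named by a VOICE_AUDIO_DIR env var defaulting to runtime/audio (per the removed docstring); the repo's real helper may differ:

# Hypothetical sketch of the pruning helper the old code invoked as
# cleanup_old_audio(keep_latest=new_tts); the actual implementation may differ.
import os

VOICE_AUDIO_DIR = os.environ.get("VOICE_AUDIO_DIR", "runtime/audio")

def cleanup_old_audio(keep_latest: str | None = None) -> None:
    """Delete every .wav in VOICE_AUDIO_DIR except the newest TTS output."""
    if not os.path.isdir(VOICE_AUDIO_DIR):
        return
    keep = os.path.abspath(keep_latest) if keep_latest else None
    for name in os.listdir(VOICE_AUDIO_DIR):
        path = os.path.abspath(os.path.join(VOICE_AUDIO_DIR, name))
        if not name.endswith(".wav") or path == keep:
            continue
        try:
            os.remove(path)
        except OSError as e:
            print("[CLEANUP] Could not delete old TTS:", path, e)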