Update app/gradio_app.py

app/gradio_app.py  +31 -65
@@ -260,90 +260,56 @@ def build_demo():
         return gr.update(value=None, interactive=True)

     def on_voice_change(aud_path: str | None, voice_hist: list[dict] | None):
-        """
-
-
-
-        - TTS -> assistant wav in runtime/audio
-        - Cleanup:
-          * delete the user clip immediately after ASR
-          * delete all older TTS, keep only latest one
-        - Append transcript pairs to voice chat state
-        Returns exactly: (voice_chat_messages, assistant_audio_path, diag_json)
-        """
-        empty_diag = {
-            "intent": None,
-            "slots": {},
-            "tool_selected": None,
-            "tool_result": None,
-            "latency_ms": 0,
-        }
+        import time, sys
+        def log(*a): print("[VOICE]", *a, file=sys.stderr, flush=True)
+
+        empty_diag = {"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0}
         if not aud_path:
-            return (voice_hist or []), None, empty_diag
-
+            return (voice_hist or []), None, empty_diag  # <= keep shapes correct
+
         t0 = time.time()
-
-        #
-        stable_user
-
-        # 2)
+        # 1) Copy mic clip into runtime/audio (stable path) – optional now; your ASR can read original path
+        stable_user = aud_path  # you can keep original if you like; not required for ASR
+        log("input:", stable_user)
+
+        # 2) ASR
         transcript = "(transcription failed)"
         try:
-
-
-
-            asr = get_asr()
-            asr_out = asr.transcribe(stable_user)
-            transcript = (asr_out.get("text") or "").strip() or "(no speech detected)"
+            asr = get_asr()
+            out = asr.transcribe(stable_user)
+            transcript = (out.get("text") or "").strip() or "(no speech detected)"
         except Exception as e:
-
-
-
-        if stable_user and os.path.exists(stable_user):
-            try:
-                os.remove(stable_user)
-            except Exception as e:
-                print("[CLEANUP] Could not delete user clip:", e)
-
-        # 3) Get bot reply (LLM response) – env/model-path handled inside models/llm_chat
+            log("ASR error:", e)
+
+        # 3) LLM
         try:
             from models.llm_chat import respond_chat_voice
         except Exception:
             from models.llm_chat import respond_chat as respond_chat_voice
-
-
-
-
-
-
-
-
-
-
-
-        # 4) TTS the bot reply into runtime/audio (ENV-driven path via tts_router)
-        new_tts = tts_synthesize(bot_text)  # path in VOICE_AUDIO_DIR
-        cleanup_old_audio(keep_latest=new_tts)
-
-        # 5) Append to voice chat state (text transcripts)
+
+        bot_text, new_policy, policy_diag = respond_chat_voice(voice_hist or [], transcript, {})
+        log("transcript:", transcript)
+        log("bot_text:", bot_text)
+
+        # 4) TTS
+        new_tts = tts_synthesize(bot_text)
+        log("tts_out:", new_tts, os.path.exists(new_tts) if new_tts else None)
+
+        # 5) Update voice transcript chat (text)
         new_hist = (voice_hist or []) + [
             {"role": "user", "content": transcript},
             {"role": "assistant", "content": bot_text},
         ]
-
+
         diag = {
             "intent": policy_diag.get("policy") if isinstance(policy_diag, dict) else None,
             "slots": {},
             "tool_selected": None,
-            "tool_result": {
-                "transcript": transcript,
-                "llm_response": bot_text,
-                "policy": policy_diag,
-            },
+            "tool_result": {"transcript": transcript, "llm_response": bot_text, "policy": policy_diag},
             "latency_ms": int((time.time() - t0) * 1000),
         }
-
-        # Return
+
+        # Return exactly what your outputs expect (don’t clear assistant audio here)
         return new_hist, new_tts, diag

     def on_text_send(txt: str, hist: List[Dict[str, str]]):
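
The handler's three return values have to line up one-for-one with the event wiring, which is why the new code always returns (messages, audio_path, diag_dict), even on the early-exit path. A minimal sketch of that wiring, assuming hypothetical component names (mic_in, voice_chat, assistant_audio, diag_json) and that on_voice_change from the diff above is in scope; the actual layout in app/gradio_app.py may differ:

# Hypothetical wiring sketch -- component names are assumptions, not the
# real ones in app/gradio_app.py.
import gradio as gr

with gr.Blocks() as demo:
    voice_chat = gr.Chatbot(type="messages", label="Voice transcript")
    mic_in = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
    assistant_audio = gr.Audio(label="Assistant reply", autoplay=True)
    diag_json = gr.JSON(label="Diagnostics")

    # on_voice_change returns exactly (messages, audio_path, diag_dict),
    # so outputs must list these three components in the same order.
    mic_in.change(
        on_voice_change,
        inputs=[mic_in, voice_chat],
        outputs=[voice_chat, assistant_audio, diag_json],
    )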
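The commit also drops the cleanup_old_audio(keep_latest=new_tts) call, so older TTS wavs now accumulate until something else prunes them. If the old "keep only the latest" behavior is wanted back, a minimal sketch of such a helper, assuming generated files land in a flat directory named by a VOICE_AUDIO_DIR env var defaulting to runtime/audio (per the removed docstring); the repo's real helper may differ:

# Hypothetical sketch of the pruning helper the old code invoked as
# cleanup_old_audio(keep_latest=new_tts); the actual implementation may differ.
import os

VOICE_AUDIO_DIR = os.environ.get("VOICE_AUDIO_DIR", "runtime/audio")

def cleanup_old_audio(keep_latest: str | None = None) -> None:
    """Delete every .wav in VOICE_AUDIO_DIR except the newest TTS output."""
    if not os.path.isdir(VOICE_AUDIO_DIR):
        return
    keep = os.path.abspath(keep_latest) if keep_latest else None
    for name in os.listdir(VOICE_AUDIO_DIR):
        path = os.path.abspath(os.path.join(VOICE_AUDIO_DIR, name))
        if not name.endswith(".wav") or path == keep:
            continue
        try:
            os.remove(path)
        except OSError as e:
            print("[CLEANUP] Could not delete old TTS:", path, e)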