Eyob-Sol committed
Commit 9b25064 · verified · 1 Parent(s): 51d45ce

Update app/gradio_app.py

Files changed (1)
  1. app/gradio_app.py +31 -65
app/gradio_app.py CHANGED
@@ -260,90 +260,56 @@ def build_demo():
         return gr.update(value=None, interactive=True)
 
     def on_voice_change(aud_path: str | None, voice_hist: list[dict] | None):
-        """
-        - Copy mic recording into runtime/audio
-        - ASR -> transcript
-        - LLM -> bot text
-        - TTS -> assistant wav in runtime/audio
-        - Cleanup:
-            * delete the user clip immediately after ASR
-            * delete all older TTS, keep only the latest one
-        - Append transcript pairs to voice chat state
-        Returns exactly: (voice_chat_messages, assistant_audio_path, diag_json)
-        """
-        empty_diag = {
-            "intent": None,
-            "slots": {},
-            "tool_selected": None,
-            "tool_result": None,
-            "latency_ms": 0,
-        }
+        import time, sys
+        def log(*a): print("[VOICE]", *a, file=sys.stderr, flush=True)
+
+        empty_diag = {"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0}
         if not aud_path:
-            return (voice_hist or []), None, empty_diag
+            return (voice_hist or []), None, empty_diag  # keep the output shapes correct
 
         t0 = time.time()
-
-        # 1) Stabilize mic path into runtime/audio (ENV-driven via AUDIO_DIR)
-        stable_user = _persist_copy(aud_path)
-
-        # 2) Transcribe
+        # 1) Keep the original mic path; copying into runtime/audio is optional now,
+        #    since the ASR can read the original path directly
+        stable_user = aud_path
+        log("input:", stable_user)
+
+        # 2) ASR
         transcript = "(transcription failed)"
         try:
-            if get_asr is None:
-                transcript = "(ASR unavailable)"
-            else:
-                asr = get_asr()
-                asr_out = asr.transcribe(stable_user)
-                transcript = (asr_out.get("text") or "").strip() or "(no speech detected)"
+            asr = get_asr()
+            out = asr.transcribe(stable_user)
+            transcript = (out.get("text") or "").strip() or "(no speech detected)"
         except Exception as e:
-            print("[ASR] error:", e)
-        finally:
-            # Remove the user clip ASAP to keep the folder small
-            if stable_user and os.path.exists(stable_user):
-                try:
-                    os.remove(stable_user)
-                except Exception as e:
-                    print("[CLEANUP] Could not delete user clip:", e)
-
-        # 3) Get bot reply (LLM response) – env/model-path handled inside models/llm_chat
+            log("ASR error:", e)
+
+        # 3) LLM
         try:
             from models.llm_chat import respond_chat_voice
         except Exception:
             from models.llm_chat import respond_chat as respond_chat_voice
 
-        try:
-            bot_text, new_policy, policy_diag = respond_chat_voice(voice_hist or [], transcript, {})
-        except Exception as e:
-            print("[LLM] voice fallback due to error:", e)
-            bot_text, new_policy, policy_diag = (
-                "Hello! How can I help with FutureCafe today?",
-                {},
-                {},
-            )
-
-        # 4) TTS the bot reply into runtime/audio (ENV-driven path via tts_router)
-        new_tts = tts_synthesize(bot_text)  # path in VOICE_AUDIO_DIR
-        cleanup_old_audio(keep_latest=new_tts)
-
-        # 5) Append to voice chat state (text transcripts)
+        bot_text, new_policy, policy_diag = respond_chat_voice(voice_hist or [], transcript, {})
+        log("transcript:", transcript)
+        log("bot_text:", bot_text)
+
+        # 4) TTS
+        new_tts = tts_synthesize(bot_text)
+        log("tts_out:", new_tts, os.path.exists(new_tts) if new_tts else None)
+
+        # 5) Update the voice transcript chat (text)
         new_hist = (voice_hist or []) + [
             {"role": "user", "content": transcript},
             {"role": "assistant", "content": bot_text},
         ]
 
         diag = {
             "intent": policy_diag.get("policy") if isinstance(policy_diag, dict) else None,
             "slots": {},
             "tool_selected": None,
-            "tool_result": {
-                "transcript": transcript,
-                "llm_response": bot_text,
-                "policy": policy_diag,
-            },
+            "tool_result": {"transcript": transcript, "llm_response": bot_text, "policy": policy_diag},
             "latency_ms": int((time.time() - t0) * 1000),
         }
 
-        # Return EXACTLY the 3 outputs you wired: (voice_chat, assistant_audio, call_diag)
+        # Return exactly what the wired outputs expect (don't clear the assistant audio here)
         return new_hist, new_tts, diag
 
     def on_text_send(txt: str, hist: List[Dict[str, str]]):
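
For context on the dropped step 1: _persist_copy copied the temporary mic clip into runtime/audio before transcription, so it would outlive Gradio's temp-file cleanup. The repo's implementation is not shown in this diff; a minimal sketch under the assumption of an env-driven AUDIO_DIR (the directory default and naming scheme below are assumptions, not the actual code) might look like:

    import os, shutil, time, uuid

    AUDIO_DIR = os.environ.get("AUDIO_DIR", "runtime/audio")  # assumed env-driven location

    def _persist_copy(src_path: str) -> str:
        # Copy a temporary mic recording into AUDIO_DIR so it outlives Gradio's temp files.
        os.makedirs(AUDIO_DIR, exist_ok=True)
        ext = os.path.splitext(src_path)[1] or ".wav"
        dst = os.path.join(AUDIO_DIR, f"user_{int(time.time())}_{uuid.uuid4().hex[:8]}{ext}")
        shutil.copy2(src_path, dst)
        return dst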
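The commit also removes the cleanup_old_audio(keep_latest=new_tts) call, which pruned every earlier TTS file after each turn. A self-contained sketch of such a helper, again under the assumed AUDIO_DIR layout with flat .wav files:

    import os

    AUDIO_DIR = os.environ.get("AUDIO_DIR", "runtime/audio")  # assumed env-driven location

    def cleanup_old_audio(keep_latest: str | None = None) -> None:
        # Delete every .wav in AUDIO_DIR except the newest TTS output.
        if not os.path.isdir(AUDIO_DIR):
            return
        keep = os.path.abspath(keep_latest) if keep_latest else None
        for name in os.listdir(AUDIO_DIR):
            if not name.endswith(".wav"):
                continue
            path = os.path.abspath(os.path.join(AUDIO_DIR, name))
            if path == keep:
                continue
            try:
                os.remove(path)
            except OSError as e:
                print("[CLEANUP] Could not delete:", path, e)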
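Finally, the handler's three-tuple return (new_hist, new_tts, diag) has to line up one-for-one with the outputs wired to the event. A minimal wiring sketch, assuming Gradio 4.x; the component names are illustrative and not taken from gradio_app.py:

    # Hypothetical wiring inside build_demo(); names are placeholders, not the app's actual ones.
    mic = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
    voice_chat = gr.Chatbot(type="messages", label="Voice transcript")
    assistant_audio = gr.Audio(label="Assistant reply", autoplay=True)
    call_diag = gr.JSON(label="Diagnostics")

    # on_voice_change returns exactly (new_hist, new_tts, diag): one value per output.
    mic.change(
        on_voice_change,
        inputs=[mic, voice_chat],
        outputs=[voice_chat, assistant_audio, call_diag],
    )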