Eyob-Sol committed on
Commit 3328f65 · verified · 1 Parent(s): 9ba120a

Upload 18 files

.gitattributes CHANGED
@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 runtime/audio/tts_3bac9b920ffa4a6a93a9eed5ca215bea.wav filter=lfs diff=lfs merge=lfs -text
 runtime/audio/tts_fc786b49aad940e4992413247701abf3.wav filter=lfs diff=lfs merge=lfs -text
+runtime/audio/tts_4056705ada224a0092325b697c975501.wav filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,4 +1,7 @@
 from app.gradio_app import build_demo
+# in app.py or when building the demo
+from models.tts_router import cleanup_old_audio
+cleanup_old_audio(keep_latest=None) # removes all existing tts_*.wav on boot

 def main():
     demo = build_demo()
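
Note: `cleanup_old_audio` is imported from `models/tts_router.py`, which is not among the files shown in this diff. Given the two call sites in this commit (`keep_latest=None` at boot, `keep_latest=new_tts` after synthesis in `app/gradio_app.py`), a minimal sketch of what it might look like (an assumption, not the repo's actual implementation):

# models/tts_router.py -- sketch only; the real implementation is not part of this diff
import glob
import os

AUDIO_DIR = os.path.join("runtime", "audio")

def cleanup_old_audio(keep_latest: str | None = None) -> None:
    """Delete tts_*.wav files under AUDIO_DIR, optionally sparing one path."""
    keep = os.path.abspath(keep_latest) if keep_latest else None
    for path in glob.glob(os.path.join(AUDIO_DIR, "tts_*.wav")):
        if keep and os.path.abspath(path) == keep:
            continue  # keep the most recent synthesis result
        try:
            os.remove(path)
        except OSError as exc:
            print("[CLEANUP] Could not delete", path, ":", exc)

With `keep_latest=None` every `tts_*.wav` is removed; with a path, everything except that file is removed.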
app/__pycache__/gradio_app.cpython-312.pyc CHANGED
Binary files a/app/__pycache__/gradio_app.cpython-312.pyc and b/app/__pycache__/gradio_app.cpython-312.pyc differ
 
app/gradio_app.py CHANGED
@@ -24,7 +24,8 @@ except Exception:
     llm_respond_chat = None

 from models.tts_router import tts_synthesize, ensure_runtime_audio_dir
-
+from models.tts_router import tts_synthesize, cleanup_old_audio, AUDIO_DIR
+import shutil, uuid, os

 # =============================================================================
 # Helpers (pure, modular)
@@ -46,6 +47,15 @@ def _safe_llm_reply(history: List[Dict[str, str]], user_text: str) -> str:
     return "Hello! How can I assist you today? Would you like to place an order or inquire about the menu?"


+def _persist_copy(src_path: str) -> str | None:
+    """Copy mic recording into runtime/audio with a stable filename, returns the new path."""
+    if not (src_path and os.path.exists(src_path)):
+        return None
+    os.makedirs(AUDIO_DIR, exist_ok=True)
+    dst = os.path.join(AUDIO_DIR, f"user_{uuid.uuid4().hex}.wav")
+    shutil.copyfile(src_path, dst)
+    return dst
+
 def _asr_transcribe(aud_path: str) -> str:
     """
     Transcribe audio to text. If ASR is unavailable, return a safe message.
@@ -247,13 +257,85 @@ def build_demo():
         # Only clears the recorder input; leaves assistant audio + transcripts intact
         return gr.update(value=None, interactive=True)

-    def on_voice_change(aud_path: str | None, vhist: List[Dict[str, str]]):
+    def on_voice_change(aud_path: str | None, voice_hist: list[dict] | None):
+        """
+        - Copy mic recording into runtime/audio
+        - ASR -> transcript
+        - LLM -> bot text
+        - TTS -> assistant wav in runtime/audio
+        - Cleanup:
+            * delete the user clip immediately after ASR
+            * delete all older TTS, keep only latest one
+        - Append transcript pairs to voice chat state
+        """
+        import time
+        empty_diag = {
+            "intent": None,
+            "slots": {},
+            "tool_selected": None,
+            "tool_result": None,
+            "latency_ms": 0,
+        }
         if not aud_path:
-            # no audio; keep everything as-is
-            return vhist or [], None, {"intent": None, "slots": {}, "tool_selected": None, "tool_result": None, "latency_ms": 0}
+            return (
+                voice_hist or [],
+                None, # assistant_audio
+                empty_diag,
+                None, # clear recorder (handled elsewhere if you chain a clear)
+                voice_hist or []
+            )

-        new_vhist, tts_path, diag = handle_voice_turn(aud_path, vhist or [])
-        return new_vhist, tts_path, diag
+        t0 = time.time()
+        # 1) Stabilize mic path into runtime/audio
+        stable_user = _persist_copy(aud_path)
+
+        # 2) Transcribe
+        try:
+            asr = get_asr()
+            asr_out = asr.transcribe(stable_user)
+            transcript = (asr_out.get("text") or "").strip() or "(no speech detected)"
+        finally:
+            # Remove the user clip ASAP to keep the folder small
+            if stable_user and os.path.exists(stable_user):
+                try:
+                    os.remove(stable_user)
+                except Exception as e:
+                    print("[CLEANUP] Could not delete user clip:", e)
+
+        # 3) Get bot reply (LLM response)
+        try:
+            from models.llm_chat import respond_chat_voice
+        except Exception:
+            # Fallback: reuse text chat function if you don’t have a voice-specific one
+            from models.llm_chat import respond_chat as respond_chat_voice
+
+        bot_text, new_policy, policy_diag = respond_chat_voice(voice_hist or [], transcript, {})
+
+        # 4) TTS the bot reply into runtime/audio
+        new_tts = tts_synthesize(bot_text) # this writes into runtime/audio
+        # Keep only the latest TTS (delete older tts_*.wav)
+        cleanup_old_audio(keep_latest=new_tts)
+
+        # 5) Append to voice chat state (text transcripts)
+        new_hist = (voice_hist or []) + [
+            {"role": "user", "content": transcript},
+            {"role": "assistant", "content": bot_text},
+        ]
+
+        diag = {
+            "intent": policy_diag.get("policy") if isinstance(policy_diag, dict) else None,
+            "slots": {},
+            "tool_selected": None,
+            "tool_result": {
+                "transcript": transcript,
+                "llm_response": bot_text,
+                "policy": policy_diag,
+            },
+            "latency_ms": int((time.time() - t0) * 1000),
+        }
+
+        # Return: (voice_chat, assistant_audio_path, diag, recorder_clear, voice_state)
+        return new_hist, new_tts, diag, gr.update(value=None), new_hist

     def on_text_send(txt: str, hist: List[Dict[str, str]]):
         new_hist, diag, clear_text = handle_text_turn(txt, hist or [])
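
The new `on_voice_change` returns a 5-tuple `(voice_chat, assistant_audio_path, diag, recorder_clear, voice_state)`, but the Gradio event wiring inside `build_demo()` is not part of this diff. A minimal sketch of how it could be hooked up, assuming hypothetical component names (`recorder`, `voice_chat`, `assistant_audio`, `diag_json`, `voice_state` are not taken from the repo):

# Sketch only -- component names are assumptions; on_voice_change comes from the diff above.
import gradio as gr

with gr.Blocks() as demo:
    voice_chat = gr.Chatbot(type="messages", label="Voice chat")
    assistant_audio = gr.Audio(label="Assistant reply", autoplay=True)
    diag_json = gr.JSON(label="Diagnostics")
    recorder = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
    voice_state = gr.State([])

    # One output per element of the handler's return tuple:
    # (voice_chat, assistant_audio_path, diag, recorder_clear, voice_state)
    recorder.change(
        on_voice_change,
        inputs=[recorder, voice_state],
        outputs=[voice_chat, assistant_audio, diag_json, recorder, voice_state],
    )

In this arrangement, mapping the recorder component to the fourth output lets the handler clear it in the same event via `gr.update(value=None)`.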
runtime/audio/tts_4056705ada224a0092325b697c975501.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:429b36576536c38844ca7d9eec6264400595f2a0ec7024cb0044bf0bccefb1f7
+size 337448
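
Because the file is tracked by the `.gitattributes` rule added above, what this ADDED entry stores in git history is the three-line Git LFS pointer (version, oid, size), not the ~337 KB WAV itself; the audio is kept by LFS and fetched on checkout. A small, hypothetical helper (not part of this repo) for reading such a pointer:

# Hypothetical helper: parse the key/value lines of a Git LFS pointer file.
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            key, _, value = line.strip().partition(" ")
            if key and value:
                fields[key] = value
    return fields

# For the pointer above this yields:
# {"version": "https://git-lfs.github.com/spec/v1", "oid": "sha256:429b...", "size": "337448"}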