# models/tts_router.py
#
# Text-to-speech router. Synthesizes speech with a local Piper binary when one
# is configured, and falls back to the macOS `say` command otherwise. Audio is
# written to runtime/audio; older audio files are pruned after each success.
from __future__ import annotations

import glob
import os
import re
import subprocess
import uuid
import wave
from shutil import which
from typing import Optional

RUNTIME_AUDIO_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "runtime", "audio")
)
# Same directory as RUNTIME_AUDIO_DIR, kept in its original un-normalized form
# for any caller that imports this name.
AUDIO_DIR = os.path.join(os.path.dirname(__file__), "..", "runtime", "audio")
os.makedirs(AUDIO_DIR, exist_ok=True)

# Upper bound (seconds) on a single Piper invocation.
_PIPER_TIMEOUT_S = 45


def cleanup_old_audio(keep_latest: Optional[str] = None) -> None:
    """Delete the .wav/.aiff files in runtime/audio except ``keep_latest``.

    Args:
        keep_latest: Path of the one file to preserve. When omitted, every
            .wav/.aiff file in the directory is removed.

    Files with other extensions are left alone. Deletion failures are printed
    and skipped, never raised.
    """
    keep_abs = os.path.abspath(keep_latest) if keep_latest else None
    for path in glob.glob(os.path.join(AUDIO_DIR, "*")):
        if keep_abs is not None and os.path.abspath(path) == keep_abs:
            continue
        # Both extensions are pruned because the `say` fallback may leave AIFF.
        if not path.endswith((".wav", ".aiff")):
            continue
        try:
            os.remove(path)
        except OSError as e:
            print(f"[CLEANUP] Could not delete {path}: {e}")


def ensure_runtime_audio_dir() -> str:
    """Create runtime/audio if it is missing and return its absolute path."""
    os.makedirs(RUNTIME_AUDIO_DIR, exist_ok=True)
    return RUNTIME_AUDIO_DIR


def _have(cmd: str) -> bool:
    """Return True when ``cmd`` resolves to an executable via ``shutil.which``."""
    return which(cmd) is not None


def _remove_quietly(path: str) -> None:
    """Best-effort removal of ``path``; a missing or locked file is ignored."""
    try:
        os.remove(path)
    except OSError:
        pass


def _is_valid_wav(path: str, min_duration_s: float = 0.25) -> bool:
    """Return True when ``path`` is a readable WAV of at least ``min_duration_s``.

    A file that cannot be opened or parsed, has no frames, or reports a
    non-positive sample rate is treated as invalid.
    """
    try:
        with wave.open(path, "rb") as w:
            frames = w.getnframes()
            rate = w.getframerate()
    except Exception:
        # Validity probe: any failure to open/parse simply means "not valid".
        return False
    if frames <= 0 or rate <= 0:
        return False
    return frames / float(rate) >= min_duration_s


def _tts_with_piper(text: str) -> Optional[str]:
    """Synthesize ``text`` with a local Piper binary, if one is configured.

    Env:
      - PIPER_MODEL: path to the Piper .onnx model file (required)
      - PIPER_BIN (optional): override binary name/path (default 'piper')

    Returns:
        Absolute path of the generated WAV, or None when Piper is not
        configured, fails, times out, or produces an invalid file.
    """
    model = os.getenv("PIPER_MODEL")
    if not model or not os.path.exists(model):
        return None

    piper_bin = os.getenv("PIPER_BIN", "piper")
    # An absolute path is attempted even when `which` cannot resolve it.
    if not _have(piper_bin) and not os.path.isabs(piper_bin):
        return None

    safe_text = re.sub(r"[\x00-\x1F]+", " ", text).strip()
    if not safe_text:
        # Nothing speakable is left once control characters are stripped.
        return None

    out_path = os.path.join(
        ensure_runtime_audio_dir(), f"tts_{uuid.uuid4().hex}.wav"
    )
    try:
        # subprocess.run kills and reaps the child when the timeout expires;
        # a bare Popen.communicate(timeout=...) would leave it running.
        proc = subprocess.run(
            [piper_bin, "--model", model, "--output_file", out_path],
            input=safe_text.encode("utf-8"),
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=_PIPER_TIMEOUT_S,
        )
        if proc.returncode == 0 and _is_valid_wav(out_path):
            return os.path.abspath(out_path)
    except Exception as e:
        print("[TTS] Piper error:", e)

    # Do not leave a partial or invalid file behind for callers to pick up.
    _remove_quietly(out_path)
    return None


def _tts_with_say(text: str) -> Optional[str]:
    """Synthesize ``text`` with the macOS `say` command (fallback engine).

    Produces WAV via afconvert or ffmpeg if present; otherwise returns the
    AIFF file that `say` wrote.

    Env:
      - SAY_VOICE (optional): e.g., "Samantha" / "Alex"

    Returns:
        Absolute path of the WAV (preferred) or AIFF file, or None when `say`
        is unavailable or fails.
    """
    if os.name != "posix" or not _have("say"):
        return None

    out_dir = ensure_runtime_audio_dir()
    stem = os.path.join(out_dir, f"tts_{uuid.uuid4().hex}")
    aiff = stem + ".aiff"
    wav = stem + ".wav"

    voice = os.getenv("SAY_VOICE")
    safe_text = re.sub(r"[\x00-\x1F`<>]+", " ", text).strip() or "Hello."
    if safe_text.startswith("-"):
        # A leading '-' would presumably be parsed by `say` as an option
        # rather than spoken; a leading space keeps it a plain argument.
        safe_text = " " + safe_text

    cmd = ["say", "-o", aiff]
    if voice:
        cmd.extend(["-v", voice])
    cmd.append(safe_text)
    try:
        subprocess.run(cmd, check=True)
    except Exception as e:
        print("[TTS] say failed:", e)
        _remove_quietly(aiff)
        return None

    # Converters in order of preference; the first one that yields a valid
    # WAV wins, so a bad afconvert result no longer blocks the ffmpeg attempt.
    converters = (
        ("afconvert",
         ["afconvert", "-f", "WAVE", "-d", "LEI16", "-c", "1", "-s", "1",
          aiff, wav]),
        ("ffmpeg",
         ["ffmpeg", "-y", "-i", aiff, "-ar", "22050", "-ac", "1", wav]),
    )
    for tool, argv in converters:
        if not which(tool):
            continue
        try:
            subprocess.run(
                argv,
                check=True,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except Exception:
            continue
        if _is_valid_wav(wav):
            _remove_quietly(aiff)
            return os.path.abspath(wav)

    # No usable WAV: drop any partial conversion output.
    _remove_quietly(wav)
    if os.path.exists(aiff):
        # AIFF is fine as a fallback (Gradio can usually play it)
        return os.path.abspath(aiff)
    return None


def tts_synthesize(text: str) -> Optional[str]:
    """High-level TTS router.

    Engines are tried in order:
      1) Piper (if configured)
      2) macOS 'say'

    Args:
        text: Text to speak. Empty or whitespace-only text is rejected.

    Returns:
        Absolute path of the audio file written to runtime/audio, or None
        when the text is empty or no engine produced output. After a
        successful synthesis, older audio files in that directory are pruned.
    """
    if not (text and text.strip()):
        return None

    ensure_runtime_audio_dir()
    for engine in (_tts_with_piper, _tts_with_say):
        out = engine(text)
        if out:
            cleanup_old_audio(keep_latest=out)
            return out
    return None