# models/tts_router.py from __future__ import annotations import os, re, uuid, wave, glob, subprocess, sys from shutil import which from typing import Optional # Where we write audio AUDIO_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "runtime", "audio")) os.makedirs(AUDIO_DIR, exist_ok=True) def _dbg(*args): if os.getenv("DEBUG", "false").lower() in ("1","true","yes","on"): print("[TTS]", *args, file=sys.stderr, flush=True) def cleanup_old_audio(keep_latest: Optional[str] = None): for f in glob.glob(os.path.join(AUDIO_DIR, "*")): if keep_latest and os.path.abspath(f) == os.path.abspath(keep_latest): continue if f.endswith((".wav", ".aiff")): try: os.remove(f) except Exception as e: _dbg(f"cleanup skip {f}: {e}") def ensure_runtime_audio_dir() -> str: os.makedirs(AUDIO_DIR, exist_ok=True) return AUDIO_DIR def _have(cmd: str) -> bool: # allow absolute path (even if not “in PATH”) return os.path.isabs(cmd) and os.path.exists(cmd) or which(cmd) is not None def _is_valid_wav(path: str, min_duration_s: float = 0.25) -> bool: try: with wave.open(path, "rb") as w: frames = w.getnframes() rate = w.getframerate() dur = (frames / float(rate)) if rate else 0.0 return frames > 0 and rate > 0 and dur >= min_duration_s except Exception as e: _dbg("wav check failed:", e) return False def _tts_with_piper(text: str) -> Optional[str]: model = os.getenv("PIPER_MODEL") piper_bin = os.getenv("PIPER_BIN", "piper") if not model or not os.path.exists(model): _dbg("piper: missing/invalid PIPER_MODEL:", model) return None if not _have(piper_bin): _dbg("piper: binary not found:", piper_bin) return None out_dir = ensure_runtime_audio_dir() out_path = os.path.join(out_dir, f"tts_{uuid.uuid4().hex}.wav") safe_text = re.sub(r"[\x00-\x1F]+", " ", text).strip() or "Hello." cmd = [piper_bin, "--model", model, "--output_file", out_path] # Optional tuning if os.getenv("PIPER_LENGTH_SCALE"): cmd += ["--length_scale", os.getenv("PIPER_LENGTH_SCALE")] if os.getenv("PIPER_NOISE_SCALE"): cmd += ["--noise_scale", os.getenv("PIPER_NOISE_SCALE")] _dbg("piper cmd:", " ".join(cmd)) try: p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate(input=safe_text.encode("utf-8"), timeout=60) _dbg("piper rc:", p.returncode) if stderr: _dbg("piper stderr:", stderr.decode("utf-8", errors="ignore")[:400]) if p.returncode == 0 and os.path.exists(out_path) and _is_valid_wav(out_path): _dbg("piper ok ->", out_path) return os.path.abspath(out_path) else: _dbg("piper wrote invalid or no wav at:", out_path) except Exception as e: _dbg("piper error:", e) return None def _tts_with_say(text: str) -> Optional[str]: if os.name != "posix" or not _have("say"): _dbg("say: not available") return None out_dir = ensure_runtime_audio_dir() aiff = os.path.join(out_dir, f"tts_{uuid.uuid4().hex}.aiff") wav = os.path.join(out_dir, f"tts_{uuid.uuid4().hex}.wav") voice = os.getenv("SAY_VOICE") # e.g., "Samantha" safe_text = re.sub(r"[\x00-\x1F`<>]+", " ", text).strip() or "Hello." cmd = ["say", "-o", aiff] if voice: cmd += ["-v", voice] cmd.append(safe_text) _dbg("say cmd:", " ".join(cmd)) try: subprocess.run(cmd, check=True) except Exception as e: _dbg("say failed:", e) return None converted = False if which("afconvert"): try: subprocess.run( ["afconvert", "-f", "WAVE", "-d", "LEI16", "-c", "1", "-s", "1", aiff, wav], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL ); converted = True except Exception as e: _dbg("afconvert failed:", e) if not converted and which("ffmpeg"): try: subprocess.run( ["ffmpeg", "-y", "-i", aiff, "-ar", "22050", "-ac", "1", wav], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL ); converted = True except Exception as e: _dbg("ffmpeg failed:", e) if converted and os.path.exists(wav) and _is_valid_wav(wav): try: os.remove(aiff) except: pass _dbg("say ok ->", wav) return os.path.abspath(wav) if os.path.exists(aiff): _dbg("say fallback AIFF ->", aiff) return os.path.abspath(aiff) return None def tts_synthesize(text: str) -> Optional[str]: if not (text and text.strip()): _dbg("skip empty text") return None ensure_runtime_audio_dir() # Try Piper first (Linux/HF) out = _tts_with_piper(text) if out: cleanup_old_audio(keep_latest=out) return out # macOS fallback out = _tts_with_say(text) if out: cleanup_old_audio(keep_latest=out) return out _dbg("no TTS backend produced audio") return None