# models/asr_whisper.py from __future__ import annotations import os from typing import Optional, Dict, Any from faster_whisper import WhisperModel from utils.config import get_settings _asr_singleton: Optional["WhisperASR"] = None def _norm_device(req: str) -> str: """faster-whisper supports only 'cpu' or 'cuda'.""" r = (req or "cpu").strip().lower() if r == "mps": print("[ASR] 'mps' is not supported by faster-whisper; falling back to CPU.") return "cpu" return r if r in ("cpu", "cuda") else "cpu" def _compute_type_for(device: str) -> str: # Keep it simple and stable for HF/macOS: # - CPU: int8 (fast, small) # - CUDA: float16 (good default on GPUs) return "float16" if device == "cuda" else "int8" class WhisperASR: def __init__(self): s = get_settings() self.model_size = os.getenv("WHISPER_SIZE", "tiny").strip() # tiny|base|small|medium|large-v3 ... self.language = os.getenv("WHISPER_LANG", "").strip() or None # e.g., "en"; None = auto self.device = _norm_device(getattr(s, "ASR_DEVICE", "cpu")) self.compute_type = _compute_type_for(self.device) print(f"[ASR] Loading faster-whisper: size={self.model_size} device={self.device} compute_type={self.compute_type}") self.model = WhisperModel(self.model_size, device=self.device, compute_type=self.compute_type) def transcribe(self, path: str) -> Dict[str, Any]: """ Returns: {"text": str, "language": str|None, "segments": [...]} """ # language=None lets faster-whisper auto-detect. You can force via WHISPER_LANG. segments, info = self.model.transcribe( path, beam_size=1, language=self.language or None, ) text = " ".join((seg.text or "").strip() for seg in segments).strip() return { "text": text or "", "language": getattr(info, "language", None), # You can expose timings later if you want: "segments": [] # keep lightweight for UI } def get_asr() -> WhisperASR: global _asr_singleton if _asr_singleton is None: _asr_singleton = WhisperASR() return _asr_singleton