File size: 2,233 Bytes
74bb5fe
ac1f51b
 
 
74bb5fe
 
 
ac1f51b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74bb5fe
 
 
 
ac1f51b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74bb5fe
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# models/asr_whisper.py
from __future__ import annotations
import os
from typing import Optional, Dict, Any
from faster_whisper import WhisperModel
from utils.config import get_settings

# Module-level cache for the shared WhisperASR instance. It stays None until
# get_asr() builds the instance on first use, so importing this module does
# not load a model.
_asr_singleton: Optional["WhisperASR"] = None


def _norm_device(req: str) -> str:
    """faster-whisper supports only 'cpu' or 'cuda'."""
    r = (req or "cpu").strip().lower()
    if r == "mps":
        print("[ASR] 'mps' is not supported by faster-whisper; falling back to CPU.")
        return "cpu"
    return r if r in ("cpu", "cuda") else "cpu"


def _compute_type_for(device: str) -> str:
    # Keep it simple and stable for HF/macOS:
    # - CPU: int8 (fast, small)
    # - CUDA: float16 (good default on GPUs)
    return "float16" if device == "cuda" else "int8"


class WhisperASR:
    """Speech-to-text wrapper around a faster-whisper ``WhisperModel``.

    Configuration is read once, when the instance is constructed:

    * ``WHISPER_SIZE`` env var: model size (tiny|base|small|medium|large-v3 ...).
      Unset or blank means "tiny".
    * ``WHISPER_LANG`` env var: forced language code, e.g. "en".
      Unset or blank means auto-detect (``None``).
    * ``ASR_DEVICE`` attribute of the settings object: requested device,
      normalised by ``_norm_device``. A missing attribute means "cpu".
    """

    def __init__(self):
        """Read the configuration and load the model (prints one status line)."""
        settings = get_settings()

        # `or "tiny"`: a WHISPER_SIZE that is set but blank would otherwise
        # reach WhisperModel as an empty model name.
        self.model_size = os.getenv("WHISPER_SIZE", "tiny").strip() or "tiny"
        # Blank collapses to None, which lets faster-whisper auto-detect.
        self.language = os.getenv("WHISPER_LANG", "").strip() or None

        self.device = _norm_device(getattr(settings, "ASR_DEVICE", "cpu"))
        self.compute_type = _compute_type_for(self.device)

        print(f"[ASR] Loading faster-whisper: size={self.model_size} device={self.device} compute_type={self.compute_type}")
        self.model = WhisperModel(self.model_size, device=self.device, compute_type=self.compute_type)

    def transcribe(self, path: str) -> Dict[str, Any]:
        """Transcribe the audio at *path* and return a small result dict.

        *path* is handed to ``self.model.transcribe`` unchanged, with
        ``beam_size=1`` and the configured language (``None`` = auto-detect;
        force one via WHISPER_LANG).

        Returns: {"text": str, "language": str|None, "segments": [...]}
        ``text`` is the stripped segment texts joined by single spaces ("" when
        there is none). ``language`` is ``info.language`` if present, else
        ``None``. ``segments`` is currently always an empty list.
        """
        segments, info = self.model.transcribe(
            path,
            beam_size=1,
            language=self.language or None,
        )
        # Skip segments whose text is missing or whitespace-only; joining them
        # as empty strings would leave doubled spaces in the transcript.
        pieces = ((seg.text or "").strip() for seg in segments)
        text = " ".join(piece for piece in pieces if piece)
        return {
            "text": text,
            "language": getattr(info, "language", None),
            # You can expose timings later if you want:
            "segments": [],  # keep lightweight for UI
        }


def get_asr() -> WhisperASR:
    """Return the shared WhisperASR instance, constructing it on first call.

    The instance is cached in the module-level ``_asr_singleton``, so later
    calls return the same object without constructing another one.
    """
    global _asr_singleton
    if _asr_singleton is not None:
        return _asr_singleton
    _asr_singleton = WhisperASR()
    return _asr_singleton