VeuReu committed on
Commit 287f01b · verified · 1 Parent(s): 467c0ec

Upload 17 files

Files changed (7)
  1. README.md +3 -3
  2. api.py +98 -0
  3. audio_tools.py +468 -746
  4. background_descriptor.py +116 -10
  5. config.yaml +60 -64
  6. llm_router.py +5 -11
  7. scripts/remote_clients.py +78 -0
README.md CHANGED
@@ -1,14 +1,14 @@
  ---
- title: Veureu Engine (Docker)
  emoji: 🎧
  colorFrom: gray
  colorTo: blue
  sdk: docker
- app_file: main_api.py
  pinned: false
  ---

- # Veureu Engine API (FastAPI via Docker)

  Endpoints:
  - `POST /process_video`

  ---
+ title: veureu-engine
  emoji: 🎧
  colorFrom: gray
  colorTo: blue
  sdk: docker
+ app_file: api.py
  pinned: false
  ---

+ # veureu-engine

  Endpoints:
  - `POST /process_video`
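For illustration, a minimal client-side sketch of the `POST /process_video` endpoint listed above. The base URL and the sample file name are placeholders, and the form fields mirror the defaults declared in api.py below.

```python
# Hypothetical client call; BASE_URL and sample.mp4 are placeholders.
import requests

BASE_URL = "http://localhost:7860"  # assumed local run on the port used by uvicorn in api.py

with open("sample.mp4", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/process_video",
        files={"video_file": ("sample.mp4", f, "video/mp4")},
        data={"config_path": "config.yaml", "out_root": "results", "db_dir": "chroma_db"},
        timeout=3600,  # matches api.sync_timeout_seconds in config.yaml
    )
print(resp.json())
```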
api.py ADDED
@@ -0,0 +1,98 @@
1
+ from __future__ import annotations
2
+ from fastapi import FastAPI, UploadFile, File, Form
3
+ from fastapi.responses import JSONResponse
4
+ from fastapi.middleware.cors import CORSMiddleware
5
+ from pathlib import Path
6
+ import shutil
7
+ import uvicorn
8
+ import json
9
+
10
+ from video_processing import process_video_pipeline
11
+ from casting_loader import ensure_chroma, build_faces_index, build_voices_index
12
+ from narration_system import NarrationSystem
13
+ from llm_router import load_yaml, LLMRouter
14
+
15
+ app = FastAPI(title="Veureu Engine API", version="0.2.0")
16
+ app.add_middleware(
17
+ CORSMiddleware,
18
+ allow_origins=["*"],
19
+ allow_credentials=True,
20
+ allow_methods=["*"],
21
+ allow_headers=["*"],
22
+ )
23
+
24
+ ROOT = Path("/tmp/veureu")
25
+ ROOT.mkdir(parents=True, exist_ok=True)
26
+
27
+ @app.get("/")
28
+ def root():
29
+ return {"ok": True, "service": "veureu-engine"}
30
+
31
+ @app.post("/process_video")
32
+ async def process_video(
33
+ video_file: UploadFile = File(...),
34
+ config_path: str = Form("config.yaml"),
35
+ out_root: str = Form("results"),
36
+ db_dir: str = Form("chroma_db"),
37
+ ):
38
+ tmp_video = ROOT / video_file.filename
39
+ with tmp_video.open("wb") as f:
40
+ shutil.copyfileobj(video_file.file, f)
41
+ result = process_video_pipeline(str(tmp_video), config_path=config_path, out_root=out_root, db_dir=db_dir)
42
+ return JSONResponse(result)
43
+
44
+ @app.post("/load_casting")
45
+ async def load_casting(
46
+ faces_dir: str = Form("identities/faces"),
47
+ voices_dir: str = Form("identities/voices"),
48
+ db_dir: str = Form("chroma_db"),
49
+ drop_collections: bool = Form(False),
50
+ ):
51
+ client = ensure_chroma(Path(db_dir))
52
+ n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
53
+ n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
54
+ return {"ok": True, "faces": n_faces, "voices": n_voices}
55
+
56
+ @app.post("/refine_narration")
57
+ async def refine_narration(
58
+ dialogues_srt: str = Form(...),
59
+ frame_descriptions_json: str = Form("[]"),
60
+ config_path: str = Form("config.yaml"),
61
+ ):
62
+ cfg = load_yaml(config_path)
63
+ frames = json.loads(frame_descriptions_json)
64
+ model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
65
+ use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
66
+
67
+ if use_remote:
68
+ router = LLMRouter(cfg)
69
+ system_msg = (
70
+ "Eres un sistema de audiodescripción que cumple UNE-153010. "
71
+ "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
72
+ "Devuelve JSON con {narrative_text, srt_text}."
73
+ )
74
+ prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
75
+ try:
76
+ txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
77
+ out = {}
78
+ try:
79
+ out = json.loads(txt)
80
+ except Exception:
81
+ out = {"narrative_text": txt, "srt_text": ""}
82
+ return {
83
+ "narrative_text": out.get("narrative_text", ""),
84
+ "srt_text": out.get("srt_text", ""),
85
+ "approved": True,
86
+ "critic_feedback": "",
87
+ }
88
+ except Exception:
89
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
90
+ res = ns.run(dialogues_srt, frames)
91
+ return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}
92
+
93
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
94
+ out = ns.run(dialogues_srt, frames)
95
+ return {"narrative_text": out.narrative_text, "srt_text": out.srt_text, "approved": out.approved, "critic_feedback": out.critic_feedback}
96
+
97
+ if __name__ == "__main__":
98
+ uvicorn.run(app, host="0.0.0.0", port=7860)
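Along the same lines, a hedged sketch of posting to `/refine_narration`; the SRT cue and frame description are made up, and the response keys follow the handler above.

```python
# Hypothetical call; the SRT cue and frame description are illustrative only.
import json
import requests

dialogues_srt = "1\n00:00:01,000 --> 00:00:03,000\n[SPEAKER_00]: Hola.\n\n"
frames = [{"start": 4.0, "end": 6.5, "description": "A person enters the room."}]

resp = requests.post(
    "http://localhost:7860/refine_narration",
    data={
        "dialogues_srt": dialogues_srt,
        "frame_descriptions_json": json.dumps(frames, ensure_ascii=False),
        "config_path": "config.yaml",
    },
)
out = resp.json()
print(out["approved"], out["narrative_text"][:200])
```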
audio_tools.py CHANGED
@@ -1,746 +1,468 @@
1
- # audio_tools.py
2
- # -----------------------------------------------------------------------------
3
- # Veureu — AUDIO utilities (self-contained)
4
- # - FFmpeg extraction (WAV)
5
- # - Diarization (pyannote)
6
- # - ASR:
7
- # * Catalan ("ca") -> AINA Whisper
8
- # * Other languages -> Lightweight generic Whisper
9
- # - Integrated Language ID (Whisper via faster-whisper)
10
- # - Voice embeddings (SpeechBrain ECAPA)
11
- # - Speaker identification (KMeans + optional ChromaDB collection)
12
- # - SRT generation
13
- # - Orchestrator: process_audio_for_video(...)
14
- # - ADDED: ASR of full audio and LLM-based SRT correction
15
- # -----------------------------------------------------------------------------
16
- from __future__ import annotations
17
-
18
- import numpy as np
19
- import json
20
- import logging
21
- import math
22
- import os
23
- import shlex
24
- import subprocess
25
- from pathlib import Path
26
- from typing import List, Dict, Any, Tuple, Optional
27
- from dataclasses import dataclass
28
-
29
- # al principio de audio_tools.py
30
- try:
31
- import torchaudio as ta
32
- HAS_TORCHAUDIO = True
33
- except ImportError:
34
- ta = None
35
- HAS_TORCHAUDIO = False
36
- import soundfile as sf
37
-
38
- import torch
39
- import torchaudio.transforms as T
40
- from pydub import AudioSegment
41
- from pyannote.audio import Pipeline
42
- from speechbrain.pretrained import SpeakerRecognition
43
- from sklearn.cluster import KMeans
44
- from sklearn.metrics import silhouette_score
45
- from transformers import WhisperForConditionalGeneration, WhisperProcessor
46
- from openai import OpenAI as OpenAIClient
47
- import noisereduce as nr
48
-
49
- # -------------------------------- Logging ------------------------------------
50
- log = logging.getLogger("audio_tools")
51
- if not log.handlers:
52
- _h = logging.StreamHandler()
53
- _h.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
54
- log.addHandler(_h)
55
- log.setLevel(logging.INFO)
56
-
57
- # ------------------------------- Utilities -----------------------------------
58
-
59
- def load_wav(path, sr=16000):
60
- if HAS_TORCHAUDIO:
61
- wav, in_sr = ta.load(path)
62
- if in_sr != sr:
63
- wav = ta.functional.resample(wav, in_sr, sr)
64
- return wav.squeeze(0).numpy(), sr
65
- # fallback con soundfile + resample con librosa
66
- import librosa
67
- y, in_sr = sf.read(path, dtype="float32", always_2d=False)
68
- if in_sr != sr:
69
- y = librosa.resample(y, orig_sr=in_sr, target_sr=sr)
70
- return y.astype(np.float32), sr
71
-
72
- def save_wav(path, y, sr=16000):
73
- if HAS_TORCHAUDIO:
74
- ta.save(path, torch.from_numpy(y).unsqueeze(0), sr) # si usas torch
75
- else:
76
- sf.write(path, y, sr)
77
-
78
- def _pick_device_auto(dev_cfg: str) -> str:
79
- """Resolve 'auto' device to cuda/cpu."""
80
- if dev_cfg == "auto":
81
- return "cuda" if torch.cuda.is_available() else "cpu"
82
- return dev_cfg
83
-
84
- def load_config(path: str = "configs/config_veureu.yaml") -> Dict[str, Any]:
85
- p = Path(path)
86
- if not p.exists():
87
- log.warning("Config file not found: %s (using defaults)", path)
88
- return {}
89
- try:
90
- import yaml
91
- cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
92
- cfg["__path__"] = str(p)
93
- return cfg
94
- except Exception as e:
95
- log.error("Failed to read YAML config: %s", e)
96
- return {}
97
-
98
- # ------------------------------- Extraction ----------------------------------
99
-
100
- def extract_audio_ffmpeg(
101
- video_path: str,
102
- audio_out: Path,
103
- sr: int = 16000,
104
- mono: bool = True,
105
- ) -> str:
106
- """Extract audio from video to WAV using ffmpeg."""
107
- audio_out.parent.mkdir(parents=True, exist_ok=True)
108
- cmd = f'ffmpeg -y -i "{video_path}" -vn {"-ac 1" if mono else ""} -ar {sr} -f wav "{audio_out}"'
109
- subprocess.run(
110
- shlex.split(cmd),
111
- check=True,
112
- stdout=subprocess.DEVNULL,
113
- stderr=subprocess.DEVNULL,
114
- )
115
- return str(audio_out)
116
-
117
- # ----------------------------------- ASR -------------------------------------
118
-
119
- @dataclass
120
- class AinaASR:
121
- """ASR for Catalan using the AINA Whisper model."""
122
- model_name: str = "projecte-aina/whisper-large-v3-ca-3catparla"
123
- device: str = "cuda"
124
-
125
- def __post_init__(self):
126
- dev = self.device
127
- if dev == "cuda" and not torch.cuda.is_available():
128
- dev = "cpu"
129
- self.processor = WhisperProcessor.from_pretrained(self.model_name)
130
- self.model = WhisperForConditionalGeneration.from_pretrained(self.model_name).to(dev)
131
- self.device = dev
132
- log.info(f"ASR AINA loaded on {self.device}: {self.model_name}")
133
-
134
- def transcribe_wav(self, wav_path: str) -> str:
135
- waveform, sr = torchaudio.load(wav_path)
136
- inputs = self.processor(
137
- waveform.numpy(), sampling_rate=sr, return_tensors="pt"
138
- ).input_features.to(self.model.device)
139
- with torch.no_grad():
140
- ids = self.model.generate(inputs, max_new_tokens=440)[0]
141
- txt = self.processor.decode(ids)
142
- norm = getattr(self.processor.tokenizer, "_normalize", None)
143
- return norm(txt) if callable(norm) else txt
144
-
145
- def transcribe_long_audio(
146
- self,
147
- wav_path: str,
148
- chunk_length_s: int = 20,
149
- overlap_s: int = 2,
150
- ) -> str:
151
- waveform, sr = torchaudio.load(wav_path)
152
- total_samples = waveform.shape[1]
153
- chunk_size = chunk_length_s * sr
154
- overlap_size = overlap_s * sr
155
-
156
- transcriptions = []
157
- start = 0
158
-
159
- while start < total_samples:
160
- end = min(start + chunk_size, total_samples)
161
- chunk = waveform[:, start:end]
162
-
163
- input_features = self.processor(
164
- chunk.numpy(),
165
- sampling_rate=sr,
166
- return_tensors="pt"
167
- ).input_features.to(self.model.device)
168
-
169
- with torch.no_grad():
170
- predicted_ids = self.model.generate(
171
- input_features,
172
- max_new_tokens=440,
173
- num_beams=1, # puedes probar beam search
174
- )[0]
175
-
176
- text = self.processor.decode(predicted_ids, skip_special_tokens=True)
177
- transcriptions.append(text.strip())
178
-
179
- # avanzar con solapamiento
180
- start += chunk_size - overlap_size
181
-
182
- return " ".join(transcriptions).strip()
183
-
184
- @dataclass
185
- class WhisperASR:
186
- """Lightweight generic ASR based on Whisper for non-Catalan languages."""
187
- model_name: str = "openai/whisper-small" # change to 'base' for an even lighter model
188
- device: str = "cuda"
189
- language: Optional[str] = None # force language, e.g. "es", "en", etc.
190
-
191
- def __post_init__(self):
192
- dev = self.device
193
- if dev == "cuda" and not torch.cuda.is_available():
194
- dev = "cpu"
195
- self.processor = WhisperProcessor.from_pretrained(self.model_name)
196
- self.model = WhisperForConditionalGeneration.from_pretrained(self.model_name).to(dev)
197
- self.device = dev
198
- log.info(f"ASR Whisper loaded on {self.device}: {self.model_name} (lang hint: {self.language})")
199
-
200
- def transcribe_wav(self, wav_path: str) -> str:
201
- waveform, sr = torchaudio.load(wav_path)
202
- inputs = self.processor(
203
- waveform.numpy(), sampling_rate=sr, return_tensors="pt"
204
- ).input_features.to(self.model.device)
205
-
206
- gen_kwargs: Dict[str, Any] = dict(max_new_tokens=444)
207
- if self.language and self.language != "auto":
208
- try:
209
- forced_ids = self.processor.get_decoder_prompt_ids(
210
- language=self.language, task="transcribe"
211
- )
212
- gen_kwargs["forced_decoder_ids"] = forced_ids
213
- except Exception:
214
- # If the model/processor does not support forced ids, continue without forcing
215
- pass
216
-
217
- with torch.no_grad():
218
- ids = self.model.generate(inputs, **gen_kwargs)[0]
219
- txt = self.processor.decode(ids)
220
- norm = getattr(self.processor.tokenizer, "_normalize", None)
221
- return norm(txt) if callable(norm) else txt
222
-
223
- # ------------------------------ Language ID ----------------------------------
224
-
225
- @dataclass
226
- class WhisperLIDConfig:
227
- """Configuration for language detection with faster-whisper."""
228
- model_name: str = "Systran/faster-whisper-small"
229
- device: str = "auto"
230
- compute_type: str = "float32" # "int8" | "float16" | "float32"
231
- beam_size: int = 1
232
- chunk_seconds: float = 30.0
233
- prob_threshold: float = 0.5
234
- fallback_lang: str = "auto"
235
-
236
- def detect_language_with_whisper(
237
- wav_path: str,
238
- cfg: Dict[str, Any],
239
- ) -> Tuple[str, float]:
240
- """
241
- Detects language using faster-whisper (WhisperModel). Returns (lang_iso, prob).
242
- In case of failure, returns (fallback_lang, 0.0).
243
- """
244
- lid_cfg_d = (cfg.get("asr", {})
245
- .get("language_detection", {})
246
- .get("whisper_lid", {}))
247
- lid_cfg = WhisperLIDConfig(
248
- model_name=lid_cfg_d.get("model_name", "Systran/faster-whisper-small",),
249
- device=_pick_device_auto(lid_cfg_d.get("device", "auto")),
250
- compute_type=lid_cfg_d.get("compute_type", "float32"),
251
- beam_size=int(lid_cfg_d.get("beam_size", 1)),
252
- chunk_seconds=float(lid_cfg_d.get("chunk_seconds", 30.0)),
253
- prob_threshold=float(lid_cfg_d.get("prob_threshold", 0.5)),
254
- fallback_lang=lid_cfg_d.get("fallback_lang", "auto"),
255
- )
256
-
257
- try:
258
- from faster_whisper import WhisperModel # type: ignore
259
- except Exception as e:
260
- log.warning(f"LID: faster-whisper not available ({e}). Fallback='{lid_cfg.fallback_lang}'")
261
- return lid_cfg.fallback_lang, 0.0
262
-
263
- try:
264
- model = WhisperModel(lid_cfg.model_name, device=lid_cfg.device, compute_type=lid_cfg.compute_type)
265
- except Exception as e:
266
- log.warning(f"LID: failed to load '{lid_cfg.model_name}': {e}. Fallback='{lid_cfg.fallback_lang}'")
267
- return lid_cfg.fallback_lang, 0.0
268
-
269
- try:
270
- segments, info = model.transcribe(
271
- wav_path,
272
- beam_size=lid_cfg.beam_size,
273
- vad_filter=True,
274
- without_timestamps=True,
275
- language=None
276
- )
277
- lang = info.language or lid_cfg.fallback_lang
278
- prob = float(info.language_probability or 0.0)
279
- if prob < lid_cfg.prob_threshold:
280
- return lid_cfg.fallback_lang, prob
281
- return lang, prob
282
- except Exception as e:
283
- log.warning(f"LID: error in transcription/detection: {e}. Fallback='{lid_cfg.fallback_lang}'")
284
- return lid_cfg.fallback_lang, 0.0
285
-
286
-
287
- def _build_asr_backend_for_language(lang_iso: str, cfg: Dict[str, Any]):
288
- """
289
- Selects ASR backend based on language:
290
- - 'ca' -> AINA
291
- - other -> Generic Whisper
292
- """
293
- asr_cfg = cfg.get("asr", {})
294
- device_pref = _pick_device_auto(asr_cfg.get("device", "auto"))
295
- if lang_iso and lang_iso.lower() == "ca":
296
- return AinaASR(
297
- model_name=asr_cfg.get("model_name", "projecte-aina/whisper-large-v3-ca-3catparla"),
298
- device=device_pref,
299
- )
300
- else:
301
- return WhisperASR(
302
- model_name=asr_cfg.get("whisper_model_name", "openai/whisper-small"),
303
- device=device_pref,
304
- language=None,
305
- )
306
-
307
- # -------------------------------- Diarization --------------------------------
308
-
309
- def diarize_audio(
310
- wav_path: str,
311
- base_dir: Path,
312
- clips_folder: str = "clips",
313
- min_segment_duration: float = 20,
314
- max_segment_duration: float = 50.0,
315
- hf_token_env: str | None = None,
316
- ) -> Tuple[List[str], List[Dict[str, Any]]]:
317
- """Diarization with pyannote and clip export with pydub.
318
- Returns clip paths and segments [{'start','end','speaker'}].
319
- """
320
- audio = AudioSegment.from_wav(wav_path)
321
- duration = len(audio) / 1000.0
322
-
323
- pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=(hf_token_env
324
- or os.getenv("HF_TOKEN")))
325
- diarization = pipeline(wav_path)
326
-
327
- clips_dir = (base_dir / clips_folder)
328
- clips_dir.mkdir(parents=True, exist_ok=True)
329
- clip_paths: List[str] = []
330
- segments: List[Dict[str, Any]] = []
331
- spk_map: Dict[str, int] = {}
332
-
333
- prev_end = 0.0 # reference to the end of the last exported segment
334
-
335
- for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
336
- start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))
337
-
338
- if start < prev_end:
339
- start = prev_end
340
- if end <= start:
341
- continue
342
-
343
- seg_dur = end - start
344
- if seg_dur < min_segment_duration:
345
- continue
346
-
347
- if seg_dur > max_segment_duration:
348
- # split long segments
349
- n = int(math.ceil(seg_dur / max_segment_duration))
350
- sub_d = seg_dur / n
351
- for j in range(n):
352
- s = start + j * sub_d
353
- e = min(end, start + (j + 1) * sub_d)
354
- if e <= s:
355
- continue
356
- clip = audio[int(s * 1000):int(e * 1000)]
357
- cp = clips_dir / f"segment_{i:03d}_{j:02d}.wav"
358
- clip.export(cp, format="wav")
359
- if speaker not in spk_map:
360
- spk_map[speaker] = len(spk_map)
361
- segments.append({
362
- "start": s,
363
- "end": e,
364
- "speaker": f"SPEAKER_{spk_map[speaker]:02d}"
365
- })
366
- clip_paths.append(str(cp))
367
- prev_end = e
368
- else:
369
- clip = audio[int(start * 1000):int(end * 1000)]
370
- cp = clips_dir / f"segment_{i:03d}.wav"
371
- clip.export(cp, format="wav")
372
- if speaker not in spk_map:
373
- spk_map[speaker] = len(spk_map)
374
- segments.append({
375
- "start": start,
376
- "end": end,
377
- "speaker": f"SPEAKER_{spk_map[speaker]:02d}"
378
- })
379
- clip_paths.append(str(cp))
380
- prev_end = end # update reference
381
-
382
- if not segments:
383
- # fallback single clip
384
- cp = clips_dir / "segment_000.wav"
385
- audio.export(cp, format="wav")
386
- return [str(cp)], [{"start": 0.0, "end": duration, "speaker": "SPEAKER_00"}]
387
-
388
- # sort by start time
389
- pairs = sorted(zip(clip_paths, segments), key=lambda x: x[1]["start"])
390
- clip_paths, segments = [p[0] for p in pairs], [p[1] for p in pairs]
391
- return clip_paths, segments
392
-
393
- # ------------------------------ Voice embeddings -----------------------------
394
-
395
- class VoiceEmbedder:
396
- def __init__(self):
397
- self.model = SpeakerRecognition.from_hparams(
398
- source="speechbrain/spkrec-ecapa-voxceleb",
399
- savedir="pretrained_models/spkrec-ecapa-voxceleb",
400
- )
401
- self.model.eval()
402
-
403
- def embed(self, wav_path: str) -> List[float]:
404
- waveform, sr = torchaudio.load(wav_path)
405
- target_sr = 16000
406
- if sr != target_sr:
407
- waveform = T.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
408
- if waveform.shape[0] > 1:
409
- waveform = waveform.mean(dim=0, keepdim=True)
410
- # ensure minimum length (~0.2s) for stability
411
- min_samples = int(0.2 * target_sr)
412
- if waveform.shape[1] < min_samples:
413
- pad = min_samples - waveform.shape[1]
414
- waveform = torch.cat([waveform, torch.zeros((1, pad))], dim=1)
415
- with torch.no_grad():
416
- emb = (
417
- self.model.encode_batch(waveform)
418
- .squeeze()
419
- .cpu()
420
- .numpy()
421
- .astype(float)
422
- )
423
- return emb.tolist()
424
-
425
-
426
- def embed_voice_segments(clip_paths: List[str]) -> List[List[float]]:
427
- ve = VoiceEmbedder()
428
- out: List[List[float]] = []
429
- for cp in clip_paths:
430
- try:
431
- out.append(ve.embed(cp))
432
- except Exception as e:
433
- log.warning(f"Embedding error in {cp}: {e}")
434
- out.append([])
435
- return out
436
-
437
- # --------------------------- Speaker identification --------------------------
438
-
439
- def identify_speakers(
440
- embeddings: List[List[float]],
441
- voice_collection, # ChromaDB collection with .query or None
442
- cfg: Dict[str, Any],
443
- ) -> List[str]:
444
- voice_cfg = cfg.get("voice_processing", {}).get("speaker_identification", {})
445
- if not embeddings or sum(1 for e in embeddings if e) < 2:
446
- return ["SPEAKER_00" for _ in embeddings]
447
-
448
- valid = [e for e in embeddings if e and len(e) > 0]
449
- if len(valid) < 2:
450
- return ["SPEAKER_00" for _ in embeddings]
451
-
452
- min_clusters = max(1, voice_cfg.get("min_speakers", 1))
453
- max_clusters = min(voice_cfg.get("max_speakers", 5), len(valid) - 1)
454
-
455
- # find the optimal k using silhouette_score
456
- if voice_cfg.get("find_optimal_clusters", True) and len(valid) > 2:
457
- best_score = -1.0
458
- best_k = min_clusters
459
- for k in range(min_clusters, max_clusters + 1):
460
- if k >= len(valid):
461
- break
462
- km = KMeans(n_clusters=k, random_state=42, n_init="auto")
463
- labels = km.fit_predict(valid)
464
- if len(set(labels)) > 1:
465
- score = silhouette_score(valid, labels)
466
- if score > best_score:
467
- best_score, best_k = score, k
468
- else:
469
- best_k = min(max_clusters, max(min_clusters, voice_cfg.get("num_speakers", 2)))
470
- best_k = max(1, min(best_k, len(valid) - 1))
471
-
472
- # final clustering
473
- km = KMeans(n_clusters=best_k, random_state=42, n_init="auto", init="k-means++")
474
- labels = km.fit_predict(np.array(valid))
475
- centers = km.cluster_centers_
476
-
477
- cluster_to_name: Dict[int, str] = {}
478
- unknown_counter = 0
479
- for cid in range(best_k):
480
- center = centers[cid].tolist()
481
- name = f"SPEAKER_{cid:02d}"
482
-
483
- if voice_collection is not None:
484
- try:
485
- q = voice_collection.query(query_embeddings=[center], n_results=1)
486
- metas = q.get("metadatas", [[]])[0]
487
- dists = q.get("distances", [[]])[0]
488
- thr = voice_cfg.get("distance_threshold")
489
-
490
- if dists and thr is not None and dists[0] > thr:
491
- # new speaker → mark as UNKNOWN and store it in the collection
492
- name = f"UNKNOWN_{unknown_counter}"
493
- unknown_counter += 1
494
- voice_collection.add(
495
- embeddings=[center],
496
- metadatas=[{"name": name}],
497
- ids=[f"unk_{cid}_{unknown_counter}"]
498
- )
499
- else:
500
- # acceptable match → use the existing name
501
- if metas and isinstance(metas[0], dict):
502
- name = metas[0].get("nombre") or metas[0].get("name") \
503
- or metas[0].get("speaker") or metas[0].get("identity") \
504
- or name
505
- except Exception as e:
506
- log.warning(f"Voice KNN query failed: {e}")
507
-
508
- cluster_to_name[cid] = name
509
-
510
- # map each embedding to its speaker
511
- personas: List[str] = []
512
- vi = 0
513
- for emb in embeddings:
514
- if not emb:
515
- personas.append("UNKNOWN")
516
- else:
517
- label = int(labels[vi])
518
- personas.append(cluster_to_name.get(label, f"SPEAKER_{label:02d}"))
519
- vi += 1
520
-
521
- return personas
522
-
523
- # ----------------------------------- SRT -------------------------------------
524
-
525
- def _fmt_srt_time(seconds: float) -> str:
526
- h = int(seconds // 3600)
527
- m = int((seconds % 3600) // 60)
528
- s = int(seconds % 60)
529
- ms = int(round((seconds - int(seconds)) * 1000))
530
- return f"{h:02}:{m:02}:{s:02},{ms:03}"
531
-
532
-
533
- def generate_srt_from_diarization(
534
- diarization_segments: List[Dict[str, Any]],
535
- transcriptions: List[str],
536
- speakers_per_segment: List[str],
537
- output_srt_path: str,
538
- cfg: Dict[str, Any],
539
- ) -> None:
540
- subs = cfg.get("subtitles", {})
541
- max_cpl = int(subs.get("max_chars_per_line", 42))
542
- max_lines = int(subs.get("max_lines_per_cue", 10))
543
- speaker_display = subs.get("speaker_display", "brackets")
544
-
545
- items: List[Dict[str, Any]] = []
546
- n = min(len(diarization_segments), len(transcriptions), len(speakers_per_segment))
547
- for i in range(n):
548
- seg = diarization_segments[i]
549
- text = (transcriptions[i] or "").strip()
550
- spk = speakers_per_segment[i]
551
- items.append(
552
- {
553
- "start": float(seg.get("start", 0.0)),
554
- "end": float(seg.get("end", 0.0)),
555
- "text": text,
556
- "speaker": spk,
557
- }
558
- )
559
-
560
- out = Path(output_srt_path)
561
- out.parent.mkdir(parents=True, exist_ok=True)
562
- with out.open("w", encoding="utf-8-sig") as f:
563
- for i, it in enumerate(items, 1):
564
- text = it["text"]
565
- spk = it["speaker"]
566
- if speaker_display == "brackets" and spk:
567
- text = f"[{spk}]: {text}" # Adjusted format to match new script's style
568
- elif speaker_display == "prefix" and spk:
569
- text = f"{spk}: {text}"
570
-
571
- # simple word-wrap
572
- words = text.split()
573
- lines: List[str] = []
574
- cur = ""
575
- for w in words:
576
- if len(cur) + len(w) + (1 if cur else 0) <= max_cpl:
577
- cur = (cur + " " + w) if cur else w
578
- else:
579
- lines.append(cur)
580
- cur = w
581
- if len(lines) >= max_lines - 1:
582
- break
583
- if cur and len(lines) < max_lines:
584
- lines.append(cur)
585
- f.write(f"{i}\n{_fmt_srt_time(it['start'])} --> {_fmt_srt_time(it['end'])}\n")
586
- f.write("\n".join(lines) + "\n\n")
587
-
588
- # ------------------------------ Orchestrator ---------------------------------
589
-
590
- def process_audio_for_video(
591
- video_path: str,
592
- out_dir: Path,
593
- cfg: Dict[str, Any],
594
- voice_collection=None,
595
- ) -> Tuple[List[Dict[str, Any]], Optional[str]]:
596
- """
597
- Audio pipeline: FFmpeg -> diarization -> LID -> ASR -> embeddings -> speaker-ID -> SRT.
598
- Returns (audio_segments, srt_path or None).
599
- """
600
- # 1) Audio extraction
601
- audio_cfg = cfg.get("audio_processing", {})
602
- sr = int(audio_cfg.get("sample_rate", 16000))
603
- fmt = audio_cfg.get("format", "wav")
604
- wav_path = extract_audio_ffmpeg(
605
- video_path, out_dir / f"{Path(video_path).stem}.{fmt}", sr=sr
606
- )
607
- log.info("Audio extraído")
608
-
609
- # 2) Diarization
610
- diar_cfg = audio_cfg.get("diarization", {})
611
- min_dur = float(diar_cfg.get("min_segment_duration", 0.5))
612
- max_dur = float(diar_cfg.get("max_segment_duration", 10.0))
613
- clip_paths, diar_segs = diarize_audio(
614
- wav_path, out_dir, "clips", min_dur, max_dur
615
- )
616
- log.info("Clips de audio generados.")
617
-
618
- # 3) Language detection (optional) + ASR backend selection
619
- asr_cfg = cfg.get("asr", {})
620
- lid_enabled = bool(asr_cfg.get("language_detection", {}).get("enabled", True))
621
-
622
- device_pref = _pick_device_auto(asr_cfg.get("device", "auto"))
623
-
624
- aina_asr = AinaASR(model_name=asr_cfg.get("model_name", "projecte-aina/whisper-large-v3-ca-3catparla"),
625
- device=device_pref)
626
-
627
- whisper_asr = WhisperASR(model_name=asr_cfg.get("whisper_model_name", "openai/whisper-small"),
628
- device=device_pref,
629
- language=None)
630
-
631
- full_transcription = ""
632
- if asr_cfg.get("enable_full_transcription", True):
633
- log.info("Iniciando transcripción del audio completo")
634
- # Assume Catalan model for full transcription, or add logic to check language
635
- full_transcription = aina_asr.transcribe_long_audio(wav_path, chunk_length_s=30)
636
- log.info("Transcripción completa del audio finalizada.")
637
- print(full_transcription)
638
-
639
- # Transcribe each segment
640
- log.info("Comenzamos con la transcripción de cada clip.")
641
- trans: List[str] = []
642
- detected_langs: List[str] = []
643
- detected_probs: List[float] = []
644
- for path in clip_paths:
645
- if not lid_enabled:
646
- txt = aina_asr.transcribe_wav(path)
647
- else:
648
- detected_lang, detected_prob = detect_language_with_whisper(path, cfg)
649
- log.info(f"LID: detected={detected_lang} (p={detected_prob:.2f})")
650
-
651
- if detected_lang.lower() in ["ca", "catalan"]:
652
- txt = aina_asr.transcribe_wav(path)
653
- else:
654
- txt = whisper_asr.transcribe_wav(path)
655
- trans.append(txt)
656
-
657
- log.info("Se han transcrito todos los clips.")
658
-
659
- # 5) Embeddings + speaker identification
660
- if audio_cfg.get("enable_voice_embeddings", True):
661
- embeddings = embed_voice_segments(clip_paths)
662
- log.info("Embeddings creados de manera correcta para cada clip.")
663
- else:
664
- embeddings = [[] for _ in clip_paths]
665
-
666
- if cfg.get("voice_processing", {}).get("speaker_identification", {}).get("enabled", True):
667
- speakers = identify_speakers(embeddings, voice_collection, cfg)
668
- log.info("Speakers identificados de manera correcta.")
669
- else:
670
- speakers = [seg.get("speaker", f"SPEAKER_{i:02d}") for i, seg in enumerate(diar_segs)]
671
-
672
- # 6) Build the segment table
673
- audio_segments: List[Dict[str, Any]] = []
674
- for i, seg in enumerate(diar_segs):
675
- audio_segments.append(
676
- {
677
- "segment": i,
678
- "start": float(seg.get("start", 0.0)),
679
- "end": float(seg.get("end", 0.0)),
680
- "speaker": speakers[i] if i < len(speakers) else seg.get("speaker", f"SPEAKER_{i:02d}"),
681
- "text": trans[i] if i < len(trans) else "",
682
- "voice_embedding": embeddings[i],
683
- "clip_path": str(out_dir / "clips" / f"segment_{i:03d}.wav"),
684
- "lang": detected_langs[i] if i < len(detected_langs) else "auto",
685
- "lang_prob": detected_probs[i] if i < len(detected_probs) else 0.0,
686
- }
687
- )
688
-
689
- # 7) SRT
690
- srt_base_path = out_dir / f"transcripcion_diarizada_{Path(video_path).stem}"
691
- srt_unmodified_path = str(srt_base_path) + "_unmodified.srt"
692
-
693
- # Generate initial SRT
694
- try:
695
- generate_srt_from_diarization(
696
- diar_segs,
697
- [a["text"] for a in audio_segments],
698
- [a["speaker"] for a in audio_segments],
699
- srt_unmodified_path,
700
- cfg,
701
- )
702
-
703
- except Exception as e:
704
- log.warning(f"SRT generation failed: {e}")
705
- srt_unmodified_path = None
706
-
707
- return audio_segments, srt_unmodified_path, full_transcription
708
-
709
- # ----------------------------------- CLI -------------------------------------
710
- if __name__ == "__main__":
711
- import argparse
712
- import yaml
713
-
714
- ap = argparse.ArgumentParser(description="Veureu — Audio tools (self-contained)")
715
- ap.add_argument("--video", required=True)
716
- ap.add_argument("--out", default="results")
717
- ap.add_argument("--config", default="configs/config_veureu.yaml")
718
- args = ap.parse_args()
719
-
720
- cfg: Dict[str, Any] = {}
721
- p = Path(args.config)
722
- if p.exists():
723
- try:
724
- cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
725
- except Exception as e:
726
- log.warning(f"No se pudo leer el YAML de config: {e}")
727
-
728
- out_dir = Path(args.out) / Path(args.video).stem
729
- out_dir.mkdir(parents=True, exist_ok=True)
730
-
731
- # Add an OpenAI API key to your config file or here
732
- # Example: cfg["api_keys"] = {"openai": "sk-your-openai-api-key"}
733
- # Make sure you do not commit the key to git!
734
-
735
- segs, srt = process_audio_for_video(args.video, out_dir, cfg, voice_collection=None)
736
-
737
- print(json.dumps(
738
- {
739
- "segments": len(segs),
740
- "srt": srt,
741
- "detected_lang": (segs[0].get("lang") if segs else "auto"),
742
- "detected_prob": (segs[0].get("lang_prob") if segs else 0.0),
743
- },
744
- indent=2,
745
- ensure_ascii=False,
746
- ))
 
1
+ # audio_tools.py (ASR delegated to remote HF Space "veureu/asr")
2
+ # -----------------------------------------------------------------------------
3
+ # Veureu — AUDIO utilities (orchestrator w/ remote ASR)
4
+ # - FFmpeg extraction (WAV)
5
+ # - Diarization (pyannote) [local]
6
+ # - Voice embeddings (SpeechBrain ECAPA) [local]
7
+ # - Speaker identification (KMeans + ChromaDB optional) [local]
8
+ # - ASR: delegated to HF Space `veureu/asr` (faster-whisper-large-v3-ca-3catparla)
9
+ # - SRT generation
10
+ # - Orchestrator: process_audio_for_video(...)
11
+ # -----------------------------------------------------------------------------
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ import math
17
+ import os
18
+ import shlex
19
+ import subprocess
20
+ from pathlib import Path
21
+ from typing import List, Dict, Any, Tuple, Optional
22
+
23
+ import numpy as np
24
+
25
+ # Optional torchaudio for I/O and resampling (fallback to soundfile+librosa otherwise)
26
+ try:
27
+ import torch
28
+ import torchaudio as ta
29
+ import torchaudio.transforms as T
30
+ HAS_TORCHAUDIO = True
31
+ try:
32
+ ta.set_audio_backend("soundfile")
33
+ except Exception:
34
+ pass
35
+ except Exception:
36
+ HAS_TORCHAUDIO = False
37
+ ta = None # type: ignore
38
+
39
+ import soundfile as sf
40
+
41
+ # Pyannote for diarization (local)
42
+ from pyannote.audio import Pipeline
43
+
44
+ # Speaker embeddings (local)
45
+ from speechbrain.inference import SpeakerRecognition # v1.0+
46
+
47
+ # Clustering
48
+ from sklearn.cluster import KMeans
49
+ from sklearn.metrics import silhouette_score
50
+
51
+ # Router to remote Spaces (asr)
52
+ from llm_router import load_yaml, LLMRouter
53
+
54
+ # -------------------------------- Logging ------------------------------------
55
+ log = logging.getLogger("audio_tools")
56
+ if not log.handlers:
57
+ _h = logging.StreamHandler()
58
+ _h.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
59
+ log.addHandler(_h)
60
+ log.setLevel(logging.INFO)
61
+
62
+ # ------------------------------- Utilities -----------------------------------
63
+
64
+ def load_wav(path: str | Path, sr: int = 16000):
65
+ """Load audio as mono float32 at the requested sample rate."""
66
+ if HAS_TORCHAUDIO:
67
+ wav, in_sr = ta.load(str(path))
68
+ if in_sr != sr:
69
+ wav = ta.functional.resample(wav, in_sr, sr)
70
+ if wav.dim() > 1:
71
+ wav = wav.mean(dim=0, keepdim=True)
72
+ return wav.squeeze(0).numpy(), sr
73
+ import librosa
74
+ y, in_sr = sf.read(str(path), dtype="float32", always_2d=False)
75
+ if y.ndim > 1:
76
+ y = y.mean(axis=1)
77
+ if in_sr != sr:
78
+ y = librosa.resample(y, orig_sr=in_sr, target_sr=sr)
79
+ return y.astype(np.float32), sr
80
+
81
+ def save_wav(path: str | Path, y, sr: int = 16000):
82
+ """Save mono float32 wav."""
83
+ if HAS_TORCHAUDIO:
84
+ import torch
85
+ wav = torch.from_numpy(np.asarray(y, dtype=np.float32)).unsqueeze(0)
86
+ ta.save(str(path), wav, sr)
87
+ else:
88
+ sf.write(str(path), np.asarray(y, dtype=np.float32), sr)
89
+
90
+ def extract_audio_ffmpeg(
91
+ video_path: str,
92
+ audio_out: Path,
93
+ sr: int = 16000,
94
+ mono: bool = True,
95
+ ) -> str:
96
+ """Extract audio from video to WAV using ffmpeg."""
97
+ audio_out.parent.mkdir(parents=True, exist_ok=True)
98
+ cmd = f'ffmpeg -y -i "{video_path}" -vn {"-ac 1" if mono else ""} -ar {sr} -f wav "{audio_out}"'
99
+ subprocess.run(
100
+ shlex.split(cmd),
101
+ check=True,
102
+ stdout=subprocess.DEVNULL,
103
+ stderr=subprocess.DEVNULL,
104
+ )
105
+ return str(audio_out)
106
+
107
+ # ----------------------------------- ASR (REMOTE) -------------------------------------
108
+
109
+ def transcribe_audio_remote(audio_path: str | Path, cfg: Dict[str, Any]) -> Dict[str, Any]:
110
+ """
111
+ Send the audio file to the remote ASR Space `veureu/asr` (Gradio or HTTP).
112
+ The remote model is 'faster-whisper-large-v3-ca-3catparla' (Aina).
113
+ Returns a standardized dict: {'text': str, 'segments': list (may be empty)}
114
+ """
115
+ if not cfg:
116
+ cfg = load_yaml("config.yaml")
117
+ router = LLMRouter(cfg)
118
+ model_name = (cfg.get("models", {}).get("asr") or "whisper-catalan")
119
+ params = {
120
+ "language": "ca",
121
+ "model": "faster-whisper-large-v3-ca-3catparla",
122
+ "timestamps": True,
123
+ "diarization": False, # diarization stays local
124
+ }
125
+ result = router.asr_transcribe(str(audio_path), model=model_name, **params)
126
+
127
+ if isinstance(result, str):
128
+ return {"text": result, "segments": []}
129
+ if isinstance(result, dict):
130
+ if "text" not in result and "transcription" in result:
131
+ result["text"] = result["transcription"]
132
+ result.setdefault("segments", [])
133
+ return result
134
+ return {"text": str(result), "segments": []}
135
+
136
+ # -------------------------------- Diarization --------------------------------
137
+
138
+ def diarize_audio(
139
+ wav_path: str,
140
+ base_dir: Path,
141
+ clips_folder: str = "clips",
142
+ min_segment_duration: float = 20.0,
143
+ max_segment_duration: float = 50.0,
144
+ hf_token_env: str | None = None,
145
+ ) -> Tuple[List[str], List[Dict[str, Any]]]:
146
+ """Diarization with pyannote and clip export with pydub."""
147
+ from pydub import AudioSegment
148
+ audio = AudioSegment.from_wav(wav_path)
149
+ duration = len(audio) / 1000.0
150
+
151
+ pipeline = Pipeline.from_pretrained(
152
+ "pyannote/speaker-diarization-3.1",
153
+ use_auth_token=(hf_token_env or os.getenv("HF_TOKEN"))
154
+ )
155
+ diarization = pipeline(wav_path)
156
+
157
+ clips_dir = (base_dir / clips_folder)
158
+ clips_dir.mkdir(parents=True, exist_ok=True)
159
+ clip_paths: List[str] = []
160
+ segments: List[Dict[str, Any]] = []
161
+ spk_map: Dict[str, int] = {}
162
+ prev_end = 0.0
163
+
164
+ for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
165
+ start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))
166
+ if start < prev_end:
167
+ start = prev_end
168
+ if end <= start:
169
+ continue
170
+
171
+ seg_dur = end - start
172
+ if seg_dur < min_segment_duration:
173
+ continue
174
+
175
+ if seg_dur > max_segment_duration:
176
+ n = int(math.ceil(seg_dur / max_segment_duration))
177
+ sub_d = seg_dur / n
178
+ for j in range(n):
179
+ s = start + j * sub_d
180
+ e = min(end, start + (j + 1) * sub_d)
181
+ if e <= s:
182
+ continue
183
+ clip = audio[int(s * 1000):int(e * 1000)]
184
+ cp = clips_dir / f"segment_{i:03d}_{j:02d}.wav"
185
+ clip.export(cp, format="wav")
186
+ if speaker not in spk_map:
187
+ spk_map[speaker] = len(spk_map)
188
+ segments.append({"start": s, "end": e, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
189
+ clip_paths.append(str(cp))
190
+ prev_end = e
191
+ else:
192
+ clip = audio[int(start * 1000):int(end * 1000)]
193
+ cp = clips_dir / f"segment_{i:03d}.wav"
194
+ clip.export(cp, format="wav")
195
+ if speaker not in spk_map:
196
+ spk_map[speaker] = len(spk_map)
197
+ segments.append({"start": start, "end": end, "speaker": f"SPEAKER_{spk_map[speaker]:02d}"})
198
+ clip_paths.append(str(cp))
199
+ prev_end = end
200
+
201
+ if not segments:
202
+ cp = clips_dir / "segment_000.wav"
203
+ audio.export(cp, format="wav")
204
+ return [str(cp)], [{"start": 0.0, "end": duration, "speaker": "SPEAKER_00"}]
205
+
206
+ pairs = sorted(zip(clip_paths, segments), key=lambda x: x[1]["start"])
207
+ clip_paths, segments = [p[0] for p in pairs], [p[1] for p in pairs]
208
+ return clip_paths, segments
209
+
210
+ # ------------------------------ Voice embeddings -----------------------------
211
+
212
+ class VoiceEmbedder:
213
+ def __init__(self):
214
+ self.model = SpeakerRecognition.from_hparams(
215
+ source="speechbrain/spkrec-ecapa-voxceleb",
216
+ savedir="pretrained_models/spkrec-ecapa-voxceleb",
217
+ )
218
+ self.model.eval()
219
+
220
+ def embed(self, wav_path: str) -> List[float]:
221
+ if HAS_TORCHAUDIO:
222
+ waveform, sr = ta.load(wav_path)
223
+ target_sr = 16000
224
+ if sr != target_sr:
225
+ waveform = T.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
226
+ if waveform.shape[0] > 1:
227
+ waveform = waveform.mean(dim=0, keepdim=True)
228
+ min_samples = int(0.2 * target_sr)
229
+ if waveform.shape[1] < min_samples:
230
+ pad = min_samples - waveform.shape[1]
231
+ import torch
232
+ waveform = torch.cat([waveform, torch.zeros((1, pad))], dim=1)
233
+ with torch.no_grad(): # type: ignore
234
+ emb = self.model.encode_batch(waveform).squeeze().cpu().numpy().astype(float)
235
+ return emb.tolist()
236
+ else:
237
+ y, sr = load_wav(wav_path, sr=16000)
238
+ min_len = int(0.2 * 16000)
239
+ if len(y) < min_len:
240
+ y = np.pad(y, (0, min_len - len(y)))
241
+ import torch
242
+ w = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)
243
+ with torch.no_grad(): # type: ignore
244
+ emb = self.model.encode_batch(w).squeeze().cpu().numpy().astype(float)
245
+ return emb.tolist()
246
+
247
+
248
+ def embed_voice_segments(clip_paths: List[str]) -> List[List[float]]:
249
+ ve = VoiceEmbedder()
250
+ out: List[List[float]] = []
251
+ for cp in clip_paths:
252
+ try:
253
+ out.append(ve.embed(cp))
254
+ except Exception as e:
255
+ log.warning(f"Embedding error in {cp}: {e}")
256
+ out.append([])
257
+ return out
258
+
259
+ # --------------------------- Speaker identification --------------------------
260
+
261
+ def identify_speakers(
262
+ embeddings: List[List[float]],
263
+ voice_collection,
264
+ cfg: Dict[str, Any],
265
+ ) -> List[str]:
266
+ voice_cfg = cfg.get("voice_processing", {}).get("speaker_identification", {})
267
+ if not embeddings or sum(1 for e in embeddings if e) < 2:
268
+ return ["SPEAKER_00" for _ in embeddings]
269
+
270
+ valid = [e for e in embeddings if e and len(e) > 0]
271
+ if len(valid) < 2:
272
+ return ["SPEAKER_00" for _ in embeddings]
273
+
274
+ min_clusters = max(1, int(voice_cfg.get("min_speakers", 1)))
275
+ max_clusters = min(int(voice_cfg.get("max_speakers", 5)), len(valid) - 1)
276
+
277
+ if voice_cfg.get("find_optimal_clusters", True) and len(valid) > 2:
278
+ best_score, best_k = -1.0, min_clusters
279
+ for k in range(min_clusters, max_clusters + 1):
280
+ if k >= len(valid):
281
+ break
282
+ km = KMeans(n_clusters=k, random_state=42, n_init="auto")
283
+ labels = km.fit_predict(valid)
284
+ if len(set(labels)) > 1:
285
+ score = silhouette_score(valid, labels)
286
+ if score > best_score:
287
+ best_score, best_k = score, k
288
+ else:
289
+ best_k = min(max_clusters, max(min_clusters, int(voice_cfg.get("num_speakers", 2))))
290
+ best_k = max(1, min(best_k, len(valid) - 1))
291
+
292
+ km = KMeans(n_clusters=best_k, random_state=42, n_init="auto", init="k-means++")
293
+ labels = km.fit_predict(np.array(valid))
294
+ centers = km.cluster_centers_
295
+
296
+ cluster_to_name: Dict[int, str] = {}
297
+ unknown_counter = 0
298
+ for cid in range(best_k):
299
+ center = centers[cid].tolist()
300
+ name = f"SPEAKER_{cid:02d}"
301
+ if voice_collection is not None:
302
+ try:
303
+ q = voice_collection.query(query_embeddings=[center], n_results=1)
304
+ metas = q.get("metadatas", [[]])[0]
305
+ dists = q.get("distances", [[]])[0]
306
+ thr = voice_cfg.get("distance_threshold")
307
+ if dists and thr is not None and dists[0] > thr:
308
+ name = f"UNKNOWN_{unknown_counter}"
309
+ unknown_counter += 1
310
+ voice_collection.add(
311
+ embeddings=[center],
312
+ metadatas=[{"name": name}],
313
+ ids=[f"unk_{cid}_{unknown_counter}"],
314
+ )
315
+ else:
316
+ if metas and isinstance(metas[0], dict):
317
+ name = metas[0].get("nombre") or metas[0].get("name") \
318
+ or metas[0].get("speaker") or metas[0].get("identity") or name
319
+ except Exception as e:
320
+ log.warning(f"Voice KNN query failed: {e}")
321
+ cluster_to_name[cid] = name
322
+
323
+ personas: List[str] = []
324
+ vi = 0
325
+ for emb in embeddings:
326
+ if not emb:
327
+ personas.append("UNKNOWN")
328
+ else:
329
+ label = int(labels[vi])
330
+ personas.append(cluster_to_name.get(label, f"SPEAKER_{label:02d}"))
331
+ vi += 1
332
+ return personas
333
+
334
+ # ----------------------------------- SRT -------------------------------------
335
+
336
+ def _fmt_srt_time(seconds: float) -> str:
337
+ h = int(seconds // 3600)
338
+ m = int((seconds % 3600) // 60)
339
+ s = int(seconds % 60)
340
+ ms = int(round((seconds - int(seconds)) * 1000))
341
+ return f"{h:02}:{m:02}:{s:02},{ms:03}"
342
+
343
+ def generate_srt_from_diarization(
344
+ diarization_segments: List[Dict[str, Any]],
345
+ transcriptions: List[str],
346
+ speakers_per_segment: List[str],
347
+ output_srt_path: str,
348
+ cfg: Dict[str, Any],
349
+ ) -> None:
350
+ subs = cfg.get("subtitles", {})
351
+ max_cpl = int(subs.get("max_chars_per_line", 42))
352
+ max_lines = int(subs.get("max_lines_per_cue", 10))
353
+ speaker_display = subs.get("speaker_display", "brackets")
354
+
355
+ items: List[Dict[str, Any]] = []
356
+ n = min(len(diarization_segments), len(transcriptions), len(speakers_per_segment))
357
+ for i in range(n):
358
+ seg = diarization_segments[i]
359
+ text = (transcriptions[i] or "").strip()
360
+ spk = speakers_per_segment[i]
361
+ items.append({"start": float(seg.get("start", 0.0)), "end": float(seg.get("end", 0.0)), "text": text, "speaker": spk})
362
+
363
+ out = Path(output_srt_path)
364
+ out.parent.mkdir(parents=True, exist_ok=True)
365
+ with out.open("w", encoding="utf-8-sig") as f:
366
+ for i, it in enumerate(items, 1):
367
+ text = it["text"]
368
+ spk = it["speaker"]
369
+ if speaker_display == "brackets" and spk:
370
+ text = f"[{spk}]: {text}"
371
+ elif speaker_display == "prefix" and spk:
372
+ text = f"{spk}: {text}"
373
+ words = text.split()
374
+ lines: List[str] = []
375
+ cur = ""
376
+ for w in words:
377
+ if len(cur) + len(w) + (1 if cur else 0) <= max_cpl:
378
+ cur = (cur + " " + w) if cur else w
379
+ else:
380
+ lines.append(cur)
381
+ cur = w
382
+ if len(lines) >= max_lines - 1:
383
+ break
384
+ if cur and len(lines) < max_lines:
385
+ lines.append(cur)
386
+ f.write(f"{i}\n{_fmt_srt_time(it['start'])} --> {_fmt_srt_time(it['end'])}\n")
387
+ f.write("\n".join(lines) + "\n\n")
388
+
389
+ # ------------------------------ Orchestrator ---------------------------------
390
+
391
+ def process_audio_for_video(
392
+ video_path: str,
393
+ out_dir: Path,
394
+ cfg: Dict[str, Any],
395
+ voice_collection=None,
396
+ ) -> Tuple[List[Dict[str, Any]], Optional[str], str]:
397
+ """
398
+ Audio pipeline: FFmpeg -> diarization -> remote ASR (full + clips) -> embeddings -> speaker-ID -> SRT.
399
+ Returns (audio_segments, srt_path or None, full_transcription_text).
400
+ """
401
+ audio_cfg = cfg.get("audio_processing", {})
402
+ sr = int(audio_cfg.get("sample_rate", 16000))
403
+ fmt = audio_cfg.get("format", "wav")
404
+ wav_path = extract_audio_ffmpeg(video_path, out_dir / f"{Path(video_path).stem}.{fmt}", sr=sr)
405
+ log.info("Audio extraído")
406
+
407
+ diar_cfg = audio_cfg.get("diarization", {})
408
+ min_dur = float(diar_cfg.get("min_segment_duration", 20.0))
409
+ max_dur = float(diar_cfg.get("max_segment_duration", 50.0))
410
+ clip_paths, diar_segs = diarize_audio(wav_path, out_dir, "clips", min_dur, max_dur)
411
+ log.info("Clips de audio generados.")
412
+
413
+ full_transcription = ""
414
+ asr_section = cfg.get("asr", {})
415
+ if asr_section.get("enable_full_transcription", True):
416
+ log.info("Transcripción completa (remota, Space 'asr')...")
417
+ full_res = transcribe_audio_remote(wav_path, cfg)
418
+ full_transcription = full_res.get("text", "") or ""
419
+ log.info("Transcripción completa finalizada.")
420
+
421
+ log.info("Transcripción por clip (remota, Space 'asr')...")
422
+ trans: List[str] = []
423
+ for cp in clip_paths:
424
+ res = transcribe_audio_remote(cp, cfg)
425
+ trans.append(res.get("text", ""))
426
+
427
+ log.info("Se han transcrito todos los clips.")
428
+
429
+ embeddings = embed_voice_segments(clip_paths) if audio_cfg.get("enable_voice_embeddings", True) else [[] for _ in clip_paths]
430
+
431
+ if cfg.get("voice_processing", {}).get("speaker_identification", {}).get("enabled", True):
432
+ speakers = identify_speakers(embeddings, voice_collection, cfg)
433
+ log.info("Speakers identificados correctamente.")
434
+ else:
435
+ speakers = [seg.get("speaker", f"SPEAKER_{i:02d}") for i, seg in enumerate(diar_segs)]
436
+
437
+ audio_segments: List[Dict[str, Any]] = []
438
+ for i, seg in enumerate(diar_segs):
439
+ audio_segments.append(
440
+ {
441
+ "segment": i,
442
+ "start": float(seg.get("start", 0.0)),
443
+ "end": float(seg.get("end", 0.0)),
444
+ "speaker": speakers[i] if i < len(speakers) else seg.get("speaker", f"SPEAKER_{i:02d}"),
445
+ "text": trans[i] if i < len(trans) else "",
446
+ "voice_embedding": embeddings[i],
447
+ "clip_path": str(out_dir / "clips" / f"segment_{i:03d}.wav"),
448
+ "lang": "ca",
449
+ "lang_prob": 1.0,
450
+ }
451
+ )
452
+
453
+ srt_base_path = out_dir / f"transcripcion_diarizada_{Path(video_path).stem}"
454
+ srt_unmodified_path = str(srt_base_path) + "_unmodified.srt"
455
+
456
+ try:
457
+ generate_srt_from_diarization(
458
+ diar_segs,
459
+ [a["text"] for a in audio_segments],
460
+ [a["speaker"] for a in audio_segments],
461
+ srt_unmodified_path,
462
+ cfg,
463
+ )
464
+ except Exception as e:
465
+ log.warning(f"SRT generation failed: {e}")
466
+ srt_unmodified_path = None
467
+
468
+ return audio_segments, srt_unmodified_path, full_transcription
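A minimal driver for the orchestrator above, assuming `config.yaml` is present, ffmpeg is installed, and `HF_TOKEN` is set for the pyannote pipeline; the video path is a placeholder.

```python
# Illustrative sketch only; "sample_video.mp4" is a placeholder path.
from pathlib import Path

from audio_tools import process_audio_for_video
from llm_router import load_yaml

cfg = load_yaml("config.yaml")
out_dir = Path("results") / "sample_video"
out_dir.mkdir(parents=True, exist_ok=True)

segments, srt_path, full_text = process_audio_for_video(
    "sample_video.mp4", out_dir, cfg, voice_collection=None
)
print(f"{len(segments)} segments, SRT at {srt_path}")
```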
background_descriptor.py CHANGED
@@ -1,10 +1,118 @@
1
- # ================================
2
- # PATCH: background_descriptor.py (describe_keyframes_with_llm)
3
- # ================================
4
- # We replace the direct call to describe_montage_sequence with router.vision_describe
5
- # and keep the existing function as a fallback.
6
 
7
- # (replace the function in this file with this version)
 
8
 
9
  def describe_keyframes_with_llm(
10
  keyframes: List[Dict[str, Any]],
@@ -12,7 +120,6 @@ def describe_keyframes_with_llm(
12
  face_identities: Optional[set] = None,
13
  config_path: str | None = None,
14
  ) -> Tuple[List[Dict[str, Any]], Optional[str]]:
15
- from llm_router import load_yaml, LLMRouter
16
  cfg = load_yaml(config_path or "config.yaml")
17
  model_name = (cfg.get("background_descriptor", {}).get("description", {}) or {}).get("model", "salamandra-vision")
18
 
@@ -29,15 +136,14 @@ def describe_keyframes_with_llm(
29
  router = LLMRouter(cfg)
30
  descs = router.vision_describe(frame_paths, context=context, model=model_name)
31
  except Exception:
32
- # Fall back to the existing local implementation if the remote call fails
33
  descs = describe_montage_sequence(
34
  montage_path=str(montage_path),
35
  n=len(frame_paths),
36
  informacion=keyframes,
37
  face_identities=face_identities or set(),
38
- config_path=config_path or "config_veureu.yaml",
39
  )
40
  for i, fr in enumerate(keyframes):
41
  if i < len(descs):
42
  fr["description"] = descs[i]
43
- return keyframes, str(montage_path) if montage_path else None
 
1
+ from __future__ import annotations
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+ from pathlib import Path
 
 
4
 
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+
8
+ from vision_tools import (
9
+ keyframe_conditional_extraction_ana,
10
+ keyframe_every_second,
11
+ process_frames,
12
+ FaceOfImageEmbedding,
13
+ generar_montage,
14
+ describe_montage_sequence, # local fallback
15
+ )
16
+
17
+ from llm_router import load_yaml, LLMRouter
18
+
19
+ def cluster_ocr_sequential(ocr_list: List[Dict[str, Any]], threshold: float = 0.6) -> List[Dict[str, Any]]:
20
+ if not ocr_list:
21
+ return []
22
+ ocr_text = [item.get("ocr") for item in ocr_list if item and isinstance(item.get("ocr"), str)]
23
+ if not ocr_text:
24
+ return []
25
+ model = SentenceTransformer("all-MiniLM-L6-v2")
26
+ embeddings = model.encode(ocr_text, normalize_embeddings=True)
27
+
28
+ clusters_repr = []
29
+ prev_emb = embeddings[0]
30
+ start_time = ocr_list[0]["start"]
31
+ for i, emb in enumerate(embeddings[1:], 1):
32
+ sim = cosine_similarity([prev_emb], [emb])[0][0]
33
+ if sim < threshold:
34
+ clusters_repr.append({"index": i - 1, "start_time": start_time})
35
+ prev_emb = emb
36
+ start_time = ocr_list[i]["start"]
37
+ clusters_repr.append({"index": len(embeddings) - 1, "start_time": start_time})
38
+
39
+ ocr_final = []
40
+ for cluster in clusters_repr:
41
+ idx = cluster["index"]
42
+ if idx < len(ocr_list) and ocr_list[idx].get("ocr"):
43
+ it = ocr_list[idx]
44
+ ocr_final.append({
45
+ "ocr": it.get("ocr"),
46
+ "image_path": it.get("image_path"),
47
+ "start": cluster["start_time"],
48
+ "end": it.get("end"),
49
+ "faces": it.get("faces"),
50
+ })
51
+ return ocr_final
52
+
53
+ def build_keyframes_and_per_second(
54
+ video_path: str,
55
+ out_dir: Path,
56
+ cfg: Dict[str, Any],
57
+ face_collection=None,
58
+ ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
59
+ kf_dir = out_dir / "keyframes"
60
+ ps_dir = out_dir / "frames_per_second"
61
+
62
+ keyframes = keyframe_conditional_extraction_ana(video_path=video_path, output_dir=str(kf_dir))
63
+ per_second = keyframe_every_second(video_path=video_path, output_dir=str(ps_dir))
64
+
65
+ embedder = FaceOfImageEmbedding(deepface_model="Facenet512")
66
+ kf_proc = process_frames(frames=keyframes, config=cfg, face_col=face_collection, embedding_model=embedder)
67
+ ps_proc = process_frames(frames=per_second, config=cfg, face_col=face_collection, embedding_model=embedder)
68
+
69
+ ocr_list = [{
70
+ "ocr": fr.get("ocr"),
71
+ "image_path": fr.get("image_path"),
72
+ "start": fr.get("start"),
73
+ "end": fr.get("end"),
74
+ "faces": fr.get("faces"),
75
+ } for fr in ps_proc]
76
+ ocr_final = cluster_ocr_sequential(ocr_list, threshold=float(cfg.get("video_processing", {}).get("ocr_clustering", {}).get("similarity_threshold", 0.6)))
77
+
78
+ kf_mod: List[Dict[str, Any]] = []
79
+ idx = 1
80
+ for k in kf_proc:
81
+ ks, ke = k["start"], k["end"]
82
+ inicio = True
83
+ sustituido = False
84
+ for f in ocr_final:
85
+ if f["start"] >= ks and f["end"] <= ke and inicio:
86
+ kf_mod.append({
87
+ "id": idx,
88
+ "start": k["start"],
89
+ "end": None,
90
+ "image_path": f["image_path"],
91
+ "faces": f["faces"],
92
+ "ocr": f.get("ocr"),
93
+ "description": None,
94
+ })
95
+ idx += 1
96
+ sustituido = True
97
+ inicio = False
98
+ elif f["start"] >= ks and f["end"] <= ke and not inicio:
99
+ kf_mod.append({
100
+ "id": idx,
101
+ "start": f["start"],
102
+ "end": None,
103
+ "image_path": f["image_path"],
104
+ "faces": f["faces"],
105
+ "ocr": f.get("ocr"),
106
+ "description": None,
107
+ })
108
+ idx += 1
109
+ if not sustituido:
110
+ k2 = dict(k)
111
+ k2["id"] = idx
112
+ kf_mod.append(k2)
113
+ idx += 1
114
+
115
+ return kf_mod, ps_proc, 0.0
116
 
117
  def describe_keyframes_with_llm(
118
  keyframes: List[Dict[str, Any]],
 
120
  face_identities: Optional[set] = None,
121
  config_path: str | None = None,
122
  ) -> Tuple[List[Dict[str, Any]], Optional[str]]:
 
123
  cfg = load_yaml(config_path or "config.yaml")
124
  model_name = (cfg.get("background_descriptor", {}).get("description", {}) or {}).get("model", "salamandra-vision")
125
 
 
136
  router = LLMRouter(cfg)
137
  descs = router.vision_describe(frame_paths, context=context, model=model_name)
138
  except Exception:
 
139
  descs = describe_montage_sequence(
140
  montage_path=str(montage_path),
141
  n=len(frame_paths),
142
  informacion=keyframes,
143
  face_identities=face_identities or set(),
144
+ config_path=config_path or "config.yaml",
145
  )
146
  for i, fr in enumerate(keyframes):
147
  if i < len(descs):
148
  fr["description"] = descs[i]
149
+ return keyframes, str(montage_path) if montage_path else None
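To illustrate the input shape `cluster_ocr_sequential` expects, a small sketch with made-up OCR strings and timestamps; the actual cluster boundaries depend on the MiniLM embeddings and the configured threshold.

```python
# Made-up OCR entries; each dict mirrors the fields built in build_keyframes_and_per_second.
from background_descriptor import cluster_ocr_sequential

ocr_list = [
    {"ocr": "Benvinguts al programa", "image_path": "fps/0001.jpg", "start": 0.0, "end": 1.0, "faces": []},
    {"ocr": "Benvinguts al programa!", "image_path": "fps/0002.jpg", "start": 1.0, "end": 2.0, "faces": []},
    {"ocr": "Segona part", "image_path": "fps/0003.jpg", "start": 2.0, "end": 3.0, "faces": []},
]

# Consecutive frames whose OCR embeddings stay above the threshold collapse into
# one cluster; each surviving entry keeps the start time of its cluster.
clusters = cluster_ocr_sequential(ocr_list, threshold=0.6)
print([(c["ocr"], c["start"]) for c in clusters])
```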
config.yaml CHANGED
@@ -3,93 +3,90 @@
3
  # ===========================
4
 
5
  engine:
6
- # Artifact output
7
  output_root: "results"
8
- # Vector index persistence
9
- database:
10
- enabled: true
11
- persist_directory: "chroma_db"
12
- enable_face_recognition: true
13
- enable_voice_recognition: true
14
- face_collection: "index_faces"
15
- voice_collection: "index_voices"
16
-
17
- # Jobs asíncronos (si implementas el patrón de cola)
18
- jobs:
19
- enabled: true
20
- max_workers: 1 # Ajusta según recursos del Space
21
- result_ttl_seconds: 86400 # 1 día
22
 
23
  api:
24
  cors_allow_origins: ["*"]
25
- # Tiempo máximo (segundos) de una petición síncrona (si usas el endpoint sync)
26
  sync_timeout_seconds: 3600
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  video_processing:
29
- # Metadatos de extracción de frames
30
  keyframes:
31
- # Si tu extractor condicional usa umbrales, puedes incluirlos aquí:
32
  conditional_extraction:
33
  enable: true
34
- # ejemplos de parámetros (ajústalos a tu extractor real)
35
  min_scene_length_seconds: 1.5
36
  difference_threshold: 28.0
37
 
38
  frames_per_second:
39
  enable: true
40
- fps: 1.0 # "frecuencia de frames de análisis" (1 frame/segundo por defecto)
41
 
42
  ocr:
43
- engine: "tesseract" # o "easyocr"
44
- language_hint: "spa" # si aplica
45
- # Solo si usas pytesseract:
46
- tesseract_cmd: "" # ruta binaria si no está en PATH
47
 
48
  faces:
49
- detector_model: "mtcnn" # ejemplar; ajústalo a tu vision_tools
50
- embedding_model: "Facenet512" # usado en background_descriptor.FaceOfImageEmbedding
51
  min_face_size: 32
52
  detection_confidence: 0.85
53
 
54
  ocr_clustering:
55
  method: "sequential_similarity"
56
  sentence_transformer: "all-MiniLM-L6-v2"
57
- similarity_threshold: 0.60 # "número de clusters" implícito por umbral (más alto ⇒ menos clusters)
58
 
59
  audio_processing:
60
- # El ASR principal será remoto si seleccionas whisper catalan (ver models/routing)
 
 
61
  diarization:
62
  enabled: true
63
- # ejemplo de parámetros de diarización si los usas en audio_tools
64
- min_speaker_duration: 0.8
65
- max_speakers: 8
66
 
 
67
  speaker_embedding:
68
  enabled: true
69
- # umbral para asig. de identidad en voice_collection
 
 
70
  speaker_identification:
 
 
 
 
71
  distance_threshold: 0.40
72
 
73
- # Si mantienes transcripción local para otros idiomas/modelos:
74
- local_asr:
75
- enabled: false # usarás remoto para whisper catalan
76
- model: "" # (vacío porque lo gestionas vía Spaces)
77
 
78
  background_descriptor:
79
- # Parámetros del montaje y descripción con LLM
80
  montage:
81
  enable: true
82
- max_frames: 12 # tope de frames en el collage/descripcion
83
- grid: "auto" # o 3x4, etc.
84
 
85
  description:
86
- model: "salamandra-vision" # puede ser "salamandra-vision" o "gpt-4o-mini"
87
  max_tokens: 512
88
  temperature: 0.2
89
- # Si hay identidades detectadas, se pasan como hints (ya lo hace tu pipeline)
90
 
91
  identity:
92
- # Reglas de mapeo temporal y enriquecimiento
93
  timeline_mapping:
94
  per_second_frames_source: "frames_per_second"
95
  attach_faces_to:
@@ -100,23 +97,26 @@ identity:
100
  narration:
101
  model: "salamandra-instruct" # "salamandra-instruct" | "gpt-4o-mini"
102
  une_guidelines_path: "UNE_153010.txt"
103
- # Restricciones temporales (para UNE-153010)
104
  timing:
105
- max_ad_duration_ratio: 0.60 # proporción del hueco disponible que puede ocupar la AD
106
  min_gap_seconds: 1.20
107
  min_ad_seconds: 0.80
108
  llm:
109
  max_tokens: 1024
110
  temperature: 0.2
111
 
 
 
 
 
 
112
  models:
113
- # Selección de modelos de alto nivel por tarea
114
- instruct: "salamandra-instruct" # para NarrationSystem y otros textos
115
- vision: "salamandra-vision" # para describir frames/montajes
116
- tools: "salamandra-tools" # si necesitas funciones con tool-calling
117
- asr: "whisper-catalan" # ASR catalán
118
 
119
- # Enrutado: qué modelos se ejecutan REMOTO (vía otros Spaces)
120
  routing:
121
  use_remote_for:
122
  - "salamandra-instruct"
@@ -125,47 +125,43 @@ models:
125
  - "whisper-catalan"
126
 
127
  remote_spaces:
128
- # Dónde llamar cuando models.routing decide “remoto”
129
  user: "veureu"
130
 
131
  endpoints:
132
- # Nota: rellena las URLs reales cuando publiques los Spaces.
133
  salamandra-instruct:
134
  space: "schat"
135
  base_url: "https://veureu-schat.hf.space"
136
- client: "gradio" # "gradio" o "http"
137
- predict_route: "/run/predict" # si usas gradio_client no necesitas ruta
138
 
139
  salamandra-vision:
140
  space: "svision"
141
  base_url: "https://veureu-svision.hf.space"
142
  client: "gradio"
143
- predict_route: "/run/predict"
144
 
145
  salamandra-tools:
146
  space: "stools"
147
  base_url: "https://veureu-stools.hf.space"
148
  client: "gradio"
149
- predict_route: "/run/predict"
150
 
151
  whisper-catalan:
152
- space: "ars"
153
- base_url: "https://veureu-ars.hf.space"
154
  client: "gradio"
155
- predict_route: "/run/predict"
156
 
157
- # Parámetros de red y robustez
158
  http:
159
- timeout_seconds: 120
160
  retries: 3
161
  backoff_seconds: 2.0
162
 
163
  security:
164
- # Si necesitas pasar tokens (p. ej., tokens del Hub o auth propia)
165
  use_hf_token: true
166
- hf_token_env: "HF_TOKEN" # nombre de la variable de entorno para el token
167
  allow_insecure_tls: false
168
 
169
  logging:
170
- level: "INFO" # DEBUG | INFO | WARNING | ERROR
171
  json: false
 
3
  # ===========================
4
 
5
  engine:
 
6
  output_root: "results"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  api:
9
  cors_allow_origins: ["*"]
 
10
  sync_timeout_seconds: 3600
11
 
12
+ database:
13
+ enabled: true
14
+ persist_directory: "chroma_db"
15
+ enable_face_recognition: true
16
+ enable_voice_recognition: true
17
+ face_collection: "index_faces"
18
+ voice_collection: "index_voices"
19
+
20
+ jobs:
21
+ enabled: false # si activas cola async, cámbialo a true y añade JobManager en main_api.py
22
+ max_workers: 1
23
+ result_ttl_seconds: 86400
24
+
25
  video_processing:
 
26
  keyframes:
 
27
  conditional_extraction:
28
  enable: true
 
29
  min_scene_length_seconds: 1.5
30
  difference_threshold: 28.0
31
 
32
  frames_per_second:
33
  enable: true
34
+ fps: 1.0 # Frecuencia de frames de análisis
35
 
36
  ocr:
37
+ engine: "tesseract" # "tesseract" | "easyocr"
38
+ language_hint: "spa"
39
+ tesseract_cmd: "" # si no está en PATH, deja la ruta
 
40
 
41
  faces:
42
+ detector_model: "mtcnn" # ajusta a tu vision_tools
43
+ embedding_model: "Facenet512" # usado por FaceOfImageEmbedding
44
  min_face_size: 32
45
  detection_confidence: 0.85
46
 
47
  ocr_clustering:
48
  method: "sequential_similarity"
49
  sentence_transformer: "all-MiniLM-L6-v2"
50
+ similarity_threshold: 0.60 # mayor ⇒ menos clusters
51
 
52
  audio_processing:
53
+ sample_rate: 16000
54
+ format: "wav"
55
+
56
  diarization:
57
  enabled: true
58
+ min_segment_duration: 20.0 # en segundos (post-procesado de turnos)
59
+ max_segment_duration: 50.0
 
60
 
61
+ enable_voice_embeddings: true # SpeechBrain ECAPA
62
  speaker_embedding:
63
  enabled: true
64
+
65
+ # Identificación de hablantes (clustering + Chroma)
66
+ voice_processing:
67
  speaker_identification:
68
+ enabled: true
69
+ find_optimal_clusters: true
70
+ min_speakers: 1
71
+ max_speakers: 5
72
  distance_threshold: 0.40
73
 
74
+ asr:
75
+ # Controla la transcripción del audio completo además de los clips (útil para contexto global)
76
+ enable_full_transcription: true
 
77
 
78
  background_descriptor:
 
79
  montage:
80
  enable: true
81
+ max_frames: 12
82
+ grid: "auto"
83
 
84
  description:
85
+ model: "salamandra-vision" # o "gpt-4o-mini"
86
  max_tokens: 512
87
  temperature: 0.2
 
88
 
89
  identity:
 
90
  timeline_mapping:
91
  per_second_frames_source: "frames_per_second"
92
  attach_faces_to:
 
97
  narration:
98
  model: "salamandra-instruct" # "salamandra-instruct" | "gpt-4o-mini"
99
  une_guidelines_path: "UNE_153010.txt"
 
100
  timing:
101
+ max_ad_duration_ratio: 0.60
102
  min_gap_seconds: 1.20
103
  min_ad_seconds: 0.80
104
  llm:
105
  max_tokens: 1024
106
  temperature: 0.2
107
 
108
+ subtitles:
109
+ max_chars_per_line: 42
110
+ max_lines_per_cue: 10
111
+ speaker_display: "brackets" # "brackets" | "prefix" | "none"
112
+
113
  models:
114
+ # alias de tarea modelo
115
+ instruct: "salamandra-instruct"
116
+ vision: "salamandra-vision"
117
+ tools: "salamandra-tools"
118
+ asr: "whisper-catalan" # apunta al Space veureu/asr (Aina: faster-whisper-large-v3-ca-3catparla)
119
 
 
120
  routing:
121
  use_remote_for:
122
  - "salamandra-instruct"
 
125
  - "whisper-catalan"
126
 
127
  remote_spaces:
 
128
  user: "veureu"
129
 
130
  endpoints:
 
131
  salamandra-instruct:
132
  space: "schat"
133
  base_url: "https://veureu-schat.hf.space"
134
+ client: "gradio"
135
+ predict_route: "/predict"
136
 
137
  salamandra-vision:
138
  space: "svision"
139
  base_url: "https://veureu-svision.hf.space"
140
  client: "gradio"
141
+ predict_route: "/predict"
142
 
143
  salamandra-tools:
144
  space: "stools"
145
  base_url: "https://veureu-stools.hf.space"
146
  client: "gradio"
147
+ predict_route: "/predict"
148
 
149
  whisper-catalan:
150
+ space: "asr"
151
+ base_url: "https://veureu-asr.hf.space"
152
  client: "gradio"
153
+ predict_route: "/predict"
154
 
 
155
  http:
156
+ timeout_seconds: 180
157
  retries: 3
158
  backoff_seconds: 2.0
159
 
160
  security:
 
161
  use_hf_token: true
162
+ hf_token_env: "HF_TOKEN"
163
  allow_insecure_tls: false
164
 
165
  logging:
166
+ level: "INFO"
167
  json: false
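
The `models.routing` and `remote_spaces` blocks above decide which task goes to which Space. A small sketch of how those keys resolve to a concrete endpoint, mirroring the lookups in `llm_router.py` and the keys defined above (the print format is purely illustrative):

```python
# Sketch: resolve the ASR model to its remote endpoint from config.yaml.
from pathlib import Path

import yaml

cfg = yaml.safe_load(Path("config.yaml").read_text(encoding="utf-8"))

remote = set(cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
spaces = cfg.get("remote_spaces", {})
endpoints = spaces.get("endpoints", {})
user = spaces.get("user", "veureu")
timeout = int(spaces.get("http", {}).get("timeout_seconds", 180))

model = cfg.get("models", {}).get("asr", "whisper-catalan")
if model in remote:
    info = endpoints.get(model, {})
    # An explicit base_url wins; otherwise the URL is derived as <user>-<space>.hf.space.
    base_url = info.get("base_url") or f"https://{user}-{info.get('space')}.hf.space"
    print(f"{model} -> {base_url} via {info.get('client', 'gradio')} (timeout {timeout}s)")
else:
    print(f"{model} would need a local backend")
```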
llm_router.py CHANGED
@@ -1,22 +1,18 @@
-import os
-# ============================
-# File: llm_router.py
-# ============================
+# llm_router.py: routes calls to remote Spaces according to config.yaml
 from __future__ import annotations
 from typing import Any, Dict, List, Optional
 from pathlib import Path
+import os
 import yaml
 
 from remote_clients import InstructClient, VisionClient, ToolsClient, ASRClient
 
-
 def load_yaml(path: str) -> Dict[str, Any]:
     p = Path(path)
     if not p.exists():
         return {}
     return yaml.safe_load(p.read_text(encoding="utf-8")) or {}
 
-
 class LLMRouter:
     def __init__(self, cfg: Dict[str, Any]):
         self.cfg = cfg
@@ -29,9 +25,9 @@ class LLMRouter:
         def mk(endpoint_key: str, cls):
             info = eps.get(endpoint_key, {})
             base_url = info.get("base_url") or f"https://{base_user}-{info.get('space')}.hf.space"
-            client = cls(base_url=base_url, use_gradio=(info.get("client", "gradio") == "gradio"), hf_token=hf_token,
-                         timeout=int(cfg.get("remote_spaces", {}).get("http", {}).get("timeout_seconds", 120)))
-            return client
+            use_gradio = (info.get("client", "gradio") == "gradio")
+            timeout = int(cfg.get("remote_spaces", {}).get("http", {}).get("timeout_seconds", 180))
+            return cls(base_url=base_url, use_gradio=use_gradio, hf_token=hf_token, timeout=timeout)
 
         self.clients = {
             "salamandra-instruct": mk("salamandra-instruct", InstructClient),
@@ -44,8 +40,6 @@ class LLMRouter:
     def instruct(self, prompt: str, system: Optional[str] = None, model: str = "salamandra-instruct", **kwargs) -> str:
         if model in self.rem:
             return self.clients[model].generate(prompt, system=system, **kwargs)  # type: ignore
-        # local fallback (e.g., gpt-4o-mini or gpt-oss via your own local API, if available)
-        # you could plug in an OpenAI-compatible API here if you have one
         raise RuntimeError(f"Modelo local no implementado para: {model}")
 
     # ---- VISION ----
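
A possible call site for the router above; the prompt, system message, and model name are placeholders, and only behavior visible in this diff is relied on:

```python
# Sketch: use LLMRouter with the remote routing configured in config.yaml.
from llm_router import load_yaml, LLMRouter

cfg = load_yaml("config.yaml")
router = LLMRouter(cfg)

# instruct() raises RuntimeError for models that are not routed to a remote Space.
summary = router.instruct(
    "Summarize the scene in one sentence.",
    system="You are a concise audio-description writer.",
    model="salamandra-instruct",
)
print(summary)

# The vision path is called with the same shape as in background_descriptor.py:
# descs = router.vision_describe(frame_paths, context=context, model="salamandra-vision")
```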
scripts/remote_clients.py ADDED
@@ -0,0 +1,78 @@
+# remote_clients.py: clients for remote Spaces (Gradio/HTTP)
+from __future__ import annotations
+from typing import Any, Dict, List, Optional
+import os, json, requests
+from tenacity import retry, stop_after_attempt, wait_exponential
+
+try:
+    from gradio_client import Client as GradioClient
+except Exception:
+    GradioClient = None  # type: ignore
+
+class BaseRemoteClient:
+    def __init__(self, base_url: str, use_gradio: bool = True, hf_token: Optional[str] = None, timeout: int = 180):
+        self.base_url = base_url.rstrip("/")
+        self.use_gradio = use_gradio and GradioClient is not None
+        self.hf_token = hf_token or os.getenv("HF_TOKEN")
+        self.timeout = timeout
+        self._client = None
+        if self.use_gradio:
+            headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else None
+            self._client = GradioClient(self.base_url, hf_token=self.hf_token, headers=headers)
+
+    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
+    def _post_json(self, route: str, payload: Dict[str, Any]) -> Dict[str, Any]:
+        url = f"{self.base_url}{route}"
+        headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else {}
+        r = requests.post(url, json=payload, headers=headers, timeout=self.timeout)
+        r.raise_for_status()
+        return r.json()
+
+class InstructClient(BaseRemoteClient):
+    def generate(self, prompt: str, system: Optional[str] = None, **kwargs) -> str:
+        if self.use_gradio and self._client:
+            out = self._client.predict(prompt, api_name="/predict")
+            return str(out)
+        data = {"prompt": prompt, "system": system, **kwargs}
+        res = self._post_json("/generate", data)
+        return res.get("text", "")
+
+class VisionClient(BaseRemoteClient):
+    def describe(self, image_paths: List[str], context: Optional[Dict[str, Any]] = None, **kwargs) -> List[str]:
+        if self.use_gradio and self._client:
+            out = self._client.predict(image_paths, json.dumps(context or {}), api_name="/predict")
+            if isinstance(out, str):
+                try:
+                    return json.loads(out)
+                except Exception:
+                    return [out]
+            return list(out)
+        data = {"images": image_paths, "context": context or {}, **kwargs}
+        res = self._post_json("/describe", data)
+        return res.get("descriptions", [])
+
+class ToolsClient(BaseRemoteClient):
+    def chat(self, messages: List[Dict[str, str]], tools: Optional[List[Dict[str, Any]]] = None, **kwargs) -> Dict[str, Any]:
+        if self.use_gradio and self._client:
+            out = self._client.predict(json.dumps(messages), json.dumps(tools or []), api_name="/predict")
+            if isinstance(out, str):
+                try:
+                    return json.loads(out)
+                except Exception:
+                    return {"text": out}
+            return out
+        data = {"messages": messages, "tools": tools or [], **kwargs}
+        return self._post_json("/chat", data)
+
+class ASRClient(BaseRemoteClient):
+    def transcribe(self, audio_path: str, **kwargs) -> Dict[str, Any]:
+        if self.use_gradio and self._client:
+            out = self._client.predict(audio_path, api_name="/predict")
+            if isinstance(out, str):
+                return {"text": out}
+            return out
+        files = {"file": open(audio_path, "rb")}
+        headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else {}
+        r = requests.post(f"{self.base_url}/transcribe", files=files, data=kwargs, headers=headers, timeout=self.timeout)
+        r.raise_for_status()
+        return r.json()
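
To exercise one of these clients directly (outside `LLMRouter`), something like the following should work; the audio path is a placeholder, and the import assumes `remote_clients` is importable as a top-level module, which is how `llm_router.py` imports it even though the file lives under `scripts/`:

```python
# Direct smoke test for ASRClient; the Space URL matches config.yaml,
# HF_TOKEN is only needed if the Space is private.
import os

from remote_clients import ASRClient

client = ASRClient(
    base_url="https://veureu-asr.hf.space",
    use_gradio=True,                 # falls back to plain HTTP if gradio_client is not installed
    hf_token=os.getenv("HF_TOKEN"),
    timeout=180,
)
result = client.transcribe("clips/segment_001.wav")  # placeholder audio file
print(result.get("text", ""))
```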