VeuReu committed
Commit b17b915 · verified · 1 Parent(s): 7e13059

Upload 15 files

LICENSE ADDED
@@ -0,0 +1,15 @@
+ MIT License
+
+ Copyright (c) 2025 - Dive-tech
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
README.md CHANGED
@@ -1,10 +1,22 @@
  ---
- title: veureu-engine
- emoji: 📉
- colorFrom: green
  colorTo: blue
  sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Veureu Engine (FastAPI)
+ emoji: 🎧
+ colorFrom: gray
  colorTo: blue
  sdk: docker
+ app_file: main_api.py
  pinned: false
  ---

+ # Veureu Engine API
+
+ Endpoints:
+ - `POST /process_video` — orchestrates `video_processing.py` (audio_tools + vision_tools + identity/background managers); see the request sketch below.
+ - `POST /load_casting` — builds the face and voice indices in ChromaDB.
+ - `POST /refine_narration` — invokes `narration_system.py` to refine the narration and SRT according to UNE-153010.
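A minimal request sketch against these endpoints; the Space URL and the multipart field name are assumptions, since the actual request/response schemas are defined in `main_api.py` (not shown in this excerpt).

```python
# Hypothetical client call: BASE_URL and the "file" field name are assumptions,
# not taken from main_api.py; adjust them to the real FastAPI route signatures.
import requests

BASE_URL = "https://veureu-veureu-engine.hf.space"  # placeholder Space URL

with open("sample.mp4", "rb") as f:
    r = requests.post(
        f"{BASE_URL}/process_video",
        files={"file": ("sample.mp4", f, "video/mp4")},
        timeout=3600,  # video processing can take a long time
    )
r.raise_for_status()
print(r.json())
```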
audio_tools.py ADDED
@@ -0,0 +1,725 @@
1
+ # audio_tools.py
2
+ # -----------------------------------------------------------------------------
3
+ # Veureu — AUDIO utilities (self-contained)
4
+ # - FFmpeg extraction (WAV)
5
+ # - Diarization (pyannote)
6
+ # - ASR:
7
+ # * Catalan ("ca") -> AINA Whisper
8
+ # * Other languages -> Lightweight generic Whisper
9
+ # - Integrated Language ID (Whisper via faster-whisper)
10
+ # - Voice embeddings (SpeechBrain ECAPA)
11
+ # - Speaker identification (KMeans + optional ChromaDB collection)
12
+ # - SRT generation
13
+ # - Orchestrator: process_audio_for_video(...)
14
+ # - ADDED: ASR of full audio and LLM-based SRT correction
15
+ # -----------------------------------------------------------------------------
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
+ import json
23
+ import logging
24
+ import math
25
+ import os
26
+ import shlex
27
+ import subprocess
28
+
29
+ import numpy as np
30
+ import torch
31
+ import torchaudio
32
+ import torchaudio.transforms as T
33
+ from pydub import AudioSegment
34
+ from pyannote.audio import Pipeline
35
+ from speechbrain.pretrained import SpeakerRecognition
36
+ from sklearn.cluster import KMeans
37
+ from sklearn.metrics import silhouette_score
38
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
39
+ from openai import OpenAI as OpenAIClient
40
+ import noisereduce as nr
41
+
42
+ # -------------------------------- Logging ------------------------------------
43
+ log = logging.getLogger("audio_tools")
44
+ if not log.handlers:
45
+ _h = logging.StreamHandler()
46
+ _h.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
47
+ log.addHandler(_h)
48
+ log.setLevel(logging.INFO)
49
+
50
+ # ------------------------------- Utilities -----------------------------------
51
+
52
+ def _pick_device_auto(dev_cfg: str) -> str:
53
+ """Resolve 'auto' device to cuda/cpu."""
54
+ if dev_cfg == "auto":
55
+ return "cuda" if torch.cuda.is_available() else "cpu"
56
+ return dev_cfg
57
+
58
+ def load_config(path: str = "configs/config_veureu.yaml") -> Dict[str, Any]:
59
+ p = Path(path)
60
+ if not p.exists():
61
+ log.warning("Config file not found: %s (using defaults)", path)
62
+ return {}
63
+ try:
64
+ import yaml
65
+ cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
66
+ cfg["__path__"] = str(p)
67
+ return cfg
68
+ except Exception as e:
69
+ log.error("Failed to read YAML config: %s", e)
70
+ return {}
71
+
72
+ # ------------------------------- Extraction ----------------------------------
73
+
74
+ def extract_audio_ffmpeg(
75
+ video_path: str,
76
+ audio_out: Path,
77
+ sr: int = 16000,
78
+ mono: bool = True,
79
+ ) -> str:
80
+ """Extract audio from video to WAV using ffmpeg."""
81
+ audio_out.parent.mkdir(parents=True, exist_ok=True)
82
+ cmd = f'ffmpeg -y -i "{video_path}" -vn {"-ac 1" if mono else ""} -ar {sr} -f wav "{audio_out}"'
83
+ subprocess.run(
84
+ shlex.split(cmd),
85
+ check=True,
86
+ stdout=subprocess.DEVNULL,
87
+ stderr=subprocess.DEVNULL,
88
+ )
89
+ return str(audio_out)
90
+
91
+ # ----------------------------------- ASR -------------------------------------
92
+
93
+ @dataclass
94
+ class AinaASR:
95
+ """ASR for Catalan using the AINA Whisper model."""
96
+ model_name: str = "projecte-aina/whisper-large-v3-ca-3catparla"
97
+ device: str = "cuda"
98
+
99
+ def __post_init__(self):
100
+ dev = self.device
101
+ if dev == "cuda" and not torch.cuda.is_available():
102
+ dev = "cpu"
103
+ self.processor = WhisperProcessor.from_pretrained(self.model_name)
104
+ self.model = WhisperForConditionalGeneration.from_pretrained(self.model_name).to(dev)
105
+ self.device = dev
106
+ log.info(f"ASR AINA loaded on {self.device}: {self.model_name}")
107
+
108
+ def transcribe_wav(self, wav_path: str) -> str:
109
+ waveform, sr = torchaudio.load(wav_path)
110
+ inputs = self.processor(
111
+ waveform.numpy(), sampling_rate=sr, return_tensors="pt"
112
+ ).input_features.to(self.model.device)
113
+ with torch.no_grad():
114
+ ids = self.model.generate(inputs, max_new_tokens=440)[0]
115
+ txt = self.processor.decode(ids)
116
+ norm = getattr(self.processor.tokenizer, "_normalize", None)
117
+ return norm(txt) if callable(norm) else txt
118
+
119
+ def transcribe_long_audio(
120
+ self,
121
+ wav_path: str,
122
+ chunk_length_s: int = 20,
123
+ overlap_s: int = 2,
124
+ ) -> str:
125
+ waveform, sr = torchaudio.load(wav_path)
126
+ total_samples = waveform.shape[1]
127
+ chunk_size = chunk_length_s * sr
128
+ overlap_size = overlap_s * sr
129
+
130
+ transcriptions = []
131
+ start = 0
132
+
133
+ while start < total_samples:
134
+ end = min(start + chunk_size, total_samples)
135
+ chunk = waveform[:, start:end]
136
+
137
+ input_features = self.processor(
138
+ chunk.numpy(),
139
+ sampling_rate=sr,
140
+ return_tensors="pt"
141
+ ).input_features.to(self.model.device)
142
+
143
+ with torch.no_grad():
144
+ predicted_ids = self.model.generate(
145
+ input_features,
146
+ max_new_tokens=440,
147
+ num_beams=1, # you can try beam search here
148
+ )[0]
149
+
150
+ text = self.processor.decode(predicted_ids, skip_special_tokens=True)
151
+ transcriptions.append(text.strip())
152
+
153
+ # advance with overlap
154
+ start += chunk_size - overlap_size
155
+
156
+ return " ".join(transcriptions).strip()
157
+
158
+ @dataclass
159
+ class WhisperASR:
160
+ """Lightweight generic ASR based on Whisper for non-Catalan languages."""
161
+ model_name: str = "openai/whisper-small" # change to 'base' for an even lighter model
162
+ device: str = "cuda"
163
+ language: Optional[str] = None # force language, e.g. "es", "en", etc.
164
+
165
+ def __post_init__(self):
166
+ dev = self.device
167
+ if dev == "cuda" and not torch.cuda.is_available():
168
+ dev = "cpu"
169
+ self.processor = WhisperProcessor.from_pretrained(self.model_name)
170
+ self.model = WhisperForConditionalGeneration.from_pretrained(self.model_name).to(dev)
171
+ self.device = dev
172
+ log.info(f"ASR Whisper loaded on {self.device}: {self.model_name} (lang hint: {self.language})")
173
+
174
+ def transcribe_wav(self, wav_path: str) -> str:
175
+ waveform, sr = torchaudio.load(wav_path)
176
+ inputs = self.processor(
177
+ waveform.numpy(), sampling_rate=sr, return_tensors="pt"
178
+ ).input_features.to(self.model.device)
179
+
180
+ gen_kwargs: Dict[str, Any] = dict(max_new_tokens=444)
181
+ if self.language and self.language != "auto":
182
+ try:
183
+ forced_ids = self.processor.get_decoder_prompt_ids(
184
+ language=self.language, task="transcribe"
185
+ )
186
+ gen_kwargs["forced_decoder_ids"] = forced_ids
187
+ except Exception:
188
+ # If the model/processor does not support forced ids, continue without forcing
189
+ pass
190
+
191
+ with torch.no_grad():
192
+ ids = self.model.generate(inputs, **gen_kwargs)[0]
193
+ txt = self.processor.decode(ids)
194
+ norm = getattr(self.processor.tokenizer, "_normalize", None)
195
+ return norm(txt) if callable(norm) else txt
196
+
197
+ # ------------------------------ Language ID ----------------------------------
198
+
199
+ @dataclass
200
+ class WhisperLIDConfig:
201
+ """Configuration for language detection with faster-whisper."""
202
+ model_name: str = "Systran/faster-whisper-small"
203
+ device: str = "auto"
204
+ compute_type: str = "float32" # "int8" | "float16" | "float32"
205
+ beam_size: int = 1
206
+ chunk_seconds: float = 30.0
207
+ prob_threshold: float = 0.5
208
+ fallback_lang: str = "auto"
209
+
210
+ def detect_language_with_whisper(
211
+ wav_path: str,
212
+ cfg: Dict[str, Any],
213
+ ) -> Tuple[str, float]:
214
+ """
215
+ Detects language using faster-whisper (WhisperModel). Returns (lang_iso, prob).
216
+ In case of failure, returns (fallback_lang, 0.0).
217
+ """
218
+ lid_cfg_d = (cfg.get("asr", {})
219
+ .get("language_detection", {})
220
+ .get("whisper_lid", {}))
221
+ lid_cfg = WhisperLIDConfig(
222
+ model_name=lid_cfg_d.get("model_name", "Systran/faster-whisper-small",),
223
+ device=_pick_device_auto(lid_cfg_d.get("device", "auto")),
224
+ compute_type=lid_cfg_d.get("compute_type", "float32"),
225
+ beam_size=int(lid_cfg_d.get("beam_size", 1)),
226
+ chunk_seconds=float(lid_cfg_d.get("chunk_seconds", 30.0)),
227
+ prob_threshold=float(lid_cfg_d.get("prob_threshold", 0.5)),
228
+ fallback_lang=lid_cfg_d.get("fallback_lang", "auto"),
229
+ )
230
+
231
+ try:
232
+ from faster_whisper import WhisperModel # type: ignore
233
+ except Exception as e:
234
+ log.warning(f"LID: faster-whisper not available ({e}). Fallback='{lid_cfg.fallback_lang}'")
235
+ return lid_cfg.fallback_lang, 0.0
236
+
237
+ try:
238
+ model = WhisperModel(lid_cfg.model_name, device=lid_cfg.device, compute_type=lid_cfg.compute_type)
239
+ except Exception as e:
240
+ log.warning(f"LID: failed to load '{lid_cfg.model_name}': {e}. Fallback='{lid_cfg.fallback_lang}'")
241
+ return lid_cfg.fallback_lang, 0.0
242
+
243
+ try:
244
+ segments, info = model.transcribe(
245
+ wav_path,
246
+ beam_size=lid_cfg.beam_size,
247
+ vad_filter=True,
248
+ without_timestamps=True,
249
+ language=None
250
+ )
251
+ lang = info.language or lid_cfg.fallback_lang
252
+ prob = float(info.language_probability or 0.0)
253
+ if prob < lid_cfg.prob_threshold:
254
+ return lid_cfg.fallback_lang, prob
255
+ return lang, prob
256
+ except Exception as e:
257
+ log.warning(f"LID: error in transcription/detection: {e}. Fallback='{lid_cfg.fallback_lang}'")
258
+ return lid_cfg.fallback_lang, 0.0
259
+
260
+
261
+ def _build_asr_backend_for_language(lang_iso: str, cfg: Dict[str, Any]):
262
+ """
263
+ Selects ASR backend based on language:
264
+ - 'ca' -> AINA
265
+ - other -> Generic Whisper
266
+ """
267
+ asr_cfg = cfg.get("asr", {})
268
+ device_pref = _pick_device_auto(asr_cfg.get("device", "auto"))
269
+ if lang_iso and lang_iso.lower() == "ca":
270
+ return AinaASR(
271
+ model_name=asr_cfg.get("model_name", "projecte-aina/whisper-large-v3-ca-3catparla"),
272
+ device=device_pref,
273
+ )
274
+ else:
275
+ return WhisperASR(
276
+ model_name=asr_cfg.get("whisper_model_name", "openai/whisper-small"),
277
+ device=device_pref,
278
+ language=None,
279
+ )
280
+
281
+ # -------------------------------- Diarization --------------------------------
282
+ from pathlib import Path
283
+ from typing import List, Dict, Any, Tuple
284
+ from pydub import AudioSegment
285
+ from pyannote.audio import Pipeline
286
+ import math
287
+
288
+ def diarize_audio(
289
+ wav_path: str,
290
+ base_dir: Path,
291
+ clips_folder: str = "clips",
292
+ min_segment_duration: float = 20,
293
+ max_segment_duration: float = 50.0,
294
+ hf_token_env: str | None = None,
295
+ ) -> Tuple[List[str], List[Dict[str, Any]]]:
296
+ """Diarization with pyannote and clip export with pydub.
297
+ Returns clip paths and segments [{'start','end','speaker'}].
298
+ """
299
+ audio = AudioSegment.from_wav(wav_path)
300
+ duration = len(audio) / 1000.0
301
+
302
+ pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=(hf_token_env
303
+ or os.getenv("HF_TOKEN")))
304
+ diarization = pipeline(wav_path)
305
+
306
+ clips_dir = (base_dir / clips_folder)
307
+ clips_dir.mkdir(parents=True, exist_ok=True)
308
+ clip_paths: List[str] = []
309
+ segments: List[Dict[str, Any]] = []
310
+ spk_map: Dict[str, int] = {}
311
+
312
+ prev_end = 0.0 # reference to the end of the last exported segment
313
+
314
+ for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
315
+ start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))
316
+
317
+ if start < prev_end:
318
+ start = prev_end
319
+ if end <= start:
320
+ continue
321
+
322
+ seg_dur = end - start
323
+ if seg_dur < min_segment_duration:
324
+ continue
325
+
326
+ if seg_dur > max_segment_duration:
327
+ # split long segments
328
+ n = int(math.ceil(seg_dur / max_segment_duration))
329
+ sub_d = seg_dur / n
330
+ for j in range(n):
331
+ s = start + j * sub_d
332
+ e = min(end, start + (j + 1) * sub_d)
333
+ if e <= s:
334
+ continue
335
+ clip = audio[int(s * 1000):int(e * 1000)]
336
+ cp = clips_dir / f"segment_{i:03d}_{j:02d}.wav"
337
+ clip.export(cp, format="wav")
338
+ if speaker not in spk_map:
339
+ spk_map[speaker] = len(spk_map)
340
+ segments.append({
341
+ "start": s,
342
+ "end": e,
343
+ "speaker": f"SPEAKER_{spk_map[speaker]:02d}"
344
+ })
345
+ clip_paths.append(str(cp))
346
+ prev_end = e
347
+ else:
348
+ clip = audio[int(start * 1000):int(end * 1000)]
349
+ cp = clips_dir / f"segment_{i:03d}.wav"
350
+ clip.export(cp, format="wav")
351
+ if speaker not in spk_map:
352
+ spk_map[speaker] = len(spk_map)
353
+ segments.append({
354
+ "start": start,
355
+ "end": end,
356
+ "speaker": f"SPEAKER_{spk_map[speaker]:02d}"
357
+ })
358
+ clip_paths.append(str(cp))
359
+ prev_end = end # update the reference
360
+
361
+ if not segments:
362
+ # fallback single clip
363
+ cp = clips_dir / "segment_000.wav"
364
+ audio.export(cp, format="wav")
365
+ return [str(cp)], [{"start": 0.0, "end": duration, "speaker": "SPEAKER_00"}]
366
+
367
+ # sort by start time
368
+ pairs = sorted(zip(clip_paths, segments), key=lambda x: x[1]["start"])
369
+ clip_paths, segments = [p[0] for p in pairs], [p[1] for p in pairs]
370
+ return clip_paths, segments
371
+
372
+ # ------------------------------ Voice embeddings -----------------------------
373
+
374
+ class VoiceEmbedder:
375
+ def __init__(self):
376
+ self.model = SpeakerRecognition.from_hparams(
377
+ source="speechbrain/spkrec-ecapa-voxceleb",
378
+ savedir="pretrained_models/spkrec-ecapa-voxceleb",
379
+ )
380
+ self.model.eval()
381
+
382
+ def embed(self, wav_path: str) -> List[float]:
383
+ waveform, sr = torchaudio.load(wav_path)
384
+ target_sr = 16000
385
+ if sr != target_sr:
386
+ waveform = T.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
387
+ if waveform.shape[0] > 1:
388
+ waveform = waveform.mean(dim=0, keepdim=True)
389
+ # ensure minimum length (~0.2s) for stability
390
+ min_samples = int(0.2 * target_sr)
391
+ if waveform.shape[1] < min_samples:
392
+ pad = min_samples - waveform.shape[1]
393
+ waveform = torch.cat([waveform, torch.zeros((1, pad))], dim=1)
394
+ with torch.no_grad():
395
+ emb = (
396
+ self.model.encode_batch(waveform)
397
+ .squeeze()
398
+ .cpu()
399
+ .numpy()
400
+ .astype(float)
401
+ )
402
+ return emb.tolist()
403
+
404
+
405
+ def embed_voice_segments(clip_paths: List[str]) -> List[List[float]]:
406
+ ve = VoiceEmbedder()
407
+ out: List[List[float]] = []
408
+ for cp in clip_paths:
409
+ try:
410
+ out.append(ve.embed(cp))
411
+ except Exception as e:
412
+ log.warning(f"Embedding error in {cp}: {e}")
413
+ out.append([])
414
+ return out
415
+
416
+ # --------------------------- Speaker identification --------------------------
417
+
418
+ def identify_speakers(
419
+ embeddings: List[List[float]],
420
+ voice_collection, # ChromaDB collection with .query or None
421
+ cfg: Dict[str, Any],
422
+ ) -> List[str]:
423
+ voice_cfg = cfg.get("voice_processing", {}).get("speaker_identification", {})
424
+ if not embeddings or sum(1 for e in embeddings if e) < 2:
425
+ return ["SPEAKER_00" for _ in embeddings]
426
+
427
+ valid = [e for e in embeddings if e and len(e) > 0]
428
+ if len(valid) < 2:
429
+ return ["SPEAKER_00" for _ in embeddings]
430
+
431
+ min_clusters = max(1, voice_cfg.get("min_speakers", 1))
432
+ max_clusters = min(voice_cfg.get("max_speakers", 5), len(valid) - 1)
433
+
434
+ # find the optimal k using silhouette_score
435
+ if voice_cfg.get("find_optimal_clusters", True) and len(valid) > 2:
436
+ best_score = -1.0
437
+ best_k = min_clusters
438
+ for k in range(min_clusters, max_clusters + 1):
439
+ if k >= len(valid):
440
+ break
441
+ km = KMeans(n_clusters=k, random_state=42, n_init="auto")
442
+ labels = km.fit_predict(valid)
443
+ if len(set(labels)) > 1:
444
+ score = silhouette_score(valid, labels)
445
+ if score > best_score:
446
+ best_score, best_k = score, k
447
+ else:
448
+ best_k = min(max_clusters, max(min_clusters, voice_cfg.get("num_speakers", 2)))
449
+ best_k = max(1, min(best_k, len(valid) - 1))
450
+
451
+ # final clustering
452
+ km = KMeans(n_clusters=best_k, random_state=42, n_init="auto", init="k-means++")
453
+ labels = km.fit_predict(np.array(valid))
454
+ centers = km.cluster_centers_
455
+
456
+ cluster_to_name: Dict[int, str] = {}
457
+ unknown_counter = 0
458
+ for cid in range(best_k):
459
+ center = centers[cid].tolist()
460
+ name = f"SPEAKER_{cid:02d}"
461
+
462
+ if voice_collection is not None:
463
+ try:
464
+ q = voice_collection.query(query_embeddings=[center], n_results=1)
465
+ metas = q.get("metadatas", [[]])[0]
466
+ dists = q.get("distances", [[]])[0]
467
+ thr = voice_cfg.get("distance_threshold")
468
+
469
+ if dists and thr is not None and dists[0] > thr:
470
+ # new speaker → mark as UNKNOWN and store it in the collection
471
+ name = f"UNKNOWN_{unknown_counter}"
472
+ unknown_counter += 1
473
+ voice_collection.add(
474
+ embeddings=[center],
475
+ metadatas=[{"name": name}],
476
+ ids=[f"unk_{cid}_{unknown_counter}"]
477
+ )
478
+ else:
479
+ # acceptable match → use the existing name
480
+ if metas and isinstance(metas[0], dict):
481
+ name = metas[0].get("nombre") or metas[0].get("name") \
482
+ or metas[0].get("speaker") or metas[0].get("identity") \
483
+ or name
484
+ except Exception as e:
485
+ log.warning(f"Voice KNN query failed: {e}")
486
+
487
+ cluster_to_name[cid] = name
488
+
489
+ # map each embedding to its speaker
490
+ personas: List[str] = []
491
+ vi = 0
492
+ for emb in embeddings:
493
+ if not emb:
494
+ personas.append("UNKNOWN")
495
+ else:
496
+ label = int(labels[vi])
497
+ personas.append(cluster_to_name.get(label, f"SPEAKER_{label:02d}"))
498
+ vi += 1
499
+
500
+ return personas
501
+
502
+ # ----------------------------------- SRT -------------------------------------
503
+
504
+ def _fmt_srt_time(seconds: float) -> str:
505
+ h = int(seconds // 3600)
506
+ m = int((seconds % 3600) // 60)
507
+ s = int(seconds % 60)
508
+ ms = int(round((seconds - int(seconds)) * 1000))
509
+ return f"{h:02}:{m:02}:{s:02},{ms:03}"
510
+
511
+
512
+ def generate_srt_from_diarization(
513
+ diarization_segments: List[Dict[str, Any]],
514
+ transcriptions: List[str],
515
+ speakers_per_segment: List[str],
516
+ output_srt_path: str,
517
+ cfg: Dict[str, Any],
518
+ ) -> None:
519
+ subs = cfg.get("subtitles", {})
520
+ max_cpl = int(subs.get("max_chars_per_line", 42))
521
+ max_lines = int(subs.get("max_lines_per_cue", 10))
522
+ speaker_display = subs.get("speaker_display", "brackets")
523
+
524
+ items: List[Dict[str, Any]] = []
525
+ n = min(len(diarization_segments), len(transcriptions), len(speakers_per_segment))
526
+ for i in range(n):
527
+ seg = diarization_segments[i]
528
+ text = (transcriptions[i] or "").strip()
529
+ spk = speakers_per_segment[i]
530
+ items.append(
531
+ {
532
+ "start": float(seg.get("start", 0.0)),
533
+ "end": float(seg.get("end", 0.0)),
534
+ "text": text,
535
+ "speaker": spk,
536
+ }
537
+ )
538
+
539
+ out = Path(output_srt_path)
540
+ out.parent.mkdir(parents=True, exist_ok=True)
541
+ with out.open("w", encoding="utf-8-sig") as f:
542
+ for i, it in enumerate(items, 1):
543
+ text = it["text"]
544
+ spk = it["speaker"]
545
+ if speaker_display == "brackets" and spk:
546
+ text = f"[{spk}]: {text}" # Adjusted format to match new script's style
547
+ elif speaker_display == "prefix" and spk:
548
+ text = f"{spk}: {text}"
549
+
550
+ # simple word wrap
551
+ words = text.split()
552
+ lines: List[str] = []
553
+ cur = ""
554
+ for w in words:
555
+ if len(cur) + len(w) + (1 if cur else 0) <= max_cpl:
556
+ cur = (cur + " " + w) if cur else w
557
+ else:
558
+ lines.append(cur)
559
+ cur = w
560
+ if len(lines) >= max_lines - 1:
561
+ break
562
+ if cur and len(lines) < max_lines:
563
+ lines.append(cur)
564
+ f.write(f"{i}\n{_fmt_srt_time(it['start'])} --> {_fmt_srt_time(it['end'])}\n")
565
+ f.write("\n".join(lines) + "\n\n")
566
+
567
+ # ------------------------------ Orchestrator ---------------------------------
568
+
569
+ def process_audio_for_video(
570
+ video_path: str,
571
+ out_dir: Path,
572
+ cfg: Dict[str, Any],
573
+ voice_collection=None,
574
+ ) -> Tuple[List[Dict[str, Any]], Optional[str], str]:
575
+ """
576
+ Audio pipeline: FFmpeg -> diarization -> LID -> ASR -> embeddings -> speaker-ID -> SRT.
577
+ Returns (audio_segments, srt_path or None, full_transcription).
578
+ """
579
+ # 1) Audio extraction
580
+ audio_cfg = cfg.get("audio_processing", {})
581
+ sr = int(audio_cfg.get("sample_rate", 16000))
582
+ fmt = audio_cfg.get("format", "wav")
583
+ wav_path = extract_audio_ffmpeg(
584
+ video_path, out_dir / f"{Path(video_path).stem}.{fmt}", sr=sr
585
+ )
586
+ log.info("Audio extracted")
587
+
588
+ # 2) Diarization
589
+ diar_cfg = audio_cfg.get("diarization", {})
590
+ min_dur = float(diar_cfg.get("min_segment_duration", 0.5))
591
+ max_dur = float(diar_cfg.get("max_segment_duration", 10.0))
592
+ clip_paths, diar_segs = diarize_audio(
593
+ wav_path, out_dir, "clips", min_dur, max_dur
594
+ )
595
+ log.info("Audio clips generated.")
596
+
597
+ # 3) Language detection (optional) + ASR backend selection
598
+ asr_cfg = cfg.get("asr", {})
599
+ lid_enabled = bool(asr_cfg.get("language_detection", {}).get("enabled", True))
600
+
601
+ device_pref = _pick_device_auto(asr_cfg.get("device", "auto"))
602
+
603
+ aina_asr = AinaASR(model_name=asr_cfg.get("model_name", "projecte-aina/whisper-large-v3-ca-3catparla"),
604
+ device=device_pref)
605
+
606
+ whisper_asr = WhisperASR(model_name=asr_cfg.get("whisper_model_name", "openai/whisper-small"),
607
+ device=device_pref,
608
+ language=None)
609
+
610
+ full_transcription = ""
611
+ if asr_cfg.get("enable_full_transcription", True):
612
+ log.info("Starting transcription of the full audio")
613
+ # Assume Catalan model for full transcription, or add logic to check language
614
+ full_transcription = aina_asr.transcribe_long_audio(wav_path, chunk_length_s=30)
615
+ log.info("Full-audio transcription finished.")
616
+ print(full_transcription)
617
+
618
+ # Transcribe each segment
619
+ log.info("Starting transcription of each clip.")
620
+ trans: List[str] = []
621
+ detected_langs: List[str] = []
622
+ detected_probs: List[float] = []
623
+ for path in clip_paths:
624
+ if not lid_enabled:
625
+ txt = aina_asr.transcribe_wav(path)
+ detected_langs.append("auto")
+ detected_probs.append(0.0)
626
+ else:
627
+ detected_lang, detected_prob = detect_language_with_whisper(path, cfg)
628
+ log.info(f"LID: detected={detected_lang} (p={detected_prob:.2f})")
+ # keep the per-clip language info aligned with clip_paths
+ detected_langs.append(detected_lang)
+ detected_probs.append(detected_prob)
629
+
630
+ if detected_lang.lower() in ["ca", "catalan"]:
631
+ txt = aina_asr.transcribe_wav(path)
632
+ else:
633
+ txt = whisper_asr.transcribe_wav(path)
634
+ trans.append(txt)
635
+
636
+ log.info("All clips have been transcribed.")
637
+
638
+ # 5) Embeddings + speaker identification
639
+ if audio_cfg.get("enable_voice_embeddings", True):
640
+ embeddings = embed_voice_segments(clip_paths)
641
+ log.info("Embeddings created successfully for each clip.")
642
+ else:
643
+ embeddings = [[] for _ in clip_paths]
644
+
645
+ if cfg.get("voice_processing", {}).get("speaker_identification", {}).get("enabled", True):
646
+ speakers = identify_speakers(embeddings, voice_collection, cfg)
647
+ log.info("Speakers identified successfully.")
648
+ else:
649
+ speakers = [seg.get("speaker", f"SPEAKER_{i:02d}") for i, seg in enumerate(diar_segs)]
650
+
651
+ # 6) Build the segment table
652
+ audio_segments: List[Dict[str, Any]] = []
653
+ for i, seg in enumerate(diar_segs):
654
+ audio_segments.append(
655
+ {
656
+ "segment": i,
657
+ "start": float(seg.get("start", 0.0)),
658
+ "end": float(seg.get("end", 0.0)),
659
+ "speaker": speakers[i] if i < len(speakers) else seg.get("speaker", f"SPEAKER_{i:02d}"),
660
+ "text": trans[i] if i < len(trans) else "",
661
+ "voice_embedding": embeddings[i],
662
+ "clip_path": str(out_dir / "clips" / f"segment_{i:03d}.wav"),
663
+ "lang": detected_langs[i] if i < len(detected_langs) else "auto",
664
+ "lang_prob": detected_probs[i] if i < len(detected_probs) else 0.0,
665
+ }
666
+ )
667
+
668
+ # 7) SRT
669
+ srt_base_path = out_dir / f"transcripcion_diarizada_{Path(video_path).stem}"
670
+ srt_unmodified_path = str(srt_base_path) + "_unmodified.srt"
671
+
672
+ # Generate initial SRT
673
+ try:
674
+ generate_srt_from_diarization(
675
+ diar_segs,
676
+ [a["text"] for a in audio_segments],
677
+ [a["speaker"] for a in audio_segments],
678
+ srt_unmodified_path,
679
+ cfg,
680
+ )
681
+
682
+ except Exception as e:
683
+ log.warning(f"SRT generation failed: {e}")
684
+ srt_unmodified_path = None
685
+
686
+ return audio_segments, srt_unmodified_path, full_transcription
687
+
688
+ # ----------------------------------- CLI -------------------------------------
689
+ if __name__ == "__main__":
690
+ import argparse
691
+ import yaml
692
+
693
+ ap = argparse.ArgumentParser(description="Veureu — Audio tools (self-contained)")
694
+ ap.add_argument("--video", required=True)
695
+ ap.add_argument("--out", default="results")
696
+ ap.add_argument("--config", default="configs/config_veureu.yaml")
697
+ args = ap.parse_args()
698
+
699
+ cfg: Dict[str, Any] = {}
700
+ p = Path(args.config)
701
+ if p.exists():
702
+ try:
703
+ cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
704
+ except Exception as e:
705
+ log.warning(f"Could not read the YAML config file: {e}")
706
+
707
+ out_dir = Path(args.out) / Path(args.video).stem
708
+ out_dir.mkdir(parents=True, exist_ok=True)
709
+
710
+ # Add an OpenAI API key to your configuration file or here
711
+ # Example: cfg["api_keys"] = {"openai": "sk-your-openai-api-key"}
712
+ # Make sure you do not commit the key to git!
713
+
714
+ segs, srt, full_transcription = process_audio_for_video(args.video, out_dir, cfg, voice_collection=None)
715
+
716
+ print(json.dumps(
717
+ {
718
+ "segments": len(segs),
719
+ "srt": srt,
720
+ "detected_lang": (segs[0].get("lang") if segs else "auto"),
721
+ "detected_prob": (segs[0].get("lang_prob") if segs else 0.0),
722
+ },
723
+ indent=2,
724
+ ensure_ascii=False,
725
+ ))
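A short usage sketch of the orchestrator above, mirroring the CLI block at the end of the file; the config path and video name are placeholders, and no ChromaDB voice collection is passed, so speaker names fall back to the clustered SPEAKER_xx labels.

```python
# Usage sketch for process_audio_for_video (placeholder paths; voice_collection=None
# means speaker names come from KMeans clustering rather than a ChromaDB lookup).
from pathlib import Path

from audio_tools import load_config, process_audio_for_video

cfg = load_config("configs/config_veureu.yaml")
out_dir = Path("results") / "sample"
out_dir.mkdir(parents=True, exist_ok=True)

segments, srt_path, full_text = process_audio_for_video(
    "sample.mp4", out_dir, cfg, voice_collection=None
)
print(f"{len(segments)} segments, SRT: {srt_path}")
```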
background_descriptor.py ADDED
@@ -0,0 +1,43 @@
1
+ # ================================
2
+ # PATCH: background_descriptor.py (describe_keyframes_with_llm)
3
+ # ================================
4
+ # We replace the direct call to describe_montage_sequence with router.vision_describe
5
+ # and keep the existing function as a fallback.
6
+
7
+ # (replace the function in this file with this version)
8
+
9
+ def describe_keyframes_with_llm(
10
+ keyframes: List[Dict[str, Any]],
11
+ out_dir: Path,
12
+ face_identities: Optional[set] = None,
13
+ config_path: str | None = None,
14
+ ) -> Tuple[List[Dict[str, Any]], Optional[str]]:
15
+ from llm_router import load_yaml, LLMRouter
16
+ cfg = load_yaml(config_path or "config.yaml")
17
+ model_name = (cfg.get("background_descriptor", {}).get("description", {}) or {}).get("model", "salamandra-vision")
18
+
19
+ frame_paths = [k.get("image_path") for k in keyframes if k.get("image_path")]
20
+ montage_dir = out_dir / "montage"
21
+ montage_path = None
22
+ if frame_paths:
23
+ montage_path = generar_montage(frame_paths, montage_dir)
24
+ context = {
25
+ "informacion": [{k: v for k, v in fr.items() if k in ("start", "end", "ocr", "faces")} for fr in keyframes],
26
+ "face_identities": sorted(list(face_identities or set()))
27
+ }
28
+ try:
29
+ router = LLMRouter(cfg)
30
+ descs = router.vision_describe(frame_paths, context=context, model=model_name)
31
+ except Exception:
32
+ # Fall back to the existing local implementation if the remote call fails
33
+ descs = describe_montage_sequence(
34
+ montage_path=str(montage_path),
35
+ n=len(frame_paths),
36
+ informacion=keyframes,
37
+ face_identities=face_identities or set(),
38
+ config_path=config_path or "config_veureu.yaml",
39
+ )
40
+ for i, fr in enumerate(keyframes):
41
+ if i < len(descs):
42
+ fr["description"] = descs[i]
43
+ return keyframes, str(montage_path) if montage_path else None
casting_loader.py ADDED
@@ -0,0 +1,340 @@
1
+ # casting_loader.py (replaces identity_encoding.py; updated to use libs/*)
2
+ # Veureu — Identity Encoder (faces, voices, scenarios)
3
+ # -----------------------------------------------------------------------------
4
+ # This script replaces the original `identity_encoding.py` but **reuses**
5
+ # as much as possible the functions already present in `libs/`.
6
+ # It respects the project's path structure (identities/*, scenarios, chroma_db,
7
+ # results) and maintains the classic pipeline:
8
+ # 1) index_faces (ChromaDB)
9
+ # 2) identity_features.csv
10
+ # 3) index_voices (ChromaDB)
11
+ # 4) scenarios_descriptions.csv
12
+ # 5) index_scenarios (ChromaDB)
13
+ # -----------------------------------------------------------------------------
14
+ from __future__ import annotations
15
+ import argparse
16
+ import csv
17
+ import logging
18
+ import sys
19
+ import uuid
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
23
+
24
+ # ============================ LOGGING ========================================
25
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
26
+ log = logging.getLogger("identity_encoding")
27
+
28
+ # ============================ DEPENDENCIES ===================================
29
+ # ChromaDB (persistent)
30
+ try:
31
+ import chromadb
32
+ from chromadb.config import Settings # noqa: F401
33
+ except Exception as e:
34
+ chromadb = None # type: ignore
35
+ log.error("Could not import chromadb: %s", e)
36
+
37
+ from libs.vision_tools_salamandra import FaceAnalyzer
38
+ from collections import Counter
39
+
40
+ # Audio: reuse get_embedding from the existing pipeline
41
+ from libs.audio_tools_ana_2 import VoiceEmbedder
42
+ from libs.vision_tools_salamandra import FaceOfImageEmbedding
43
+
44
+ # Optional
45
+ try:
46
+ import numpy as np
47
+ except Exception:
48
+ np = None # type: ignore
49
+
50
+ # ============================ UTILITIES =====================================
51
+ IMG_EXT = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
52
+ AUD_EXT = {".wav", ".mp3", ".flac", ".m4a", ".ogg"}
53
+
54
+
55
+ def list_files(root: Path, exts: Iterable[str]) -> List[Path]:
56
+ root = Path(root)
57
+ if not root.exists():
58
+ return []
59
+ return [p for p in root.rglob('*') if p.suffix.lower() in exts]
60
+
61
+
62
+ def ensure_chroma(db_dir: Path):
63
+ if chromadb is None:
64
+ raise RuntimeError("chromadb is not installed. pip install chromadb")
65
+ db_dir.mkdir(parents=True, exist_ok=True)
66
+
67
+ # New way to create a persistent client
68
+ client = chromadb.Client(Settings(
69
+ chroma_db_impl="duckdb+parquet",
70
+ persist_directory=str(db_dir)
71
+ ))
72
+ return client
73
+
74
+ # ============================ 1) INDEX FACES =================================
75
+ def build_faces_index(faces_dir: Path, client, collection_name: str = "index_faces",
76
+ deepface_model: str = 'Facenet512', drop: bool = True) -> int:
77
+ # idempotency
78
+ if collection_name in [c.name for c in client.list_collections()] and drop:
79
+ client.delete_collection(name=collection_name)
80
+ col = client.get_or_create_collection(name=collection_name)
81
+
82
+ be = FaceOfImageEmbedding(deepface_model=deepface_model)
83
+ count = 0
84
+ registered_identities = set() # 👈 so names are not repeated
85
+
86
+ for ident_dir in sorted(Path(faces_dir).iterdir() if Path(faces_dir).exists() else []):
87
+ if not ident_dir.is_dir():
88
+ continue
89
+ ident = ident_dir.name
90
+ for img_path in list_files(ident_dir, IMG_EXT):
91
+ embeddings = be.encode_image(img_path)
92
+ if embeddings is None:
93
+ log.warning("No face embedding in %s", img_path)
94
+ continue
95
+
96
+ # Flatten so each embedding is a list of floats
97
+ for e in (embeddings if isinstance(embeddings[0], list) else [embeddings]):
98
+ uid = str(uuid.uuid4())
99
+ col.add(ids=[uid], embeddings=[e], metadatas=[{"identity": ident, "path": str(img_path)}])
100
+ count += 1
101
+ registered_identities.add(ident) # 👈 store the name
102
+
103
+ # Final messages
104
+ print("Finished building the database.")
105
+ print(f"Total embeddings stored: {count}")
106
+ print("Registered identities:")
107
+ for name in sorted(registered_identities):
108
+ print(f" - {name}")
109
+
110
+ log.info("index_faces => %d embeddings", count)
111
+ return count
112
+
113
+ # ===================== 2) IDENTITY FEATURES CSV ==============================
114
+
115
+ def aggregate_face_attributes(faces_dir: Path, out_csv: Path) -> int:
116
+ """
117
+ Processes a per-identity faces directory and generates a CSV with age and gender.
118
+ Uses FaceAnalyzer to extract attributes.
119
+ """
120
+ # Initialize the analyzer
121
+ from libs.vision_tools_salamandra import FaceAnalyzer
122
+ analyzer = FaceAnalyzer()
123
+
124
+ rows: List[Dict[str, Any]] = []
125
+
126
+ faces_dir = Path(faces_dir)
127
+ if not faces_dir.exists() or not faces_dir.is_dir():
128
+ log.error("Faces directory does not exist: %s", faces_dir)
129
+ return 0
130
+
131
+ def most_common(lst, default="unknown"):
132
+ return Counter(lst).most_common(1)[0][0] if lst else default
133
+
134
+ # Iterate over each identity
135
+ for ident_dir in sorted(faces_dir.iterdir()):
136
+ if not ident_dir.is_dir():
137
+ continue
138
+ ident = ident_dir.name
139
+ attrs: List[Dict[str, Any]] = []
140
+
141
+ log.info("Processing identity: %s", ident)
142
+
143
+ for img_path in sorted(list_files(ident_dir, IMG_EXT)):
144
+ try:
145
+ data = analyzer.analyze_image(str(img_path))
146
+ if data:
147
+ attrs.append(data)
148
+ except Exception as e:
149
+ log.warning("Error processing image %s: %s", img_path, e)
150
+
151
+ genders = [a.get("gender", "unknown") for a in attrs]
152
+ ages = [a.get("age", "unknown") for a in attrs]
153
+
154
+ # Optional per-identity context
155
+ context_txt = (faces_dir.parent / "context" / f"{ident}.txt")
156
+ identity_context = context_txt.read_text(encoding="utf-8").strip() if context_txt.exists() else ""
157
+
158
+ rows.append({
159
+ "identity": ident,
160
+ "samples": len(attrs),
161
+ "gender": most_common(genders),
162
+ "age_bucket": most_common(ages),
163
+ "identity_context": identity_context,
164
+ })
165
+
166
+ log.info("Processed %d attributes for %s", len(attrs), ident)
167
+
168
+ # Save the CSV
169
+ out_csv.parent.mkdir(parents=True, exist_ok=True)
170
+ with out_csv.open("w", newline='', encoding="utf-8") as f:
171
+ fieldnames = list(rows[0].keys()) if rows else ["identity", "identity_context"]
172
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
173
+ writer.writeheader()
174
+ writer.writerows(rows)
175
+
176
+ log.info("CSV generated successfully: %s", out_csv)
177
+ return len(rows)
178
+
179
+ # ============================ 3) INDEX VOICES =================================
180
+ from pydub import AudioSegment # add at the top of the file with the other imports
181
+
182
+ def build_voices_index(voices_dir: Path, client, collection_name: str = "index_voices", drop: bool = True) -> int:
183
+ if collection_name in [c.name for c in client.list_collections()] and drop:
184
+ client.delete_collection(name=collection_name)
185
+ col = client.get_or_create_collection(name=collection_name)
186
+
187
+ ve = VoiceEmbedder()
188
+ count = 0
189
+
190
+ for ident_dir in sorted(Path(voices_dir).iterdir() if Path(voices_dir).exists() else []):
191
+ if not ident_dir.is_dir():
192
+ continue
193
+ ident = ident_dir.name
194
+ for wav_path in list_files(ident_dir, AUD_EXT):
195
+ # Try to embed directly
196
+ try:
197
+ emb = ve.embed(wav_path)
198
+ except Exception as e:
199
+ log.warning("Error reading audio %s: %s. Trying to reconvert...", wav_path, e)
200
+ # Automatic reconversion to PCM WAV
201
+ try:
202
+ audio = AudioSegment.from_file(wav_path)
203
+ fixed_path = wav_path.with_name(wav_path.stem + "_fixed.wav")
204
+ audio.export(fixed_path, format="wav")
205
+ log.info("File converted to a compatible WAV: %s", fixed_path)
206
+ emb = ve.embed(fixed_path)
207
+ except Exception as e2:
208
+ log.error("Could not generate an embedding after reconversion for %s: %s", wav_path, e2)
209
+ continue # skip this file
210
+ if emb is None:
211
+ log.warning("No voice embedding in %s", wav_path)
212
+ continue
213
+ uid = str(uuid.uuid4())
214
+ col.add(ids=[uid], embeddings=[emb], metadatas=[{"identity": ident, "path": str(wav_path)}])
215
+ count += 1
216
+
217
+ log.info("index_voices => %d embeddings", count)
218
+ return count
219
+
220
+ # ============================ 4) SCENARIOS ==================================
221
+ @dataclass
222
+ class VisionClient:
223
+ provider: str = "none" # placeholder to plug in an LLM if desired
224
+
225
+ def describe(self, image_path: str, prompt: str) -> str:
226
+ return (f"Automatic description (placeholder) for {Path(image_path).name}. "
227
+ f"{prompt}")
228
+
229
+
230
+ class TextEmbedder:
231
+ """Text embeddings with Sentence-Transformers if available; fallback to TF-IDF."""
232
+ def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
233
+ self.kind = "tfidf"; self.model = None; self.vectorizer = None
234
+ try:
235
+ from sentence_transformers import SentenceTransformer
236
+ self.model = SentenceTransformer(model_name)
237
+ self.kind = "sbert"
238
+ except Exception:
239
+ from sklearn.feature_extraction.text import TfidfVectorizer
240
+ self.vectorizer = TfidfVectorizer(max_features=768)
241
+
242
+ def fit(self, texts: List[str]):
243
+ if self.vectorizer is not None:
244
+ self.vectorizer.fit(texts)
245
+
246
+ def encode(self, texts: List[str]) -> List[List[float]]:
247
+ if self.model is not None:
248
+ arr = self.model.encode(texts, convert_to_numpy=True)
249
+ return arr.astype(float).tolist()
250
+ X = self.vectorizer.transform(texts) if self.vectorizer is not None else None
251
+ return (X.toarray().astype(float).tolist() if X is not None else [[0.0]*128 for _ in texts])
252
+
253
+
254
+ def build_scenarios_descriptions(scenarios_dir: Path, out_csv: Path, vision: VisionClient,
255
+ sample_per_scenario: int = 12) -> Tuple[int, List[Dict[str, Any]]]:
256
+ rows: List[Dict[str, Any]] = []
257
+ for scen_dir in sorted(Path(scenarios_dir).iterdir() if Path(scenarios_dir).exists() else []):
258
+ if not scen_dir.is_dir():
259
+ continue
260
+ scen = scen_dir.name
261
+ descs: List[str] = []
262
+ imgs = list_files(scen_dir, IMG_EXT)[:sample_per_scenario]
263
+ for img in imgs:
264
+ d = vision.describe(str(img), prompt="Describe location, time period, lighting, and atmosphere without mentioning people or time of day.")
265
+ if d:
266
+ descs.append(d)
267
+ if not descs:
268
+ descs = [f"Scenario {scen} (no images)"]
269
+ rows.append({"scenario": scen, "descriptions": " \n".join(descs)})
270
+
271
+ out_csv.parent.mkdir(parents=True, exist_ok=True)
272
+ with out_csv.open("w", newline='', encoding="utf-8") as f:
273
+ w = csv.DictWriter(f, fieldnames=["scenario", "descriptions"])
274
+ w.writeheader(); w.writerows(rows)
275
+ log.info("scenarios_descriptions => %s", out_csv)
276
+ return len(rows), rows
277
+
278
+
279
+ def build_scenarios_index(client, rows: List[Dict[str, Any]], embedder: TextEmbedder,
280
+ collection_name: str = "index_scenarios", drop: bool = True) -> int:
281
+ texts = [r["descriptions"] for r in rows]
282
+ embedder.fit(texts)
283
+ embs = embedder.encode(texts)
284
+
285
+ if collection_name in [c.name for c in client.list_collections()] and drop:
286
+ client.delete_collection(name=collection_name)
287
+ col = client.get_or_create_collection(name=collection_name)
288
+
289
+ for r, e in zip(rows, embs):
290
+ col.add(ids=[r["scenario"]], embeddings=[e], metadatas=[{"scenario": r["scenario"]}])
291
+ log.info("index_scenarios => %d descriptions", len(rows))
292
+ return len(rows)
293
+
294
+ # ================================ CLI ========================================
295
+
296
+ def main():
297
+ ap = argparse.ArgumentParser(description="Veureu — Build identity/scenario indices and CSVs")
298
+ ap.add_argument('--faces_dir', default='identities/faces', help='Root directory of face images per identity')
299
+ ap.add_argument('--voices_dir', default='identities/voices', help='Root directory of voice clips per identity')
300
+ ap.add_argument('--scenarios_dir', default='scenarios', help='Root directory of scenario folders with images')
301
+ ap.add_argument('--db_dir', default='chroma_db', help='ChromaDB persistence directory')
302
+ ap.add_argument('--out_dir', default='results', help='Output directory for CSVs')
303
+ ap.add_argument('--drop_collections', action='store_true', help='Delete collections if they exist before rebuilding')
304
+ ap.add_argument('--deepface_model', default='Facenet512', help='DeepFace model to use as fallback')
305
+ ap.add_argument('--scenario_samples', type=int, default=12, help='Number of images per scenario to describe')
306
+
307
+ args = ap.parse_args()
308
+
309
+ faces_dir = Path(args.faces_dir)
310
+ voices_dir = Path(args.voices_dir)
311
+ print(voices_dir)
312
+ scenarios_dir = Path(args.scenarios_dir)
313
+ out_dir = Path(args.out_dir); out_dir.mkdir(parents=True, exist_ok=True)
314
+
315
+ client = ensure_chroma(Path(args.db_dir))
316
+
317
+ # 1) Faces index
318
+ build_faces_index(faces_dir, client, collection_name="index_faces", deepface_model=args.deepface_model, drop=args.drop_collections)
319
+
320
+ # 2) Identity features CSV
321
+ #id_csv = out_dir / 'identity_features.csv'
322
+ #aggregate_face_attributes(faces_dir, id_csv)
323
+
324
+ # 3) Voices index
325
+ build_voices_index(voices_dir, client, collection_name="index_voices", drop=args.drop_collections)
326
+
327
+ # 4) Scenarios descriptions
328
+ #vision = VisionClient()
329
+ #scen_csv = out_dir / 'scenarios_descriptions.csv'
330
+ #_, scen_rows = build_scenarios_descriptions(scenarios_dir, scen_csv, vision, sample_per_scenario=args.scenario_samples)
331
+
332
+ # 5) Scenarios index
333
+ #embedder = TextEmbedder()
334
+ #build_scenarios_index(client, scen_rows, embedder, collection_name="index_scenarios", drop=args.drop_collections)
335
+
336
+ log.info("✅ Identity encoding completed.")
337
+
338
+
339
+ if __name__ == '__main__' and '--video' not in sys.argv:
340
+ main()
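A programmatic sketch of the same casting-loading flow the CLI drives; the directory layout follows the header comment (one sub-folder per identity under identities/faces and identities/voices), the paths are placeholders, and it assumes the `libs/` modules imported above are available.

```python
# Sketch: build the face and voice indices without going through argparse.
# Assumes libs/ is on PYTHONPATH and chromadb is installed.
from pathlib import Path

from casting_loader import ensure_chroma, build_faces_index, build_voices_index

client = ensure_chroma(Path("chroma_db"))
n_faces = build_faces_index(Path("identities/faces"), client,
                            collection_name="index_faces", drop=True)
n_voices = build_voices_index(Path("identities/voices"), client,
                              collection_name="index_voices", drop=True)
print(f"Indexed {n_faces} face embeddings and {n_voices} voice embeddings")
```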
config.yaml ADDED
@@ -0,0 +1,171 @@
+ # ===========================
+ # Veureu Engine – config.yaml
+ # ===========================
+
+ engine:
+ # Artifact output
+ output_root: "results"
+ # Persistence of the vector indices
+ database:
+ enabled: true
+ persist_directory: "chroma_db"
+ enable_face_recognition: true
+ enable_voice_recognition: true
+ face_collection: "index_faces"
+ voice_collection: "index_voices"
+
+ # Asynchronous jobs (if you implement the queue pattern)
+ jobs:
+ enabled: true
+ max_workers: 1 # Adjust to the Space's resources
+ result_ttl_seconds: 86400 # 1 day
+
+ api:
+ cors_allow_origins: ["*"]
+ # Maximum time (seconds) for a synchronous request (if you use the sync endpoint)
+ sync_timeout_seconds: 3600
+
+ video_processing:
+ # Frame-extraction metadata
+ keyframes:
+ # If your conditional extractor uses thresholds, you can include them here:
+ conditional_extraction:
+ enable: true
+ # example parameters (adjust them to your actual extractor)
+ min_scene_length_seconds: 1.5
+ difference_threshold: 28.0
+
+ frames_per_second:
+ enable: true
+ fps: 1.0 # "analysis frame rate" (1 frame/second by default)
+
+ ocr:
+ engine: "tesseract" # or "easyocr"
+ language_hint: "spa" # if applicable
+ # Only if you use pytesseract:
+ tesseract_cmd: "" # binary path if it is not on PATH
+
+ faces:
+ detector_model: "mtcnn" # example; adjust it to your vision_tools
+ embedding_model: "Facenet512" # used in background_descriptor.FaceOfImageEmbedding
+ min_face_size: 32
+ detection_confidence: 0.85
+
+ ocr_clustering:
+ method: "sequential_similarity"
+ sentence_transformer: "all-MiniLM-L6-v2"
+ similarity_threshold: 0.60 # "number of clusters" implied by the threshold (higher ⇒ fewer clusters)
+
+ audio_processing:
+ # The main ASR will be remote if you select whisper catalan (see models/routing)
+ diarization:
+ enabled: true
+ # example diarization parameters if you use them in audio_tools
+ min_speaker_duration: 0.8
+ max_speakers: 8
+
+ speaker_embedding:
+ enabled: true
+ # threshold for identity assignment in voice_collection
+ speaker_identification:
+ distance_threshold: 0.40
+
+ # If you keep local transcription for other languages/models:
+ local_asr:
+ enabled: false # remote will be used for whisper catalan
+ model: "" # (empty because it is managed via Spaces)
+
+ background_descriptor:
+ # Montage and LLM-description parameters
+ montage:
+ enable: true
+ max_frames: 12 # cap on frames in the collage/description
+ grid: "auto" # or 3x4, etc.
+
+ description:
+ model: "salamandra-vision" # can be "salamandra-vision" or "gpt-4o-mini"
+ max_tokens: 512
+ temperature: 0.2
+ # If identities are detected, they are passed as hints (your pipeline already does this)
+
+ identity:
+ # Temporal-mapping and enrichment rules
+ timeline_mapping:
+ per_second_frames_source: "frames_per_second"
+ attach_faces_to:
+ - "keyframes"
+ - "audio_segments"
+ out_key: "persona"
+
+ narration:
+ model: "salamandra-instruct" # "salamandra-instruct" | "gpt-4o-mini"
+ une_guidelines_path: "UNE_153010.txt"
+ # Timing constraints (for UNE-153010)
+ timing:
+ max_ad_duration_ratio: 0.60 # fraction of the available gap that the AD may occupy
+ min_gap_seconds: 1.20
+ min_ad_seconds: 0.80
+ llm:
+ max_tokens: 1024
+ temperature: 0.2
+
+ models:
+ # High-level model selection per task
+ instruct: "salamandra-instruct" # for NarrationSystem and other text
+ vision: "salamandra-vision" # for describing frames/montages
+ tools: "salamandra-tools" # if you need functions with tool-calling
+ asr: "whisper-catalan" # Catalan ASR
+
+ # Routing: which models run REMOTELY (via other Spaces)
+ routing:
+ use_remote_for:
+ - "salamandra-instruct"
+ - "salamandra-vision"
+ - "salamandra-tools"
+ - "whisper-catalan"
+
+ remote_spaces:
+ # Where to call when models.routing decides "remote"
+ user: "veureu"
+
+ endpoints:
+ # Note: fill in the real URLs when you publish the Spaces.
+ salamandra-instruct:
+ space: "schat"
+ base_url: "https://veureu-schat.hf.space"
+ client: "gradio" # "gradio" or "http"
+ predict_route: "/run/predict" # not needed if you use gradio_client
+
+ salamandra-vision:
+ space: "svision"
+ base_url: "https://veureu-svision.hf.space"
+ client: "gradio"
+ predict_route: "/run/predict"
+
+ salamandra-tools:
+ space: "stools"
+ base_url: "https://veureu-stools.hf.space"
+ client: "gradio"
+ predict_route: "/run/predict"
+
+ whisper-catalan:
+ space: "ars"
+ base_url: "https://veureu-ars.hf.space"
+ client: "gradio"
+ predict_route: "/run/predict"
+
+ # Network and robustness parameters
+ http:
+ timeout_seconds: 120
+ retries: 3
+ backoff_seconds: 2.0
+
+ security:
+ # If you need to pass tokens (e.g., Hub tokens or your own auth)
+ use_hf_token: true
+ hf_token_env: "HF_TOKEN" # name of the environment variable for the token
+ allow_insecure_tls: false
+
+ logging:
+ level: "INFO" # DEBUG | INFO | WARNING | ERROR
+ json: false
identity_manager.py ADDED
@@ -0,0 +1,125 @@
1
+ # =========================
2
+ # File: identity_manager.py
3
+ # =========================
4
+ from __future__ import annotations
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+
7
+ from chromadb.api.types import Collection
8
+
9
+
10
+ class IdentityManager:
11
+ """
12
+ Encapsulates all identity-assignment logic (faces + voices)
13
+ and its projection onto frames, clips and SRT.
14
+ """
15
+
16
+ def __init__(self, face_collection: Optional[Collection] = None, voice_collection: Optional[Collection] = None):
17
+ self.face_collection = face_collection
18
+ self.voice_collection = voice_collection
19
+
20
+ # --------------------------- Faces / Frames ---------------------------
21
+ def assign_faces_to_frames(
22
+ self,
23
+ frames: List[Dict[str, Any]],
24
+ ) -> List[Dict[str, Any]]:
25
+ """
26
+ `frames` is a list of dicts with at least: {image_path, start, end, faces:[{embedding?, bbox?}]}
27
+ Returns the same frames with `faces` enriched with `identity` and `distance` when a DB is available.
28
+ """
29
+ if self.face_collection is None:
30
+ return frames
31
+
32
+ out = []
33
+ for fr in frames:
34
+ faces = fr.get("faces") or []
35
+ enr: List[Dict[str, Any]] = []
36
+ for f in faces:
37
+ emb = f.get("embedding") or f.get("vector")
38
+ if not emb:
39
+ enr.append(f)
40
+ continue
41
+ try:
42
+ q = self.face_collection.query(query_embeddings=[emb], n_results=1, include=["metadatas", "distances"]) # type: ignore
43
+ metas = q.get("metadatas", [[]])[0]
44
+ dists = q.get("distances", [[]])[0]
45
+ if metas:
46
+ md = metas[0] or {}
47
+ f = dict(f)
48
+ f["identity"] = md.get("identity") or md.get("name")
49
+ if dists:
50
+ f["distance"] = float(dists[0])
51
+ except Exception:
52
+ pass
53
+ enr.append(f)
54
+ fr2 = dict(fr)
55
+ fr2["faces"] = enr
56
+ out.append(fr2)
57
+ return out
58
+
59
+ # --------------------------- Voices / Segments ------------------------
60
+ def assign_voices_to_segments(
61
+ self,
62
+ audio_segments: List[Dict[str, Any]],
63
+ distance_threshold: Optional[float] = None,
64
+ ) -> List[Dict[str, Any]]:
65
+ """
66
+ Adds `voice_vecinos` and `voice_identity` to each segment if a voice collection is available.
67
+ """
68
+ if self.voice_collection is None:
69
+ return audio_segments
70
+
71
+ out = []
72
+ for a in audio_segments:
73
+ emb = a.get("voice_embedding")
74
+ if not emb:
75
+ out.append(a)
76
+ continue
77
+ try:
78
+ q = self.voice_collection.query(query_embeddings=[emb], n_results=3, include=["metadatas", "distances"]) # type: ignore
79
+ metas = q.get("metadatas", [[]])[0]
80
+ dists = q.get("distances", [[]])[0]
81
+ vecinos = []
82
+ top_id = None
83
+ top_dist = None
84
+ for m, d in zip(metas, dists):
85
+ name = (m or {}).get("identity") or (m or {}).get("name")
86
+ vecinos.append({"identity": name, "distance": float(d)})
87
+ if top_id is None:
88
+ top_id, top_dist = name, float(d)
89
+ a2 = dict(a)
90
+ a2["voice_vecinos"] = vecinos
91
+ if top_id is not None:
92
+ if distance_threshold is None or (top_dist is not None and top_dist <= distance_threshold):
93
+ a2["voice_identity"] = top_id
94
+ out.append(a2)
95
+ except Exception:
96
+ out.append(a)
97
+ return out
98
+
99
+ # --------------------------- Map to SRT/Timelines ---------------------
100
+ @staticmethod
101
+ def map_identities_over_ranges(
102
+ per_second_frames: List[Dict[str, Any]],
103
+ ranges: List[Dict[str, Any]],
104
+ key: str = "faces",
105
+ out_key: str = "persona",
106
+ ) -> List[Dict[str, Any]]:
107
+ """
108
+ For each time range (keyframes, audio_segments, etc.), aggregates who appears according to the per-second frames.
109
+ """
110
+ out: List[Dict[str, Any]] = []
111
+ for rng in ranges:
112
+ s, e = float(rng.get("start", 0.0)), float(rng.get("end", 0.0))
113
+ present = []
114
+ for fr in per_second_frames:
115
+ fs, fe = float(fr.get("start", 0.0)), float(fr.get("end", 0.0))
116
+ if fe <= s or fs >= e:
117
+ continue
118
+ for f in fr.get(key) or []:
119
+ ident = f.get("identity")
120
+ if ident and ident not in present:
121
+ present.append(ident)
122
+ r2 = dict(rng)
123
+ r2[out_key] = present
124
+ out.append(r2)
125
+ return out
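For reference, a minimal sketch of how these helpers chain together. The frame/segment shapes below are illustrative only (embedding sizes and keys are assumptions); real inputs come from vision_tools and audio_tools, and the Chroma collections would normally be passed in.

    from identity_manager import IdentityManager

    # Illustrative inputs with the minimal keys the methods above read.
    per_second = [{"start": 0.0, "end": 1.0, "faces": [{"embedding": [0.1] * 128}]}]
    segments = [{"start": 0.0, "end": 4.0, "voice_embedding": [0.2] * 192}]

    im = IdentityManager(face_collection=None, voice_collection=None)  # pass Chroma collections when available
    per_second = im.assign_faces_to_frames(per_second)
    segments = im.assign_voices_to_segments(segments, distance_threshold=0.6)
    segments = im.map_identities_over_ranges(per_second, segments, key="faces", out_key="persona")
    print(segments[0]["persona"])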
llm_router.py ADDED
@@ -0,0 +1,67 @@
1
+ # ============================
2
+ # File: llm_router.py
3
+ # ============================
4
+ from __future__ import annotations
5
+ import os  # moved below the __future__ import, which must be the first statement in the module
6
+ from typing import Any, Dict, List, Optional
7
+ from pathlib import Path
8
+ import yaml
9
+
10
+ from remote_clients import InstructClient, VisionClient, ToolsClient, ASRClient
11
+
12
+
13
+ def load_yaml(path: str) -> Dict[str, Any]:
14
+ p = Path(path)
15
+ if not p.exists():
16
+ return {}
17
+ return yaml.safe_load(p.read_text(encoding="utf-8")) or {}
18
+
19
+
20
+ class LLMRouter:
21
+ def __init__(self, cfg: Dict[str, Any]):
22
+ self.cfg = cfg
23
+ self.rem = cfg.get("models", {}).get("routing", {}).get("use_remote_for", [])
24
+ base_user = cfg.get("remote_spaces", {}).get("user", "veureu")
25
+ eps = cfg.get("remote_spaces", {}).get("endpoints", {})
26
+ token_enabled = cfg.get("security", {}).get("use_hf_token", False)
27
+ hf_token = os.getenv(cfg.get("security", {}).get("hf_token_env", "HF_TOKEN")) if token_enabled else None
28
+
29
+ def mk(endpoint_key: str, cls):
30
+ info = eps.get(endpoint_key, {})
31
+ base_url = info.get("base_url") or f"https://{base_user}-{info.get('space')}.hf.space"
32
+ client = cls(base_url=base_url, use_gradio=(info.get("client", "gradio") == "gradio"), hf_token=hf_token,
33
+ timeout=int(cfg.get("remote_spaces", {}).get("http", {}).get("timeout_seconds", 120)))
34
+ return client
35
+
36
+ self.clients = {
37
+ "salamandra-instruct": mk("salamandra-instruct", InstructClient),
38
+ "salamandra-vision": mk("salamandra-vision", VisionClient),
39
+ "salamandra-tools": mk("salamandra-tools", ToolsClient),
40
+ "whisper-catalan": mk("whisper-catalan", ASRClient),
41
+ }
42
+
43
+ # ---- INSTRUCT ----
44
+ def instruct(self, prompt: str, system: Optional[str] = None, model: str = "salamandra-instruct", **kwargs) -> str:
45
+ if model in self.rem:
46
+ return self.clients[model].generate(prompt, system=system, **kwargs) # type: ignore
47
+ # Local fallback (e.g. gpt-4o-mini or gpt-oss through a local API, if one exists)
48
+ # An OpenAI-compatible API could be integrated here.
49
+ raise RuntimeError(f"Modelo local no implementado para: {model}")
50
+
51
+ # ---- VISION ----
52
+ def vision_describe(self, image_paths: List[str], context: Optional[Dict[str, Any]] = None, model: str = "salamandra-vision", **kwargs) -> List[str]:
53
+ if model in self.rem:
54
+ return self.clients[model].describe(image_paths, context=context, **kwargs) # type: ignore
55
+ raise RuntimeError(f"Modelo local no implementado para: {model}")
56
+
57
+ # ---- TOOLS ----
58
+ def chat_with_tools(self, messages: List[Dict[str, str]], tools: Optional[List[Dict[str, Any]]] = None, model: str = "salamandra-tools", **kwargs) -> Dict[str, Any]:
59
+ if model in self.rem:
60
+ return self.clients[model].chat(messages, tools=tools, **kwargs) # type: ignore
61
+ raise RuntimeError(f"Modelo local no implementado para: {model}")
62
+
63
+ # ---- ASR ----
64
+ def asr_transcribe(self, audio_path: str, model: str = "whisper-catalan", **kwargs) -> Dict[str, Any]:
65
+ if model in self.rem:
66
+ return self.clients[model].transcribe(audio_path, **kwargs) # type: ignore
67
+ raise RuntimeError(f"Modelo local no implementado para: {model}")
main_api.py ADDED
@@ -0,0 +1,117 @@
1
+ # ============================
2
+ # PATCH: main_api.py (FastAPI) – refine_narration via remote instruct when configured
3
+ # ============================
4
+ from __future__ import annotations
5
+ from fastapi import FastAPI, UploadFile, File, Form
6
+ from fastapi.responses import JSONResponse
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pathlib import Path
9
+ import shutil
10
+ import uvicorn
11
+ import json
12
+
13
+ from video_processing import process_video_pipeline
14
+ from casting_loader import ensure_chroma, build_faces_index, build_voices_index
15
+ from narration_system import NarrationSystem
16
+ from llm_router import load_yaml, LLMRouter
17
+
18
+ app = FastAPI(title="Veureu Engine API", version="0.2.0")
19
+ app.add_middleware(
20
+ CORSMiddleware,
21
+ allow_origins=["*"],
22
+ allow_credentials=True,
23
+ allow_methods=["*"],
24
+ allow_headers=["*"],
25
+ )
26
+
27
+ ROOT = Path("/tmp/veureu"); ROOT.mkdir(parents=True, exist_ok=True)
28
+
29
+
30
+ @app.get("/")
31
+ def root():
32
+ return {"ok": True, "service": "veureu-engine"}
33
+
34
+
35
+ @app.post("/process_video")
36
+ async def process_video(
37
+ video_file: UploadFile = File(...),
38
+ config_path: str = Form("config.yaml"),
39
+ out_root: str = Form("results"),
40
+ db_dir: str = Form("chroma_db"),
41
+ ):
42
+ tmp_video = ROOT / video_file.filename
43
+ with tmp_video.open("wb") as f:
44
+ shutil.copyfileobj(video_file.file, f)
45
+
46
+ result = process_video_pipeline(str(tmp_video), config_path=config_path, out_root=out_root, db_dir=db_dir)
47
+ return JSONResponse(result)
48
+
49
+
50
+ @app.post("/load_casting")
51
+ async def load_casting(
52
+ faces_dir: str = Form("identities/faces"),
53
+ voices_dir: str = Form("identities/voices"),
54
+ db_dir: str = Form("chroma_db"),
55
+ drop_collections: bool = Form(False),
56
+ ):
57
+ client = ensure_chroma(Path(db_dir))
58
+ n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
59
+ n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
60
+ return {"ok": True, "faces": n_faces, "voices": n_voices}
61
+
62
+
63
+ @app.post("/refine_narration")
64
+ async def refine_narration(
65
+ dialogues_srt: str = Form(...),
66
+ frame_descriptions_json: str = Form("[]"),
67
+ config_path: str = Form("config.yaml"),
68
+ ):
69
+ cfg = load_yaml(config_path)
70
+ frames = json.loads(frame_descriptions_json)
71
+
72
+ # If the instruct model is configured as remote, use the router; otherwise fall back to the existing NarrationSystem
73
+ model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
74
+ use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
75
+
76
+ if use_remote:
77
+ router = LLMRouter(cfg)
78
+ # Simplified refinement implementation using the remote instruct model.
79
+ # Keep the prompt logic aligned with NarrationSystem if 1:1 behaviour is desired.
80
+ system_msg = (
81
+ "Eres un sistema de audiodescripción que cumple UNE-153010. "
82
+ "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
83
+ "Devuelve JSON con {narrative_text, srt_text}."
84
+ )
85
+ prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
86
+ try:
87
+ txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
88
+ out = {}
89
+ try:
90
+ out = json.loads(txt)
91
+ except Exception:
92
+ out = {"narrative_text": txt, "srt_text": ""}
93
+ return {
94
+ "narrative_text": out.get("narrative_text", ""),
95
+ "srt_text": out.get("srt_text", ""),
96
+ "approved": True,
97
+ "critic_feedback": "",
98
+ }
99
+ except Exception as e:
100
+ # Fall back to the local NarrationSystem if the remote call fails
101
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
102
+ res = ns.run(dialogues_srt, frames)
103
+ return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}
104
+
105
+ # Local path (uses the existing NarrationSystem)
106
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
107
+ out = ns.run(dialogues_srt, frames)
108
+ return {
109
+ "narrative_text": out.narrative_text,
110
+ "srt_text": out.srt_text,
111
+ "approved": out.approved,
112
+ "critic_feedback": out.critic_feedback,
113
+ }
114
+
115
+
116
+ if __name__ == "__main__":
117
+ uvicorn.run(app, host="0.0.0.0", port=7860)
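For reference, a minimal sketch of calling these endpoints once the Space is running; the base URL, paths and sample payload are placeholders.

    import json
    import requests

    BASE = "http://localhost:7860"  # placeholder; use the Space URL in production

    # Build/refresh the face and voice indices.
    r = requests.post(f"{BASE}/load_casting",
                      data={"faces_dir": "identities/faces", "voices_dir": "identities/voices"},
                      timeout=600)
    print(r.json())

    # Refine a narration from an existing dialogues SRT plus frame descriptions.
    payload = {
        "dialogues_srt": "1\n00:00:01,000 --> 00:00:02,500\nHola!\n",
        "frame_descriptions_json": json.dumps([{"timestamp": "00:00:01,000",
                                                "description": "Una dona saluda."}]),
        "config_path": "config.yaml",
    }
    print(requests.post(f"{BASE}/refine_narration", data=payload, timeout=600).json())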
narration_system.py ADDED
@@ -0,0 +1,185 @@
1
+ # narration_system.py
2
+ from __future__ import annotations
3
+ from typing import Dict, List, Any
4
+ from langgraph.graph import StateGraph, END
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from dataclasses import dataclass
8
+ import json
9
+ import time
10
+
11
+
12
+ @dataclass
13
+ class NarratorInput:
14
+ dialogues_srt: str
15
+ frame_descriptions: List[Dict[str, Any]] # [{"timestamp": "00:01:23,000", "description": "..."}]
16
+ une_guidelines_path: str
17
+ max_cycles: int = 3
18
+
19
+
20
+ @dataclass
21
+ class NarratorOutput:
22
+ narrative_text: str
23
+ srt_text: str
24
+ critic_feedback: str | None = None
25
+ approved: bool = False
26
+
27
+
28
+ class NarrationSystem:
29
+ """
30
+ LangGraph-based multi-agent system:
31
+ - NarratorNode: generates narration + SRT according to UNE-153010
32
+ - CriticNode: evaluates conformity with UNE and coherence
33
+ - IdentityManagerNode: adjusts character identification if needed
34
+ - BackgroundDescriptorNode: fixes background/scene coherence
35
+ """
36
+
37
+ def __init__(self, model_url: str, une_guidelines_path: str):
38
+ self.model_url = model_url
39
+ self.une_guidelines_path = une_guidelines_path
40
+
41
+ # LLM endpoints (each node could use a different deployment if desired)
42
+ self.narrator_llm = ChatOpenAI(base_url=model_url, model="gpt-4o-mini", temperature=0.6)
43
+ self.critic_llm = ChatOpenAI(base_url=model_url, model="gpt-4o-mini", temperature=0.3)
44
+ self.identity_llm = ChatOpenAI(base_url=model_url, model="gpt-4o-mini", temperature=0.4)
45
+ self.background_llm = ChatOpenAI(base_url=model_url, model="gpt-4o-mini", temperature=0.4)
46
+
47
+ with open(une_guidelines_path, "r", encoding="utf-8") as f:
48
+ self.une_rules = f.read()
49
+
50
+ # Build LangGraph workflow
51
+ self.graph = self.build_graph()
52
+
53
+ # -----------------------------------------------------------
54
+ # LangGraph nodes
55
+ # -----------------------------------------------------------
56
+
57
+ def narrator_node(self, state):
58
+ dialogues = state["dialogues_srt"]
59
+ frames = state["frame_descriptions"]
60
+
61
+ prompt = ChatPromptTemplate.from_template("""
62
+ Eres un narrador de audiodescripciones según la norma UNE-153010.
63
+ Combina coherentemente los diálogos del siguiente SRT con las descripciones de escena dadas.
64
+
65
+ Sigue estas pautas:
66
+ - Genera una narración libre que integre ambos tipos de información.
67
+ - Evita redundancias o descripciones triviales.
68
+ - Limita la duración de las audiodescripciones para que quepan entre los diálogos.
69
+ - Devuelve **dos bloques**:
70
+ 1️⃣ `NARRATION_TEXT`: narración libre completa en texto continuo.
71
+ 2️⃣ `UNE_SRT`: subtítulos con los diálogos y las audiodescripciones UNE.
72
+
73
+ ## DIÁLOGOS SRT
74
+ {dialogues}
75
+
76
+ ## DESCRIPCIONES DE FRAMES
77
+ {frames}
78
+ """)
79
+
80
+ response = self.narrator_llm.invoke(prompt.format(dialogues=dialogues, frames=json.dumps(frames, ensure_ascii=False)))
81
+ return {"narration": response.content, "critic_feedback": None, "approved": False}
82
+
83
+ def critic_node(self, state):
84
+ narration = state["narration"]
85
+ prompt = ChatPromptTemplate.from_template("""
86
+ Actúa como un revisor experto en audiodescripción conforme a la norma UNE-153010.
87
+ Evalúa el siguiente texto y SRT generados, detectando:
88
+ - Incoherencias en asignación de personajes.
89
+ - Errores en la identificación de escenarios.
90
+ - Desviaciones respecto a la norma UNE-153010.
91
+ - Incoherencias narrativas generales.
92
+
93
+ Devuelve:
94
+ - "APPROVED" si el resultado es conforme.
95
+ - En caso contrario, una lista JSON con observaciones clasificadas en:
96
+ - "characters"
97
+ - "scenes"
98
+ - "norma"
99
+ - "coherence"
100
+
101
+ ## NORMA UNE-153010
102
+ {une_rules}
103
+
104
+ ## TEXTO Y SRT A EVALUAR
105
+ {narration}
106
+ """)
107
+
108
+ response = self.critic_llm.invoke(prompt.format(une_rules=self.une_rules, narration=narration))
109
+ text = response.content.strip()
110
+
111
+ if "APPROVED" in text.upper():
112
+ return {"critic_feedback": None, "approved": True}
113
+ return {"critic_feedback": text, "approved": False}
114
+
115
+ def identity_node(self, state):
116
+ fb = state.get("critic_feedback", "")
117
+ narration = state["narration"]
118
+ prompt = ChatPromptTemplate.from_template("""
119
+ El siguiente feedback señala incoherencias en personajes o diálogos.
120
+ Corrige únicamente esos aspectos manteniendo el resto igual.
121
+
122
+ ## FEEDBACK
123
+ {fb}
124
+
125
+ ## TEXTO ORIGINAL
126
+ {narration}
127
+ """)
128
+ response = self.identity_llm.invoke(prompt.format(fb=fb, narration=narration))
129
+ return {"narration": response.content}
130
+
131
+ def background_node(self, state):
132
+ fb = state.get("critic_feedback", "")
133
+ narration = state["narration"]
134
+ prompt = ChatPromptTemplate.from_template("""
135
+ El siguiente feedback señala incoherencias en escenarios o contexto visual.
136
+ Ajusta las descripciones de fondo manteniendo el estilo y duración UNE.
137
+
138
+ ## FEEDBACK
139
+ {fb}
140
+
141
+ ## TEXTO ORIGINAL
142
+ {narration}
143
+ """)
144
+ response = self.background_llm.invoke(prompt.format(fb=fb, narration=narration))
145
+ return {"narration": response.content}
146
+
147
+ # -----------------------------------------------------------
148
+ # Graph assembly
149
+ # -----------------------------------------------------------
150
+
151
+ def build_graph(self):
152
+ g = StateGraph()
153
+ g.add_node("NarratorNode", self.narrator_node)
154
+ g.add_node("CriticNode", self.critic_node)
155
+ g.add_node("IdentityManagerNode", self.identity_node)
156
+ g.add_node("BackgroundDescriptorNode", self.background_node)
157
+
158
+ g.set_entry_point("NarratorNode")
159
+ g.add_edge("NarratorNode", "CriticNode")
160
+ g.add_conditional_edges(
161
+ "CriticNode",
162
+ lambda state: "done" if state.get("approved") else "retry",
163
+ {
164
+ "done": END,
165
+ "retry": "IdentityManagerNode",
166
+ },
167
+ )
168
+ g.add_edge("IdentityManagerNode", "BackgroundDescriptorNode")
169
+ g.add_edge("BackgroundDescriptorNode", "CriticNode")
170
+
171
+ return g.compile()
172
+
173
+ # -----------------------------------------------------------
174
+ # Run loop
175
+ # -----------------------------------------------------------
176
+
177
+ def run(self, dialogues_srt: str, frame_descriptions: List[Dict[str, Any]], max_cycles: int = 3) -> NarratorOutput:
178
+ state = {"dialogues_srt": dialogues_srt, "frame_descriptions": frame_descriptions}
179
+ result = self.graph.invoke(state)
180
+ return NarratorOutput(
181
+ narrative_text=result.get("narration", ""),
182
+ srt_text=result.get("narration", ""), # could be parsed separately if model emits dual block
183
+ critic_feedback=result.get("critic_feedback"),
184
+ approved=result.get("approved", False),
185
+ )
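A minimal sketch of driving this graph end to end. The model URL, UNE guidelines file and sample inputs are placeholders, and an OpenAI-compatible endpoint reachable by ChatOpenAI is assumed.

    # Hypothetical usage; assumes an OpenAI-compatible endpoint and a local UNE_153010.txt file.
    ns = NarrationSystem(model_url="http://localhost:8000/v1", une_guidelines_path="UNE_153010.txt")

    dialogues_srt = "1\n00:00:01,000 --> 00:00:02,500\n- Bon dia!\n"
    frames = [{"timestamp": "00:00:00,500", "description": "Una dona entra en una cuina assolellada."}]

    result = ns.run(dialogues_srt, frames)
    print(result.approved)
    print(result.narrative_text)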
remote_clients.py ADDED
@@ -0,0 +1,96 @@
1
+ # ============================
2
+ # File: remote_clients.py
3
+ # ============================
4
+ from __future__ import annotations
5
+ from typing import Any, Dict, List, Optional
6
+ import os, json, time
7
+ import requests
8
+ from tenacity import retry, stop_after_attempt, wait_exponential
9
+
10
+ try:
11
+ from gradio_client import Client as GradioClient
12
+ except Exception: # pragma: no cover
13
+ GradioClient = None # type: ignore
14
+
15
+
16
+ class BaseRemoteClient:
17
+ def __init__(self, base_url: str, use_gradio: bool = True, hf_token: Optional[str] = None, timeout: int = 120):
18
+ self.base_url = base_url.rstrip("/")
19
+ self.use_gradio = use_gradio and GradioClient is not None
20
+ self.hf_token = hf_token or os.getenv("HF_TOKEN")
21
+ self.timeout = timeout
22
+ self._client = None
23
+ if self.use_gradio:
24
+ # GradioClient accepts the base_url of a public or private Space
25
+ headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else None
26
+ self._client = GradioClient(self.base_url, hf_token=self.hf_token, headers=headers)
27
+
28
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
29
+ def _post_json(self, route: str, payload: Dict[str, Any]) -> Dict[str, Any]:
30
+ url = f"{self.base_url}{route}"
31
+ headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else {}
32
+ r = requests.post(url, json=payload, headers=headers, timeout=self.timeout)
33
+ r.raise_for_status()
34
+ return r.json()
35
+
36
+
37
+ class InstructClient(BaseRemoteClient):
38
+ """Cliente para Space de texto-instrucción (schat)."""
39
+ def generate(self, prompt: str, system: Optional[str] = None, **kwargs) -> str:
40
+ if self.use_gradio and self._client:
41
+ # Assumes a Gradio interface with a single text input that returns text.
42
+ # Adjust "predict" and its parameters to the actual Space (inputs/outputs).
43
+ out = self._client.predict(prompt, api_name="/predict")
44
+ return str(out)
45
+ # Generic HTTP fallback
46
+ data = {"prompt": prompt, "system": system, **kwargs}
47
+ res = self._post_json("/generate", data)
48
+ return res.get("text", "")
49
+
50
+
51
+ class VisionClient(BaseRemoteClient):
52
+ """Cliente para Space de visión (svision)."""
53
+ def describe(self, image_paths: List[str], context: Optional[Dict[str, Any]] = None, **kwargs) -> List[str]:
54
+ if self.use_gradio and self._client:
55
+ # Many Spaces expose a Gradio endpoint that accepts a list of images plus a JSON text field.
56
+ out = self._client.predict(image_paths, json.dumps(context or {}), api_name="/predict")
57
+ # Expected to return one description per image
58
+ if isinstance(out, str):
59
+ try:
60
+ return json.loads(out)
61
+ except Exception:
62
+ return [out]
63
+ return list(out)
64
+ data = {"images": image_paths, "context": context or {}, **kwargs}
65
+ res = self._post_json("/describe", data)
66
+ return res.get("descriptions", [])
67
+
68
+
69
+ class ToolsClient(BaseRemoteClient):
70
+ """Cliente para Space con tool-calling (stools)."""
71
+ def chat(self, messages: List[Dict[str, str]], tools: Optional[List[Dict[str, Any]]] = None, **kwargs) -> Dict[str, Any]:
72
+ if self.use_gradio and self._client:
73
+ out = self._client.predict(json.dumps(messages), json.dumps(tools or []), api_name="/predict")
74
+ if isinstance(out, str):
75
+ try:
76
+ return json.loads(out)
77
+ except Exception:
78
+ return {"text": out}
79
+ return out
80
+ data = {"messages": messages, "tools": tools or [], **kwargs}
81
+ return self._post_json("/chat", data)
82
+
83
+
84
+ class ASRClient(BaseRemoteClient):
85
+ """Cliente para Space de ASR catalán (ars)."""
86
+ def transcribe(self, audio_path: str, **kwargs) -> Dict[str, Any]:
87
+ if self.use_gradio and self._client:
88
+ out = self._client.predict(audio_path, api_name="/predict")
89
+ if isinstance(out, str):
90
+ return {"text": out}
91
+ return out
92
+ headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else {}
93
+ with open(audio_path, "rb") as fh:  # ensure the file handle is closed after the upload
94
+ r = requests.post(f"{self.base_url}/transcribe", files={"file": fh}, data=kwargs, headers=headers, timeout=self.timeout)
95
+ r.raise_for_status()
96
+ return r.json()
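A minimal sketch of using these clients directly, outside the router. The Space URLs are placeholders, and the route/payload shapes follow the assumptions stated in the comments above (a `/predict` Gradio endpoint and a `/generate` HTTP route), not a confirmed Space API.

    # Hypothetical direct usage; base_url values are placeholder Spaces.
    asr = ASRClient(base_url="https://veureu-whisper-catalan.hf.space", use_gradio=True)
    result = asr.transcribe("clip.wav")
    print(result.get("text", ""))

    instruct = InstructClient(base_url="https://veureu-salamandra-instruct.hf.space", use_gradio=False)
    print(instruct.generate("Resumeix el text en una frase.", system="Ets un assistent concís."))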
requirements.txt CHANGED
@@ -1,6 +1,40 @@
1
- fastapi==0.115.0
 
2
  uvicorn[standard]==0.30.6
3
- pydantic==2.9.2
4
  python-multipart==0.0.9
5
- requests==2.32.3
6
- huggingface_hub==0.25.2
1
+ # API
2
+ fastapi==0.114.2
3
  uvicorn[standard]==0.30.6
 
4
  python-multipart==0.0.9
5
+ pydantic>=2.7,<3
6
+ PyYAML>=6.0
7
+ requests>=2.32
8
+ gradio_client>=0.16.0 # to call Spaces that expose a Gradio interface
9
+
10
+ # Core utils
11
+ numpy>=1.26
12
+ Pillow>=10.4
13
+ opencv-python-headless==4.10.0.84
14
+
15
+ # OCR (pick according to vision_tools)
16
+ pytesseract>=0.3 # if Tesseract is used
17
+ easyocr>=1.7 # if EasyOCR is preferred
18
+
19
+ # Audio / video (adjust according to audio_tools)
20
+ librosa>=0.10
21
+ soundfile>=0.12
22
+ pydub>=0.25
23
+ ffmpeg-python>=0.2
24
+ # (note: the ffmpeg binary is not installed via pip; if needed, use Docker or a runtime with ffmpeg installed)
25
+
26
+ # Lightweight ML for OCR clustering
27
+ scikit-learn>=1.5
28
+ sentence-transformers>=3.0
29
+ transformers>=4.44
30
+ # CPU Torch (for sentence-transformers). On CPU Spaces, install the CPU variant:
31
+ torch>=2.3,<3
32
+
33
+ # Vector DB
34
+ chromadb>=0.5.4
35
+
36
+ # (Optional) moviepy, if vision_tools uses it
37
+ moviepy>=2.0
38
+
39
+ # (Optional) robustness helpers
40
+ tenacity>=8.2
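Note that other files in this commit import packages that are not pinned above (among them langgraph and langchain-openai in narration_system.py, and scenedetect, torchaudio, pyannote.audio, speechbrain and deepface/face_recognition in vision_tools.py). A hedged sketch of the missing entries, left unpinned since suitable versions depend on the deployment:

    # Needed by narration_system.py
    langgraph
    langchain-openai
    # Needed by vision_tools.py (scene detection, face backends)
    scenedetect
    deepface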
scripts/client_example.py ADDED
@@ -0,0 +1,33 @@
1
+ # =====================================
2
+ # File: client_example.py (opcional)
3
+ # =====================================
4
+ import json
+ import requests
+ from pathlib import Path
5
+
6
+ class VeureuEngineClient:
7
+ def __init__(self, base_url: str):
8
+ self.base = base_url.rstrip("/")
9
+
10
+ def process_video(self, video_path: str, **kwargs):
11
+ with open(video_path, "rb") as f:
12
+ files = {"video_file": (Path(video_path).name, f, "video/mp4")}
13
+ data = {"config_path": kwargs.get("config_path", "config_veureu.yaml"),
14
+ "out_root": kwargs.get("out_root", "results"),
15
+ "db_dir": kwargs.get("db_dir", "chroma_db")}
16
+ r = requests.post(f"{self.base}/process_video", files=files, data=data, timeout=3600)
17
+ r.raise_for_status()
18
+ return r.json()
19
+
20
+ def load_casting(self, faces_dir: str, voices_dir: str, db_dir: str = "chroma_db", drop_collections: bool = False):
21
+ data = {"faces_dir": faces_dir, "voices_dir": voices_dir, "db_dir": db_dir, "drop_collections": str(drop_collections)}
22
+ r = requests.post(f"{self.base}/load_casting", data=data, timeout=600)
23
+ r.raise_for_status(); return r.json()
24
+
25
+ def refine_narration(self, dialogues_srt: str, frame_descriptions: list, config_path: str = "config.yaml"):
26
+ # The /refine_narration endpoint expects dialogues_srt, frame_descriptions_json and config_path (see main_api.py)
+ data = {
27
+ "dialogues_srt": dialogues_srt,
28
+ "frame_descriptions_json": json.dumps(frame_descriptions, ensure_ascii=False),
29
+ "config_path": config_path,
31
+ }
32
+ r = requests.post(f"{self.base}/refine_narration", data=data, timeout=600)
33
+ r.raise_for_status(); return r.json()
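A minimal usage sketch of this example client; the Space URL, media file and directories are placeholders.

    # Hypothetical usage of the client above.
    client = VeureuEngineClient("https://veureu-veureu-engine.hf.space")

    casting = client.load_casting("identities/faces", "identities/voices")
    print(casting)

    result = client.process_video("sample.mp4", out_root="results", db_dir="chroma_db")
    print(result["files"]["analysis_path"])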
video_processing.py ADDED
@@ -0,0 +1,141 @@
1
+ # ==================================
2
+ # File: video_processing_refactor.py
3
+ # (drop-in replacement for process_video_pipeline in video_processing.py)
4
+ # ==================================
5
+ from __future__ import annotations
6
+ from typing import Any, Dict, List, Optional
7
+ from pathlib import Path
8
+ import json
9
+ import cv2
10
+ import yaml
11
+ import logging
12
+
13
+ from chromadb.config import Settings
14
+ import chromadb
15
+
16
+ from audio_tools import process_audio_for_video
17
+ from background_descriptor import build_keyframes_and_per_second, describe_keyframes_with_llm
18
+ from identity_manager import IdentityManager
19
+
20
+
21
+ log = logging.getLogger("video_processing")
22
+ if not log.handlers:
23
+ h = logging.StreamHandler(); h.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
24
+ log.addHandler(h)
25
+ log.setLevel(logging.INFO)
26
+
27
+
28
+ def _ensure_dir(p: Path) -> Path:
29
+ p.mkdir(parents=True, exist_ok=True)
30
+ return p
31
+
32
+
33
+ def _ensure_chroma(db_dir: str | Path):
34
+ _ensure_dir(Path(db_dir))
35
+ # chromadb >= 0.4 removed the legacy "duckdb+parquet" Settings backend; with the pinned chromadb>=0.5.4 use PersistentClient
36
+ return chromadb.PersistentClient(path=str(db_dir), settings=Settings(anonymized_telemetry=False))
40
+
41
+
42
+ def load_config(path: str) -> Dict[str, Any]:
43
+ p = Path(path)
44
+ if not p.exists():
45
+ return {}
46
+ return yaml.safe_load(p.read_text(encoding="utf-8")) or {}
47
+
48
+
49
+ def process_video_pipeline(
50
+ video_path: str,
51
+ *,
52
+ config_path: str = "config_veureu.yaml",
53
+ out_root: str = "results",
54
+ db_dir: str = "chroma_db",
55
+ ) -> Dict[str, Any]:
56
+ cfg = load_config(config_path)
57
+ out_dir = _ensure_dir(Path(out_root) / Path(video_path).stem)
58
+
59
+ # Video metadata
60
+ cap = cv2.VideoCapture(str(video_path))
61
+ if not cap.isOpened():
62
+ raise RuntimeError(f"Cannot open video: {video_path}")
63
+ fps = float(cap.get(cv2.CAP_PROP_FPS)) or 25.0
64
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
65
+ duration = (total_frames / fps) if total_frames > 0 else 0.0
66
+ cap.release()
67
+
68
+ # Optional Chroma DB
69
+ face_col = voice_col = None
70
+ if cfg.get("database", {}).get("enabled", True):
71
+ client = _ensure_chroma(cfg.get("database", {}).get("persist_directory", db_dir))
72
+ if cfg.get("database", {}).get("enable_face_recognition", True):
73
+ try:
74
+ face_col = client.get_collection(cfg.get("database", {}).get("face_collection", "index_faces"))
75
+ except Exception:
76
+ face_col = None
77
+ if cfg.get("database", {}).get("enable_voice_recognition", True):
78
+ try:
79
+ voice_col = client.get_collection(cfg.get("database", {}).get("voice_collection", "index_voices"))
80
+ except Exception:
81
+ voice_col = None
82
+
83
+ # 1) Background descriptor (frames, OCR, descriptions)
84
+ keyframes, per_second, _ = build_keyframes_and_per_second(video_path, out_dir, cfg, face_collection=face_col)
85
+
86
+ # Adjust each keyframe's `end` using the total duration
87
+ for i in range(len(keyframes)):
88
+ if i < len(keyframes) - 1:
89
+ keyframes[i]["end"] = keyframes[i + 1]["start"]
90
+ else:
91
+ keyframes[i]["end"] = round(duration, 2)
92
+
93
+ # 2) LLM description
94
+ face_identities = {f.get("identity") for fr in per_second for f in (fr.get("faces") or []) if f.get("identity")}
95
+ keyframes, montage_path = describe_keyframes_with_llm(keyframes, out_dir, face_identities=face_identities, config_path=config_path)
96
+
97
+ # 3) Audio pipeline
98
+ audio_segments, srt_unmodified_path, full_transcription = process_audio_for_video(video_path=str(video_path), out_dir=out_dir, cfg=cfg, voice_collection=voice_col)
99
+
100
+ # 4) Identity manager: enriquecer frames y clips
101
+ im = IdentityManager(face_collection=face_col, voice_collection=voice_col)
102
+ per_second = im.assign_faces_to_frames(per_second)
103
+ keyframes = im.assign_faces_to_frames(keyframes)
104
+ audio_segments = im.assign_voices_to_segments(audio_segments, distance_threshold=cfg.get("voice_processing", {}).get("speaker_identification", {}).get("distance_threshold"))
105
+
106
+ # 5) Map identities onto time ranges
107
+ keyframes = im.map_identities_over_ranges(per_second, keyframes, key="faces", out_key="persona")
108
+ audio_segments = im.map_identities_over_ranges(per_second, audio_segments, key="faces", out_key="persona")
109
+
110
+ # 6) Export analysis.json
111
+ frames_analysis = [{
112
+ "frame_number": fr.get("id"),
113
+ "start": fr.get("start"),
114
+ "end": fr.get("end"),
115
+ "ocr": fr.get("ocr", ""),
116
+ "persona": fr.get("persona", []),
117
+ "description": fr.get("description", ""),
118
+ } for fr in keyframes]
119
+
120
+ analysis = {
121
+ "frames": frames_analysis,
122
+ "audio_segments": [{k: v for k, v in seg.items() if k != "voice_embedding"} for seg in audio_segments],
123
+ "full_transcription": full_transcription,
124
+ }
125
+ analysis_path = out_dir / f"{Path(video_path).stem}_analysis.json"
126
+ analysis_path.write_text(json.dumps(analysis, indent=2, ensure_ascii=False), encoding="utf-8")
127
+
128
+ return {
129
+ "output_dir": str(out_dir),
130
+ "files": {
131
+ "montage_path": montage_path,
132
+ "srt_path": srt_unmodified_path,
133
+ "analysis_path": str(analysis_path),
134
+ },
135
+ "stats": {
136
+ "duration_seconds": duration,
137
+ "total_frames": total_frames,
138
+ "frames_processed": len(keyframes),
139
+ "audio_segments_processed": len(audio_segments),
140
+ },
141
+ }
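A minimal sketch of running this pipeline directly from Python; the video path is a placeholder and the casting indices are assumed to already exist under `chroma_db`.

    # Hypothetical direct invocation of the pipeline above.
    if __name__ == "__main__":
        summary = process_video_pipeline(
            "sample.mp4",
            config_path="config_veureu.yaml",
            out_root="results",
            db_dir="chroma_db",
        )
        print(summary["stats"])
        print(summary["files"]["analysis_path"])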
vision_tools.py ADDED
@@ -0,0 +1,573 @@
1
+ # vision_tools.py
2
+ # -----------------------------------------------------------------------------
3
+ # Veureu — VISION utilities (self-contained)
4
+ # - Image processing and analysis
5
+ # - Object detection and recognition
6
+ # - Face detection and recognition
7
+ # - Scene description
8
+ # - Montage sequence analysis
9
+ # -----------------------------------------------------------------------------
10
+ from __future__ import annotations
11
+
12
+
13
+ import os
14
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
15
+
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Optional, Tuple
19
+
20
+ import json
21
+ import logging
22
+ import math
23
+ import os
24
+ import shlex
25
+ import subprocess
26
+
27
+ import numpy as np
28
+ import torch
29
+ import torchaudio
30
+ import torchaudio.transforms as T
31
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
32
+ from pyannote.audio import Pipeline as PyannotePipeline
33
+ from speechbrain.pretrained import SpeakerRecognition
34
+ from pydub import AudioSegment
35
+ from sklearn.cluster import KMeans
36
+ from sklearn.metrics import silhouette_score
37
+ from scenedetect import VideoManager, SceneManager
38
+ from scenedetect.detectors import ContentDetector
39
+
40
+ import os, base64, requests, subprocess, contextlib, time
41
+
42
+ from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
43
+ from PIL import Image
44
+
45
+ from libs.audio_tools_ana_2 import process_audio_for_video
46
+
47
+ import cv2
48
+
49
+ try:
50
+ import face_recognition # type: ignore
51
+ except Exception:
52
+ face_recognition = None # type: ignore
53
+
54
+ try:
55
+ # Utility wrapper for DeepFace
56
+ from libs.face_utils import FaceRecognizer as DFRecognizer
57
+ except Exception:
58
+ try:
59
+ from face_utils import FaceRecognizer as DFRecognizer
60
+ except Exception:
61
+ DFRecognizer = None # type: ignore
62
+
63
+ try:
64
+ from deepface import DeepFace
65
+ except ImportError:
66
+ DeepFace = None
67
+
68
+ import easyocr
69
+
70
+ # -------------------------------- Logging ------------------------------------
71
+ log = logging.getLogger("audio_tools")
72
+ if not log.handlers:
73
+ h = logging.StreamHandler()
74
+ h.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
75
+ log.addHandler(h)
76
+ log.setLevel(logging.INFO)
77
+
78
+ # ============================ UTILS ===========================================
79
+ def load_config(path: str = "configs/config_veureu.yaml") -> Dict[str, Any]:
80
+ p = Path(path)
81
+ if not p.exists():
82
+ log.warning("Config file not found: %s (using defaults)", path)
83
+ return {}
84
+ try:
85
+ import yaml
86
+ cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
87
+ cfg["__path__"] = str(p)
88
+ return cfg
89
+ except Exception as e:
90
+ log.error("Failed to read YAML config: %s", e)
91
+ return {}
92
+
93
+ # ---------------------------- IMAGE EMBEDDING ----------------------------------
94
+ class FaceOfImageEmbedding:
95
+ """Preferred backend: `face_recognition`; fallback: DeepFace via libs.face_utils."""
96
+ def __init__(self, deepface_model: str = 'Facenet512'):
97
+ self.use_fr = face_recognition is not None
98
+ self.df = None
99
+ if not self.use_fr and DFRecognizer is not None:
100
+ try:
101
+ self.df = DFRecognizer(model_name=deepface_model)
102
+ log.info("Using DeepFace (%s) as face embedding backend.", deepface_model)
103
+ except Exception as e:
104
+ log.warning("Failed to initialize DeepFace: %s", e)
105
+ elif self.use_fr:
106
+ log.info("Using face_recognition as face embedding backend.")
107
+ else:
108
+ log.error("No face embedding backend available.")
109
+
110
+ def encode_image(self, image_path: Path) -> Optional[List[float]]:
111
+ import numpy as np
112
+ try:
113
+ if self.use_fr:
114
+ img = face_recognition.load_image_file(str(image_path)) # type: ignore
115
+ encs = face_recognition.face_encodings(img)
116
+ if encs:
117
+ # Normalize each embedding to unit norm
118
+ embeddings = [(e / np.linalg.norm(e)).astype(float).tolist() for e in encs]
119
+ return embeddings
120
+ return None
121
+
122
+ if self.df is not None:
123
+ emb = self.df.get_face_embedding_from_path(str(image_path))
124
+ if emb is None:
125
+ return None
126
+ # Convert to a numpy array and normalize
127
+ emb = np.array(emb, dtype=float)
128
+ emb = emb / np.linalg.norm(emb)
129
+ return emb.tolist()
130
+
131
+ except Exception as e:
132
+ log.debug("Fallo embedding cara %s: %s", image_path, e)
133
+
134
+ return None
135
+
136
+ class FaceAnalyzer:
137
+ """Wrapper sencillo para DeepFace que obtiene edad y género de una imagen."""
138
+ def __init__(self, actions=None):
139
+ if actions is None:
140
+ actions = ["age", "gender"]
141
+ self.actions = actions
142
+
143
+ def analyze_image(self, img_path: str) -> Optional[Dict[str, Any]]:
144
+ try:
145
+ result = DeepFace.analyze(img_path=img_path, actions=self.actions)
146
+
147
+ # If DeepFace returns a list (multiple faces), take the first one
148
+ if isinstance(result, list) and len(result) > 0:
149
+ result = result[0]
150
+
151
+ # Now we can safely access 'age' and 'dominant_gender'
152
+ return {
153
+ "age": result.get("age", "unknown"),
154
+ "gender": result.get("dominant_gender", "unknown")
155
+ }
156
+
157
+ except Exception as e:
158
+ log.warning("No se pudo analizar la imagen %s: %s", img_path, e)
159
+ return None
160
+
161
+ # ----------------------------------- FUNCTIONS -------------------------------------
162
+ def map_identities_per_second(frames_per_second, intervals):
163
+ for seg in intervals:
164
+ seg_start = seg["start"]
165
+ seg_end = seg["end"]
166
+
167
+ # collect identities from the frames within the segment range
168
+ identities = []
169
+ for f in frames_per_second:
170
+ if seg_start <= f["start"] <= seg_end:
171
+ for face in f.get("faces", []):
172
+ identities.append(face)
173
+
174
+ # count occurrences
175
+ seg["counts"] = dict(Counter(identities))
176
+
177
+ return intervals
178
+
179
+ def _split_montage(img: np.ndarray, n: int, cfg: Dict[str, Any]) -> List[np.ndarray]:
180
+ vd = cfg.get('vision_describer', {})
181
+ montage_cfg = vd.get('montage', {})
182
+ mode = montage_cfg.get('split_mode', 'horizontal') # 'horizontal'|'vertical'|'grid'
183
+
184
+ h, w = img.shape[:2]
185
+ tiles: List[np.ndarray] = []
186
+
187
+ if mode == 'vertical':
188
+ tile_h = h // n
189
+ for i in range(n):
190
+ y0 = i * tile_h; y1 = h if i == n-1 else (i+1) * tile_h
191
+ tiles.append(img[y0:y1, 0:w])
192
+ return tiles
193
+
194
+ if mode == 'grid':
195
+ rows = int(montage_cfg.get('rows', 1) or 1)
196
+ cols = int(montage_cfg.get('cols', n) or n)
197
+ assert rows * cols >= n, "grid rows*cols must be >= n"
198
+ tile_h = h // rows; tile_w = w // cols
199
+ k = 0
200
+ for r in range(rows):
201
+ for c in range(cols):
202
+ if k >= n: break
203
+ y0, y1 = r*tile_h, h if (r==rows-1) else (r+1)*tile_h
204
+ x0, x1 = c*tile_w, w if (c==cols-1) else (c+1)*tile_w
205
+ tiles.append(img[y0:y1, x0:x1]); k += 1
206
+ return tiles
207
+
208
+ tile_w = w // n
209
+ for i in range(n):
210
+ x0 = i * tile_w; x1 = w if i == n-1 else (i+1) * tile_w
211
+ tiles.append(img[0:h, x0:x1])
212
+ return tiles
213
+
214
+ def generar_montage(frame_paths: List[str], output_dir: str) -> str:
215
+ output_path = Path(output_dir)
216
+ output_path.mkdir(parents=True, exist_ok=True)
217
+ montage_path = ""
218
+
219
+ if frame_paths:
220
+ imgs = [cv2.imread(kf) for kf in frame_paths if os.path.exists(kf)]
221
+ imgs = [img for img in imgs if img is not None]
222
+ print(f"Se encontraron {len(imgs)} imágenes para el montaje.")
223
+
224
+ if imgs:
225
+ h = max(img.shape[0] for img in imgs) # maximum height
226
+ imgs_resized = [cv2.resize(img, (int(img.shape[1]*h/img.shape[0]), h)) for img in imgs]
227
+ montage = cv2.hconcat(imgs_resized)
228
+ montage_path = os.path.join(output_dir, "keyframes_montage.jpg")
229
+ print(f"Guardando montaje en: {montage_path}")
230
+ cv2.imwrite(montage_path, montage)
231
+ print("Montaje guardado.")
232
+ else:
233
+ print("No se encontraron imágenes válidas para el montaje.")
234
+
235
+ return montage_path
236
+
237
+ def describe_montage_sequence(
238
+ montage_path: str,
239
+ n: int,
240
+ informacion,
241
+ face_identities,
242
+ *,
243
+ config_path: str = 'configs/config_veureu.yaml'
244
+ ) -> List[str]:
245
+ """Describe each sub-image of a montage.
246
+
247
+ Inputs
248
+ ------
249
+ montage_path: str
250
+ Path to a composite image made of n sub-images placed sequentially.
251
+ n: int
252
+ Number of sub-images to split and describe.
253
+ config_path: str
254
+ Path to YAML with 'vision_describer' configuration (provider and params).
255
+
256
+ Returns
257
+ -------
258
+ List[str]: one description per sub-image.
259
+ """
260
+
261
+ path_model = "BSC-LT/salamandra-7b-vision"
262
+
263
+ processor = AutoProcessor.from_pretrained(path_model)
264
+ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
265
+ path_model,
266
+ torch_dtype=torch.float16,
267
+ low_cpu_mem_usage=True
268
+ ).to("cuda")
269
+
270
+ img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
271
+ if img is None:
272
+ raise RuntimeError(f"No se puede leer la imagen: {montage_path}")
273
+
274
+ cfg = load_config(config_path)
275
+ tiles = _split_montage(img, n, cfg)
276
+ if len(tiles) < n:
277
+ raise RuntimeError(f"Se produjeron {len(tiles)} tiles, se esperaban {n}")
278
+
279
+ # Convert each tile to a PIL Image
280
+ tile_images = [Image.fromarray(cv2.cvtColor(t, cv2.COLOR_BGR2RGB)) for t in tiles]
281
+
282
+ sys_prompt = (
283
+ "Ets un expert en narrativa visual. "
284
+ "Descriu la imatge de manera molt breu i senzilla, en català, "
285
+ "explicant només l’acció principal que s’hi veu. "
286
+ "Respon amb una sola frase curta (10–20 paraules com a màxim), "
287
+ "sense afegir detalls innecessaris ni descriure l’entorn."
288
+ )
289
+
290
+ all_results = []
291
+
292
+ for i in range(len(tile_images)):
293
+ batch = [tile_images[i]] # list with a single tile
294
+
295
+ conversation = [
296
+ {"role": "system", "content": sys_prompt},
297
+ {"role": "user", "content": [
298
+ {"type": "image", "image": batch[0]},
299
+ {"type": "text", "text": (
300
+ f"Descriu la imatge de manera molt breu i senzilla, en català. ")}
301
+ ]}
302
+ ]
303
+ prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
304
+
305
+ inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
306
+ for k, v in inputs.items():
307
+ if v.dtype.is_floating_point:
308
+ inputs[k] = v.to("cuda", torch.float16)
309
+ else:
310
+ inputs[k] = v.to("cuda")
311
+
312
+ output = model.generate(**inputs, max_new_tokens=1024)
313
+ text = processor.decode(output[0], skip_special_tokens=True)
314
+ lines = text.split("\n")
315
+
316
+ desc = ""
317
+ for j, line in enumerate(lines):  # separate index to avoid shadowing the outer tile loop variable
318
+ if line.lower().startswith(" assistant"):
319
+ desc = "\n".join(lines[j+1:]).strip()
320
+ break
321
+
322
+ all_results.append(desc)
323
+ torch.cuda.empty_cache()
324
+
325
+ return all_results
326
+
327
+ # --------------------------- IMAGES EXTRACTION -----------------------------
328
+ def keyframe_conditional_extraction_ana(
329
+ video_path,
330
+ output_dir,
331
+ threshold=30.0,
332
+ offset_frames=10
333
+ ):
334
+ """
335
+ Detects scene changes in a video, saves one frame per change, and
336
+ returns intervals with start and end times based on the keyframe timestamps
337
+ (the montage itself is built separately by `generar_montage`).
338
+ """
339
+ if not os.path.exists(output_dir):
340
+ os.makedirs(output_dir)
341
+
342
+ video_manager = VideoManager([video_path])
343
+ scene_manager = SceneManager()
344
+ scene_manager.add_detector(ContentDetector(threshold=threshold))
345
+
346
+ video_manager.start()
347
+ scene_manager.detect_scenes(video_manager)
348
+
349
+ scene_list = scene_manager.get_scene_list()
350
+
351
+ cap = cv2.VideoCapture(video_path)
352
+ fps = cap.get(cv2.CAP_PROP_FPS)
353
+ total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
354
+ video_duration = total_frames / fps
355
+
356
+ keyframes = []
357
+ for i, (start_time, end_time) in enumerate(scene_list):
358
+ frame_number = int(start_time.get_frames()) + offset_frames
359
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
360
+ ret, frame = cap.read()
361
+ if ret:
362
+ ts = frame_number / fps
363
+ frame_path = os.path.join(output_dir, f"scene_{i+1:03d}.jpg")
364
+ cv2.imwrite(frame_path, frame)
365
+ keyframes.append({
366
+ "index": i+1,
367
+ "time": round(ts, 2),
368
+ "path": frame_path
369
+ })
370
+
371
+ cap.release()
372
+ video_manager.release()
373
+
374
+ # Build intervals with start and end times
375
+ intervals = []
376
+ for i, kf in enumerate(keyframes):
377
+ start = kf["time"]
378
+ if i < len(keyframes) - 1:
379
+ end = keyframes[i+1]["time"]
380
+ else:
381
+ end = video_duration # last scene runs to the end of the video
382
+ intervals.append({
383
+ "index": kf["index"],
384
+ "start": start,
385
+ "end": round(end, 2),
386
+ "path": kf["path"]
387
+ })
388
+
389
+ return intervals
390
+
391
+ def keyframe_every_second(
392
+ video_path: str,
393
+ output_dir: str = ".",
394
+ max_frames: Optional[int] = 10000,
395
+ ) -> List[dict]:
396
+ """
397
+ Extracts one frame per second of the video.
398
+
399
+ Returns:
400
+ List[dict]: each element is {"index", "start", "end", "path"}
401
+ """
402
+ out_dir = Path(output_dir)
403
+ out_dir.mkdir(parents=True, exist_ok=True)
404
+
405
+ cap = cv2.VideoCapture(str(video_path))
406
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
407
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
408
+ duration = total_frames / fps
409
+
410
+ frames: List[dict] = []
411
+ idx = 0
412
+ sec = 0.0
413
+
414
+ while sec <= duration:
415
+ frame_number = int(sec * fps)
416
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
417
+ ret, frame = cap.read()
418
+ if not ret:
419
+ break
420
+
421
+ timestamp = frame_number / fps
422
+ frame_path = out_dir / f"frame_per_second{idx:03d}.jpg"
423
+ cv2.imwrite(str(frame_path), frame)
424
+
425
+ frames.append({
426
+ "index": idx + 1,
427
+ "start": round(timestamp, 2),
428
+ "end": None, # lo completamos después
429
+ "path": str(frame_path),
430
+ })
431
+
432
+ idx += 1
433
+ sec += 1.0
434
+
435
+ if max_frames and idx >= max_frames:
436
+ break
437
+
438
+ cap.release()
439
+
440
+ # Fill each "end" with the start of the next frame
441
+ for i in range(len(frames)):
442
+ if i < len(frames) - 1:
443
+ frames[i]["end"] = frames[i+1]["start"]
444
+ else:
445
+ frames[i]["end"] = round(duration, 2)
446
+
447
+ return frames
448
+
449
+ from collections import Counter, defaultdict
450
+
451
+ # --------------------------- FRAMES PROCESSING -----------------------------
452
+ def process_frames(
453
+ frames: List[dict], # cada elemento es {"index", "start", "end", "path"}
454
+ config: dict,
455
+ face_col=None,
456
+ embedding_model=None,
457
+ ) -> List[dict]:
458
+ """
459
+ Processes keyframes:
460
+ - Detects faces
461
+ - Generates embeddings with FaceOfImageEmbedding
462
+ - Optionally compares against face_col (top-3 KNN)
463
+ - Optionally runs OCR
464
+ """
465
+
466
+ frame_results = []
467
+
468
+ # Create embedding_model if one is not provided
469
+ if embedding_model is None:
470
+ from libs.vision_tools import FaceOfImageEmbedding
471
+ embedding_model = FaceOfImageEmbedding()
472
+
473
+ for idx, frame in enumerate(frames):
474
+ frame_path = frame["path"]
475
+
476
+ try:
477
+ raw_faces = embedding_model.encode_image(Path(frame_path))
478
+ except Exception as e:
479
+ print(f"Error procesando {frame_path}: {e}")
480
+ raw_faces = None
481
+
482
+ faces = []
483
+ if raw_faces is not None:
484
+ if isinstance(raw_faces[0], list): # múltiples
485
+ for e in raw_faces:
486
+ faces.append({"embedding": e})
487
+ else: # uno solo
488
+ faces.append({"embedding": raw_faces})
489
+
490
+ faces_detected = []
491
+ for f in faces:
492
+ embedding = f.get("embedding")
493
+ identity = "Unknown"
494
+ knn = []
495
+
496
+ if face_col is not None and embedding is not None:
497
+ try:
498
+ num_embeddings = face_col.count()
499
+ if num_embeddings < 1:
500
+ knn = []
501
+ identity = "Unknown"
502
+
503
+ else:
504
+ n_results = min(3, num_embeddings)
505
+ q = face_col.query(
506
+ query_embeddings=[embedding],
507
+ n_results=n_results,
508
+ include=["metadatas", "distances"]
509
+ )
510
+
511
+ knn = []
512
+ metas = q.get("metadatas", [[]])[0]
513
+ dists = q.get("distances", [[]])[0]
514
+ for meta, dist in zip(metas, dists):
515
+ person_id = meta.get("identity", "Unknown") if isinstance(meta, dict) else "Unknown"
516
+ knn.append({"identity": person_id, "distance": float(dist)})
517
+
518
+ if knn and knn[0]["distance"] < 0.6:
519
+ identity = knn[0]["identity"]
520
+ else:
521
+ identity = "Unknown"
522
+
523
+ except Exception as e:
524
+ print(f"Face KNN failed: {e}")
525
+ knn = []
526
+ identity = "Unknown"
527
+
528
+ faces_detected.append(identity)
529
+
530
+ ocr_text_easyocr = ""  # default so the field is defined even if OCR fails or is disabled
+ use_easyocr = True
531
+ if use_easyocr:
532
+ try:
533
+ # Note: building the Reader per frame is expensive; it could be created once outside the loop
+ reader = easyocr.Reader(['en', 'es'], gpu=True) # set gpu=False if no GPU is available
534
+ results = reader.readtext(frame_path)
535
+ ocr_text_easyocr = " ".join([text for _, text, _ in results]).strip()
536
+
537
+ except Exception as e:
538
+ print(f"OCR error: {e}")
539
+
540
+ frame_results.append({
541
+ "id": frame["index"],
542
+ "start": frame["start"],
543
+ "end": frame["end"],
544
+ "image_path": frame_path,
545
+ "faces": faces_detected,
546
+ "ocr": ocr_text_easyocr,
547
+ })
548
+
549
+ return frame_results
550
+
551
+ if __name__ == "__main__":
552
+ import argparse
553
+ ap = argparse.ArgumentParser(description="Veureu — Audio tools (self-contained)")
554
+ ap.add_argument("--video", required=True)
555
+ ap.add_argument("--out", default="results")
556
+ ap.add_argument("--config", default="configs/config_veureu.yaml")
557
+ args = ap.parse_args()
558
+
559
+ # Lightweight config loader (only for sample run)
560
+ import yaml
561
+ cfg = {}
562
+ p = Path(args.config)
563
+ if p.exists():
564
+ cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
565
+
566
+ out_dir = Path(args.out) / Path(args.video).stem
567
+ out_dir.mkdir(parents=True, exist_ok=True)
568
+
569
+ segs, srt = process_audio_for_video(args.video, out_dir, cfg, voice_collection=None)
570
+ print(json.dumps({
571
+ "segments": len(segs),
572
+ "srt": srt
573
+ }, indent=2, ensure_ascii=False))
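As a complement to the audio-oriented sample run above, a minimal sketch of exercising the vision helpers defined in this file. Paths are placeholders, no Chroma face collection is assumed, and the description step requires a GPU because it loads the Salamandra vision model on CUDA.

    # Hypothetical vision-side walkthrough of the helpers above.
    frames = keyframe_every_second("sample.mp4", output_dir="results/sample/frames", max_frames=60)
    processed = process_frames(frames, config={}, face_col=None)  # faces + OCR per frame
    intervals = keyframe_conditional_extraction_ana("sample.mp4", "results/sample/keyframes")
    montage = generar_montage([kf["path"] for kf in intervals], "results/sample")
    descriptions = describe_montage_sequence(montage, len(intervals), None, None)
    for interval, desc in zip(intervals, descriptions):
        print(interval["start"], interval["end"], desc)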