VeuReu committed
Commit b17b915 · verified · 1 Parent(s): 7e13059

Upload 15 files

LICENSE ADDED
@@ -0,0 +1,15 @@
+ MIT License
+
+ Copyright (c) 2025 - Dive-tech
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND.
README.md CHANGED
@@ -1,10 +1,22 @@
  ---
- title: veureu-engine
- emoji: 📉
- colorFrom: green
  colorTo: blue
  sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ title: Veureu Engine (FastAPI)
+ emoji: 🎧
+ colorFrom: gray
  colorTo: blue
  sdk: docker
+ app_file: main_api.py
  pinned: false
  ---

+ # Veureu Engine API
+
+ Endpoints:
+ - `POST /process_video` — orchestrates `video_processing.py` (audio_tools + vision_tools + identity/background managers); see the request sketch below.
+ - `POST /load_casting` — builds the face and voice indices in ChromaDB.
+ - `POST /refine_narration` — invokes `narration_system.py` to refine the narration and SRT according to UNE-153010.
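A minimal request sketch against these endpoints; the Space URL and the multipart field name are assumptions, since the actual request/response schemas are defined in `main_api.py` (not shown in this excerpt).

```python
# Hypothetical client call: BASE_URL and the "file" field name are assumptions,
# not taken from main_api.py; adjust them to the real FastAPI route signatures.
import requests

BASE_URL = "https://veureu-veureu-engine.hf.space"  # placeholder Space URL

with open("sample.mp4", "rb") as f:
    r = requests.post(
        f"{BASE_URL}/process_video",
        files={"file": ("sample.mp4", f, "video/mp4")},
        timeout=3600,  # video processing can take a long time
    )
r.raise_for_status()
print(r.json())
```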
audio_tools.py ADDED
@@ -0,0 +1,725 @@
1
+ # audio_tools.py
2
+ # -----------------------------------------------------------------------------
3
+ # Veureu — AUDIO utilities (self-contained)
4
+ # - FFmpeg extraction (WAV)
5
+ # - Diarization (pyannote)
6
+ # - ASR:
7
+ # * Catalan ("ca") -> AINA Whisper
8
+ # * Other languages -> Lightweight generic Whisper
9
+ # - Integrated Language ID (Whisper via faster-whisper)
10
+ # - Voice embeddings (SpeechBrain ECAPA)
11
+ # - Speaker identification (KMeans + optional ChromaDB collection)
12
+ # - SRT generation
13
+ # - Orchestrator: process_audio_for_video(...)
14
+ # - ADDED: ASR of full audio and LLM-based SRT correction
15
+ # -----------------------------------------------------------------------------
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
+ import json
23
+ import logging
24
+ import math
25
+ import os
26
+ import shlex
27
+ import subprocess
28
+
29
+ import numpy as np
30
+ import torch
31
+ import torchaudio
32
+ import torchaudio.transforms as T
33
+ from pydub import AudioSegment
34
+ from pyannote.audio import Pipeline
35
+ from speechbrain.pretrained import SpeakerRecognition
36
+ from sklearn.cluster import KMeans
37
+ from sklearn.metrics import silhouette_score
38
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
39
+ from openai import OpenAI as OpenAIClient
40
+ import noisereduce as nr
41
+
42
+ # -------------------------------- Logging ------------------------------------
43
+ log = logging.getLogger("audio_tools")
44
+ if not log.handlers:
45
+ _h = logging.StreamHandler()
46
+ _h.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
47
+ log.addHandler(_h)
48
+ log.setLevel(logging.INFO)
49
+
50
+ # ------------------------------- Utilities -----------------------------------
51
+
52
+ def _pick_device_auto(dev_cfg: str) -> str:
53
+ """Resolve 'auto' device to cuda/cpu."""
54
+ if dev_cfg == "auto":
55
+ return "cuda" if torch.cuda.is_available() else "cpu"
56
+ return dev_cfg
57
+
58
+ def load_config(path: str = "configs/config_veureu.yaml") -> Dict[str, Any]:
59
+ p = Path(path)
60
+ if not p.exists():
61
+ log.warning("Config file not found: %s (using defaults)", path)
62
+ return {}
63
+ try:
64
+ import yaml
65
+ cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
66
+ cfg["__path__"] = str(p)
67
+ return cfg
68
+ except Exception as e:
69
+ log.error("Failed to read YAML config: %s", e)
70
+ return {}
71
+
72
+ # ------------------------------- Extraction ----------------------------------
73
+
74
+ def extract_audio_ffmpeg(
75
+ video_path: str,
76
+ audio_out: Path,
77
+ sr: int = 16000,
78
+ mono: bool = True,
79
+ ) -> str:
80
+ """Extract audio from video to WAV using ffmpeg."""
81
+ audio_out.parent.mkdir(parents=True, exist_ok=True)
82
+ cmd = f'ffmpeg -y -i "{video_path}" -vn {"-ac 1" if mono else ""} -ar {sr} -f wav "{audio_out}"'
83
+ subprocess.run(
84
+ shlex.split(cmd),
85
+ check=True,
86
+ stdout=subprocess.DEVNULL,
87
+ stderr=subprocess.DEVNULL,
88
+ )
89
+ return str(audio_out)
90
+
91
+ # ----------------------------------- ASR -------------------------------------
92
+
93
+ @dataclass
94
+ class AinaASR:
95
+ """ASR for Catalan using the AINA Whisper model."""
96
+ model_name: str = "projecte-aina/whisper-large-v3-ca-3catparla"
97
+ device: str = "cuda"
98
+
99
+ def __post_init__(self):
100
+ dev = self.device
101
+ if dev == "cuda" and not torch.cuda.is_available():
102
+ dev = "cpu"
103
+ self.processor = WhisperProcessor.from_pretrained(self.model_name)
104
+ self.model = WhisperForConditionalGeneration.from_pretrained(self.model_name).to(dev)
105
+ self.device = dev
106
+ log.info(f"ASR AINA loaded on {self.device}: {self.model_name}")
107
+
108
+ def transcribe_wav(self, wav_path: str) -> str:
109
+ waveform, sr = torchaudio.load(wav_path)
110
+ inputs = self.processor(
111
+ waveform.numpy(), sampling_rate=sr, return_tensors="pt"
112
+ ).input_features.to(self.model.device)
113
+ with torch.no_grad():
114
+ ids = self.model.generate(inputs, max_new_tokens=440)[0]
115
+ txt = self.processor.decode(ids)
116
+ norm = getattr(self.processor.tokenizer, "_normalize", None)
117
+ return norm(txt) if callable(norm) else txt
118
+
119
+ def transcribe_long_audio(
120
+ self,
121
+ wav_path: str,
122
+ chunk_length_s: int = 20,
123
+ overlap_s: int = 2,
124
+ ) -> str:
125
+ waveform, sr = torchaudio.load(wav_path)
126
+ total_samples = waveform.shape[1]
127
+ chunk_size = chunk_length_s * sr
128
+ overlap_size = overlap_s * sr
129
+
130
+ transcriptions = []
131
+ start = 0
132
+
133
+ while start < total_samples:
134
+ end = min(start + chunk_size, total_samples)
135
+ chunk = waveform[:, start:end]
136
+
137
+ input_features = self.processor(
138
+ chunk.numpy(),
139
+ sampling_rate=sr,
140
+ return_tensors="pt"
141
+ ).input_features.to(self.model.device)
142
+
143
+ with torch.no_grad():
144
+ predicted_ids = self.model.generate(
145
+ input_features,
146
+ max_new_tokens=440,
147
+ num_beams=1, # you can try beam search here
148
+ )[0]
149
+
150
+ text = self.processor.decode(predicted_ids, skip_special_tokens=True)
151
+ transcriptions.append(text.strip())
152
+
153
+ # advance with overlap
154
+ start += chunk_size - overlap_size
155
+
156
+ return " ".join(transcriptions).strip()
157
+
158
+ @dataclass
159
+ class WhisperASR:
160
+ """Lightweight generic ASR based on Whisper for non-Catalan languages."""
161
+ model_name: str = "openai/whisper-small" # change to 'base' for an even lighter model
162
+ device: str = "cuda"
163
+ language: Optional[str] = None # force language, e.g. "es", "en", etc.
164
+
165
+ def __post_init__(self):
166
+ dev = self.device
167
+ if dev == "cuda" and not torch.cuda.is_available():
168
+ dev = "cpu"
169
+ self.processor = WhisperProcessor.from_pretrained(self.model_name)
170
+ self.model = WhisperForConditionalGeneration.from_pretrained(self.model_name).to(dev)
171
+ self.device = dev
172
+ log.info(f"ASR Whisper loaded on {self.device}: {self.model_name} (lang hint: {self.language})")
173
+
174
+ def transcribe_wav(self, wav_path: str) -> str:
175
+ waveform, sr = torchaudio.load(wav_path)
176
+ inputs = self.processor(
177
+ waveform.numpy(), sampling_rate=sr, return_tensors="pt"
178
+ ).input_features.to(self.model.device)
179
+
180
+ gen_kwargs: Dict[str, Any] = dict(max_new_tokens=444)
181
+ if self.language and self.language != "auto":
182
+ try:
183
+ forced_ids = self.processor.get_decoder_prompt_ids(
184
+ language=self.language, task="transcribe"
185
+ )
186
+ gen_kwargs["forced_decoder_ids"] = forced_ids
187
+ except Exception:
188
+ # If the model/processor does not support forced ids, continue without forcing
189
+ pass
190
+
191
+ with torch.no_grad():
192
+ ids = self.model.generate(inputs, **gen_kwargs)[0]
193
+ txt = self.processor.decode(ids)
194
+ norm = getattr(self.processor.tokenizer, "_normalize", None)
195
+ return norm(txt) if callable(norm) else txt
196
+
197
+ # ------------------------------ Language ID ----------------------------------
198
+
199
+ @dataclass
200
+ class WhisperLIDConfig:
201
+ """Configuration for language detection with faster-whisper."""
202
+ model_name: str = "Systran/faster-whisper-small"
203
+ device: str = "auto"
204
+ compute_type: str = "float32" # "int8" | "float16" | "float32"
205
+ beam_size: int = 1
206
+ chunk_seconds: float = 30.0
207
+ prob_threshold: float = 0.5
208
+ fallback_lang: str = "auto"
209
+
210
+ def detect_language_with_whisper(
211
+ wav_path: str,
212
+ cfg: Dict[str, Any],
213
+ ) -> Tuple[str, float]:
214
+ """
215
+ Detects language using faster-whisper (WhisperModel). Returns (lang_iso, prob).
216
+ In case of failure, returns (fallback_lang, 0.0).
217
+ """
218
+ lid_cfg_d = (cfg.get("asr", {})
219
+ .get("language_detection", {})
220
+ .get("whisper_lid", {}))
221
+ lid_cfg = WhisperLIDConfig(
222
+ model_name=lid_cfg_d.get("model_name", "Systran/faster-whisper-small",),
223
+ device=_pick_device_auto(lid_cfg_d.get("device", "auto")),
224
+ compute_type=lid_cfg_d.get("compute_type", "float32"),
225
+ beam_size=int(lid_cfg_d.get("beam_size", 1)),
226
+ chunk_seconds=float(lid_cfg_d.get("chunk_seconds", 30.0)),
227
+ prob_threshold=float(lid_cfg_d.get("prob_threshold", 0.5)),
228
+ fallback_lang=lid_cfg_d.get("fallback_lang", "auto"),
229
+ )
230
+
231
+ try:
232
+ from faster_whisper import WhisperModel # type: ignore
233
+ except Exception as e:
234
+ log.warning(f"LID: faster-whisper not available ({e}). Fallback='{lid_cfg.fallback_lang}'")
235
+ return lid_cfg.fallback_lang, 0.0
236
+
237
+ try:
238
+ model = WhisperModel(lid_cfg.model_name, device=lid_cfg.device, compute_type=lid_cfg.compute_type)
239
+ except Exception as e:
240
+ log.warning(f"LID: failed to load '{lid_cfg.model_name}': {e}. Fallback='{lid_cfg.fallback_lang}'")
241
+ return lid_cfg.fallback_lang, 0.0
242
+
243
+ try:
244
+ segments, info = model.transcribe(
245
+ wav_path,
246
+ beam_size=lid_cfg.beam_size,
247
+ vad_filter=True,
248
+ without_timestamps=True,
249
+ language=None
250
+ )
251
+ lang = info.language or lid_cfg.fallback_lang
252
+ prob = float(info.language_probability or 0.0)
253
+ if prob < lid_cfg.prob_threshold:
254
+ return lid_cfg.fallback_lang, prob
255
+ return lang, prob
256
+ except Exception as e:
257
+ log.warning(f"LID: error in transcription/detection: {e}. Fallback='{lid_cfg.fallback_lang}'")
258
+ return lid_cfg.fallback_lang, 0.0
259
+
260
+
261
+ def _build_asr_backend_for_language(lang_iso: str, cfg: Dict[str, Any]):
262
+ """
263
+ Selects ASR backend based on language:
264
+ - 'ca' -> AINA
265
+ - other -> Generic Whisper
266
+ """
267
+ asr_cfg = cfg.get("asr", {})
268
+ device_pref = _pick_device_auto(asr_cfg.get("device", "auto"))
269
+ if lang_iso and lang_iso.lower() == "ca":
270
+ return AinaASR(
271
+ model_name=asr_cfg.get("model_name", "projecte-aina/whisper-large-v3-ca-3catparla"),
272
+ device=device_pref,
273
+ )
274
+ else:
275
+ return WhisperASR(
276
+ model_name=asr_cfg.get("whisper_model_name", "openai/whisper-small"),
277
+ device=device_pref,
278
+ language=None,
279
+ )
280
+
281
+ # -------------------------------- Diarization --------------------------------
282
+ from pathlib import Path
283
+ from typing import List, Dict, Any, Tuple
284
+ from pydub import AudioSegment
285
+ from pyannote.audio import Pipeline
286
+ import math
287
+
288
+ def diarize_audio(
289
+ wav_path: str,
290
+ base_dir: Path,
291
+ clips_folder: str = "clips",
292
+ min_segment_duration: float = 20,
293
+ max_segment_duration: float = 50.0,
294
+ hf_token_env: str | None = None,
295
+ ) -> Tuple[List[str], List[Dict[str, Any]]]:
296
+ """Diarization with pyannote and clip export with pydub.
297
+ Returns clip paths and segments [{'start','end','speaker'}].
298
+ """
299
+ audio = AudioSegment.from_wav(wav_path)
300
+ duration = len(audio) / 1000.0
301
+
302
+ pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=(hf_token_env
303
+ or os.getenv("HF_TOKEN")))
304
+ diarization = pipeline(wav_path)
305
+
306
+ clips_dir = (base_dir / clips_folder)
307
+ clips_dir.mkdir(parents=True, exist_ok=True)
308
+ clip_paths: List[str] = []
309
+ segments: List[Dict[str, Any]] = []
310
+ spk_map: Dict[str, int] = {}
311
+
312
+ prev_end = 0.0 # reference to the end of the last exported segment
313
+
314
+ for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
315
+ start, end = max(0.0, float(turn.start)), min(duration, float(turn.end))
316
+
317
+ if start < prev_end:
318
+ start = prev_end
319
+ if end <= start:
320
+ continue
321
+
322
+ seg_dur = end - start
323
+ if seg_dur < min_segment_duration:
324
+ continue
325
+
326
+ if seg_dur > max_segment_duration:
327
+ # split long segments
328
+ n = int(math.ceil(seg_dur / max_segment_duration))
329
+ sub_d = seg_dur / n
330
+ for j in range(n):
331
+ s = start + j * sub_d
332
+ e = min(end, start + (j + 1) * sub_d)
333
+ if e <= s:
334
+ continue
335
+ clip = audio[int(s * 1000):int(e * 1000)]
336
+ cp = clips_dir / f"segment_{i:03d}_{j:02d}.wav"
337
+ clip.export(cp, format="wav")
338
+ if speaker not in spk_map:
339
+ spk_map[speaker] = len(spk_map)
340
+ segments.append({
341
+ "start": s,
342
+ "end": e,
343
+ "speaker": f"SPEAKER_{spk_map[speaker]:02d}"
344
+ })
345
+ clip_paths.append(str(cp))
346
+ prev_end = e
347
+ else:
348
+ clip = audio[int(start * 1000):int(end * 1000)]
349
+ cp = clips_dir / f"segment_{i:03d}.wav"
350
+ clip.export(cp, format="wav")
351
+ if speaker not in spk_map:
352
+ spk_map[speaker] = len(spk_map)
353
+ segments.append({
354
+ "start": start,
355
+ "end": end,
356
+ "speaker": f"SPEAKER_{spk_map[speaker]:02d}"
357
+ })
358
+ clip_paths.append(str(cp))
359
+ prev_end = end # update the reference
360
+
361
+ if not segments:
362
+ # fallback single clip
363
+ cp = clips_dir / "segment_000.wav"
364
+ audio.export(cp, format="wav")
365
+ return [str(cp)], [{"start": 0.0, "end": duration, "speaker": "SPEAKER_00"}]
366
+
367
+ # sort by start time
368
+ pairs = sorted(zip(clip_paths, segments), key=lambda x: x[1]["start"])
369
+ clip_paths, segments = [p[0] for p in pairs], [p[1] for p in pairs]
370
+ return clip_paths, segments
371
+
372
+ # ------------------------------ Voice embeddings -----------------------------
373
+
374
+ class VoiceEmbedder:
375
+ def __init__(self):
376
+ self.model = SpeakerRecognition.from_hparams(
377
+ source="speechbrain/spkrec-ecapa-voxceleb",
378
+ savedir="pretrained_models/spkrec-ecapa-voxceleb",
379
+ )
380
+ self.model.eval()
381
+
382
+ def embed(self, wav_path: str) -> List[float]:
383
+ waveform, sr = torchaudio.load(wav_path)
384
+ target_sr = 16000
385
+ if sr != target_sr:
386
+ waveform = T.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
387
+ if waveform.shape[0] > 1:
388
+ waveform = waveform.mean(dim=0, keepdim=True)
389
+ # ensure minimum length (~0.2s) for stability
390
+ min_samples = int(0.2 * target_sr)
391
+ if waveform.shape[1] < min_samples:
392
+ pad = min_samples - waveform.shape[1]
393
+ waveform = torch.cat([waveform, torch.zeros((1, pad))], dim=1)
394
+ with torch.no_grad():
395
+ emb = (
396
+ self.model.encode_batch(waveform)
397
+ .squeeze()
398
+ .cpu()
399
+ .numpy()
400
+ .astype(float)
401
+ )
402
+ return emb.tolist()
403
+
404
+
405
+ def embed_voice_segments(clip_paths: List[str]) -> List[List[float]]:
406
+ ve = VoiceEmbedder()
407
+ out: List[List[float]] = []
408
+ for cp in clip_paths:
409
+ try:
410
+ out.append(ve.embed(cp))
411
+ except Exception as e:
412
+ log.warning(f"Embedding error in {cp}: {e}")
413
+ out.append([])
414
+ return out
415
+
416
+ # --------------------------- Speaker identification --------------------------
417
+
418
+ def identify_speakers(
419
+ embeddings: List[List[float]],
420
+ voice_collection, # ChromaDB collection with .query or None
421
+ cfg: Dict[str, Any],
422
+ ) -> List[str]:
423
+ voice_cfg = cfg.get("voice_processing", {}).get("speaker_identification", {})
424
+ if not embeddings or sum(1 for e in embeddings if e) < 2:
425
+ return ["SPEAKER_00" for _ in embeddings]
426
+
427
+ valid = [e for e in embeddings if e and len(e) > 0]
428
+ if len(valid) < 2:
429
+ return ["SPEAKER_00" for _ in embeddings]
430
+
431
+ min_clusters = max(1, voice_cfg.get("min_speakers", 1))
432
+ max_clusters = min(voice_cfg.get("max_speakers", 5), len(valid) - 1)
433
+
434
+ # find the optimal k using silhouette_score
435
+ if voice_cfg.get("find_optimal_clusters", True) and len(valid) > 2:
436
+ best_score = -1.0
437
+ best_k = min_clusters
438
+ for k in range(min_clusters, max_clusters + 1):
439
+ if k >= len(valid):
440
+ break
441
+ km = KMeans(n_clusters=k, random_state=42, n_init="auto")
442
+ labels = km.fit_predict(valid)
443
+ if len(set(labels)) > 1:
444
+ score = silhouette_score(valid, labels)
445
+ if score > best_score:
446
+ best_score, best_k = score, k
447
+ else:
448
+ best_k = min(max_clusters, max(min_clusters, voice_cfg.get("num_speakers", 2)))
449
+ best_k = max(1, min(best_k, len(valid) - 1))
450
+
451
+ # final clustering
452
+ km = KMeans(n_clusters=best_k, random_state=42, n_init="auto", init="k-means++")
453
+ labels = km.fit_predict(np.array(valid))
454
+ centers = km.cluster_centers_
455
+
456
+ cluster_to_name: Dict[int, str] = {}
457
+ unknown_counter = 0
458
+ for cid in range(best_k):
459
+ center = centers[cid].tolist()
460
+ name = f"SPEAKER_{cid:02d}"
461
+
462
+ if voice_collection is not None:
463
+ try:
464
+ q = voice_collection.query(query_embeddings=[center], n_results=1)
465
+ metas = q.get("metadatas", [[]])[0]
466
+ dists = q.get("distances", [[]])[0]
467
+ thr = voice_cfg.get("distance_threshold")
468
+
469
+ if dists and thr is not None and dists[0] > thr:
470
+ # new speaker → mark as UNKNOWN and store it in the collection
471
+ name = f"UNKNOWN_{unknown_counter}"
472
+ unknown_counter += 1
473
+ voice_collection.add(
474
+ embeddings=[center],
475
+ metadatas=[{"name": name}],
476
+ ids=[f"unk_{cid}_{unknown_counter}"]
477
+ )
478
+ else:
479
+ # acceptable match → use the existing name
480
+ if metas and isinstance(metas[0], dict):
481
+ name = metas[0].get("nombre") or metas[0].get("name") \
482
+ or metas[0].get("speaker") or metas[0].get("identity") \
483
+ or name
484
+ except Exception as e:
485
+ log.warning(f"Voice KNN query failed: {e}")
486
+
487
+ cluster_to_name[cid] = name
488
+
489
+ # map each embedding to its speaker
490
+ personas: List[str] = []
491
+ vi = 0
492
+ for emb in embeddings:
493
+ if not emb:
494
+ personas.append("UNKNOWN")
495
+ else:
496
+ label = int(labels[vi])
497
+ personas.append(cluster_to_name.get(label, f"SPEAKER_{label:02d}"))
498
+ vi += 1
499
+
500
+ return personas
501
+
502
+ # ----------------------------------- SRT -------------------------------------
503
+
504
+ def _fmt_srt_time(seconds: float) -> str:
505
+ h = int(seconds // 3600)
506
+ m = int((seconds % 3600) // 60)
507
+ s = int(seconds % 60)
508
+ ms = int(round((seconds - int(seconds)) * 1000))
509
+ return f"{h:02}:{m:02}:{s:02},{ms:03}"
510
+
511
+
512
+ def generate_srt_from_diarization(
513
+ diarization_segments: List[Dict[str, Any]],
514
+ transcriptions: List[str],
515
+ speakers_per_segment: List[str],
516
+ output_srt_path: str,
517
+ cfg: Dict[str, Any],
518
+ ) -> None:
519
+ subs = cfg.get("subtitles", {})
520
+ max_cpl = int(subs.get("max_chars_per_line", 42))
521
+ max_lines = int(subs.get("max_lines_per_cue", 10))
522
+ speaker_display = subs.get("speaker_display", "brackets")
523
+
524
+ items: List[Dict[str, Any]] = []
525
+ n = min(len(diarization_segments), len(transcriptions), len(speakers_per_segment))
526
+ for i in range(n):
527
+ seg = diarization_segments[i]
528
+ text = (transcriptions[i] or "").strip()
529
+ spk = speakers_per_segment[i]
530
+ items.append(
531
+ {
532
+ "start": float(seg.get("start", 0.0)),
533
+ "end": float(seg.get("end", 0.0)),
534
+ "text": text,
535
+ "speaker": spk,
536
+ }
537
+ )
538
+
539
+ out = Path(output_srt_path)
540
+ out.parent.mkdir(parents=True, exist_ok=True)
541
+ with out.open("w", encoding="utf-8-sig") as f:
542
+ for i, it in enumerate(items, 1):
543
+ text = it["text"]
544
+ spk = it["speaker"]
545
+ if speaker_display == "brackets" and spk:
546
+ text = f"[{spk}]: {text}" # Adjusted format to match new script's style
547
+ elif speaker_display == "prefix" and spk:
548
+ text = f"{spk}: {text}"
549
+
550
+ # simple word wrap
551
+ words = text.split()
552
+ lines: List[str] = []
553
+ cur = ""
554
+ for w in words:
555
+ if len(cur) + len(w) + (1 if cur else 0) <= max_cpl:
556
+ cur = (cur + " " + w) if cur else w
557
+ else:
558
+ lines.append(cur)
559
+ cur = w
560
+ if len(lines) >= max_lines - 1:
561
+ break
562
+ if cur and len(lines) < max_lines:
563
+ lines.append(cur)
564
+ f.write(f"{i}\n{_fmt_srt_time(it['start'])} --> {_fmt_srt_time(it['end'])}\n")
565
+ f.write("\n".join(lines) + "\n\n")
566
+
567
+ # ------------------------------ Orchestrator ---------------------------------
568
+
569
+ def process_audio_for_video(
570
+ video_path: str,
571
+ out_dir: Path,
572
+ cfg: Dict[str, Any],
573
+ voice_collection=None,
574
+ ) -> Tuple[List[Dict[str, Any]], Optional[str], str]:
575
+ """
576
+ Audio pipeline: FFmpeg -> diarization -> LID -> ASR -> embeddings -> speaker-ID -> SRT.
577
+ Returns (audio_segments, srt_path or None, full_transcription).
578
+ """
579
+ # 1) Audio extraction
580
+ audio_cfg = cfg.get("audio_processing", {})
581
+ sr = int(audio_cfg.get("sample_rate", 16000))
582
+ fmt = audio_cfg.get("format", "wav")
583
+ wav_path = extract_audio_ffmpeg(
584
+ video_path, out_dir / f"{Path(video_path).stem}.{fmt}", sr=sr
585
+ )
586
+ log.info("Audio extracted")
587
+
588
+ # 2) Diarization
589
+ diar_cfg = audio_cfg.get("diarization", {})
590
+ min_dur = float(diar_cfg.get("min_segment_duration", 0.5))
591
+ max_dur = float(diar_cfg.get("max_segment_duration", 10.0))
592
+ clip_paths, diar_segs = diarize_audio(
593
+ wav_path, out_dir, "clips", min_dur, max_dur
594
+ )
595
+ log.info("Audio clips generated.")
596
+
597
+ # 3) Language detection (optional) + ASR backend selection
598
+ asr_cfg = cfg.get("asr", {})
599
+ lid_enabled = bool(asr_cfg.get("language_detection", {}).get("enabled", True))
600
+
601
+ device_pref = _pick_device_auto(asr_cfg.get("device", "auto"))
602
+
603
+ aina_asr = AinaASR(model_name=asr_cfg.get("model_name", "projecte-aina/whisper-large-v3-ca-3catparla"),
604
+ device=device_pref)
605
+
606
+ whisper_asr = WhisperASR(model_name=asr_cfg.get("whisper_model_name", "openai/whisper-small"),
607
+ device=device_pref,
608
+ language=None)
609
+
610
+ full_transcription = ""
611
+ if asr_cfg.get("enable_full_transcription", True):
612
+ log.info("Starting transcription of the full audio")
613
+ # Assume Catalan model for full transcription, or add logic to check language
614
+ full_transcription = aina_asr.transcribe_long_audio(wav_path, chunk_length_s=30)
615
+ log.info("Full-audio transcription finished.")
616
+ print(full_transcription)
617
+
618
+ # Transcribe each segment
619
+ log.info("Starting transcription of each clip.")
620
+ trans: List[str] = []
621
+ detected_langs: List[str] = []
622
+ detected_probs: List[float] = []
623
+ for path in clip_paths:
624
+ if not lid_enabled:
625
+ txt = aina_asr.transcribe_wav(path)
+ detected_langs.append("auto")
+ detected_probs.append(0.0)
626
+ else:
627
+ detected_lang, detected_prob = detect_language_with_whisper(path, cfg)
628
+ log.info(f"LID: detected={detected_lang} (p={detected_prob:.2f})")
+ # keep the per-clip language info aligned with clip_paths
+ detected_langs.append(detected_lang)
+ detected_probs.append(detected_prob)
629
+
630
+ if detected_lang.lower() in ["ca", "catalan"]:
631
+ txt = aina_asr.transcribe_wav(path)
632
+ else:
633
+ txt = whisper_asr.transcribe_wav(path)
634
+ trans.append(txt)
635
+
636
+ log.info("All clips have been transcribed.")
637
+
638
+ # 5) Embeddings + speaker identification
639
+ if audio_cfg.get("enable_voice_embeddings", True):
640
+ embeddings = embed_voice_segments(clip_paths)
641
+ log.info("Embeddings created successfully for each clip.")
642
+ else:
643
+ embeddings = [[] for _ in clip_paths]
644
+
645
+ if cfg.get("voice_processing", {}).get("speaker_identification", {}).get("enabled", True):
646
+ speakers = identify_speakers(embeddings, voice_collection, cfg)
647
+ log.info("Speakers identified successfully.")
648
+ else:
649
+ speakers = [seg.get("speaker", f"SPEAKER_{i:02d}") for i, seg in enumerate(diar_segs)]
650
+
651
+ # 6) Build the segment table
652
+ audio_segments: List[Dict[str, Any]] = []
653
+ for i, seg in enumerate(diar_segs):
654
+ audio_segments.append(
655
+ {
656
+ "segment": i,
657
+ "start": float(seg.get("start", 0.0)),
658
+ "end": float(seg.get("end", 0.0)),
659
+ "speaker": speakers[i] if i < len(speakers) else seg.get("speaker", f"SPEAKER_{i:02d}"),
660
+ "text": trans[i] if i < len(trans) else "",
661
+ "voice_embedding": embeddings[i],
662
+ "clip_path": str(out_dir / "clips" / f"segment_{i:03d}.wav"),
663
+ "lang": detected_langs[i] if i < len(detected_langs) else "auto",
664
+ "lang_prob": detected_probs[i] if i < len(detected_probs) else 0.0,
665
+ }
666
+ )
667
+
668
+ # 7) SRT
669
+ srt_base_path = out_dir / f"transcripcion_diarizada_{Path(video_path).stem}"
670
+ srt_unmodified_path = str(srt_base_path) + "_unmodified.srt"
671
+
672
+ # Generate initial SRT
673
+ try:
674
+ generate_srt_from_diarization(
675
+ diar_segs,
676
+ [a["text"] for a in audio_segments],
677
+ [a["speaker"] for a in audio_segments],
678
+ srt_unmodified_path,
679
+ cfg,
680
+ )
681
+
682
+ except Exception as e:
683
+ log.warning(f"SRT generation failed: {e}")
684
+ srt_unmodified_path = None
685
+
686
+ return audio_segments, srt_unmodified_path, full_transcription
687
+
688
+ # ----------------------------------- CLI -------------------------------------
689
+ if __name__ == "__main__":
690
+ import argparse
691
+ import yaml
692
+
693
+ ap = argparse.ArgumentParser(description="Veureu — Audio tools (self-contained)")
694
+ ap.add_argument("--video", required=True)
695
+ ap.add_argument("--out", default="results")
696
+ ap.add_argument("--config", default="configs/config_veureu.yaml")
697
+ args = ap.parse_args()
698
+
699
+ cfg: Dict[str, Any] = {}
700
+ p = Path(args.config)
701
+ if p.exists():
702
+ try:
703
+ cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
704
+ except Exception as e:
705
+ log.warning(f"Could not read the YAML config file: {e}")
706
+
707
+ out_dir = Path(args.out) / Path(args.video).stem
708
+ out_dir.mkdir(parents=True, exist_ok=True)
709
+
710
+ # Add an OpenAI API key to your configuration file or here
711
+ # Example: cfg["api_keys"] = {"openai": "sk-your-openai-api-key"}
712
+ # Make sure you do not commit the key to git!
713
+
714
+ segs, srt, full_transcription = process_audio_for_video(args.video, out_dir, cfg, voice_collection=None)
715
+
716
+ print(json.dumps(
717
+ {
718
+ "segments": len(segs),
719
+ "srt": srt,
720
+ "detected_lang": (segs[0].get("lang") if segs else "auto"),
721
+ "detected_prob": (segs[0].get("lang_prob") if segs else 0.0),
722
+ },
723
+ indent=2,
724
+ ensure_ascii=False,
725
+ ))
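A short usage sketch of the orchestrator above, mirroring the CLI block at the end of the file; the config path and video name are placeholders, and no ChromaDB voice collection is passed, so speaker names fall back to the clustered SPEAKER_xx labels.

```python
# Usage sketch for process_audio_for_video (placeholder paths; voice_collection=None
# means speaker names come from KMeans clustering rather than a ChromaDB lookup).
from pathlib import Path

from audio_tools import load_config, process_audio_for_video

cfg = load_config("configs/config_veureu.yaml")
out_dir = Path("results") / "sample"
out_dir.mkdir(parents=True, exist_ok=True)

segments, srt_path, full_text = process_audio_for_video(
    "sample.mp4", out_dir, cfg, voice_collection=None
)
print(f"{len(segments)} segments, SRT: {srt_path}")
```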
background_descriptor.py ADDED
@@ -0,0 +1,43 @@
1
+ # ================================
2
+ # PATCH: background_descriptor.py (describe_keyframes_with_llm)
3
+ # ================================
4
+ # We replace the direct call to describe_montage_sequence with router.vision_describe
5
+ # and keep the existing function as a fallback.
6
+
7
+ # (replace the function in this file with this version)
8
+
9
+ def describe_keyframes_with_llm(
10
+ keyframes: List[Dict[str, Any]],
11
+ out_dir: Path,
12
+ face_identities: Optional[set] = None,
13
+ config_path: str | None = None,
14
+ ) -> Tuple[List[Dict[str, Any]], Optional[str]]:
15
+ from llm_router import load_yaml, LLMRouter
16
+ cfg = load_yaml(config_path or "config.yaml")
17
+ model_name = (cfg.get("background_descriptor", {}).get("description", {}) or {}).get("model", "salamandra-vision")
18
+
19
+ frame_paths = [k.get("image_path") for k in keyframes if k.get("image_path")]
20
+ montage_dir = out_dir / "montage"
21
+ montage_path = None
22
+ if frame_paths:
23
+ montage_path = generar_montage(frame_paths, montage_dir)
24
+ context = {
25
+ "informacion": [{k: v for k, v in fr.items() if k in ("start", "end", "ocr", "faces")} for fr in keyframes],
26
+ "face_identities": sorted(list(face_identities or set()))
27
+ }
28
+ try:
29
+ router = LLMRouter(cfg)
30
+ descs = router.vision_describe(frame_paths, context=context, model=model_name)
31
+ except Exception:
32
+ # Fall back to the existing local implementation if the remote call fails
33
+ descs = describe_montage_sequence(
34
+ montage_path=str(montage_path),
35
+ n=len(frame_paths),
36
+ informacion=keyframes,
37
+ face_identities=face_identities or set(),
38
+ config_path=config_path or "config_veureu.yaml",
39
+ )
40
+ for i, fr in enumerate(keyframes):
41
+ if i < len(descs):
42
+ fr["description"] = descs[i]
43
+ return keyframes, str(montage_path) if montage_path else None
casting_loader.py ADDED
@@ -0,0 +1,340 @@
1
+ # casting_loader.py (replaces identity_encoding.py; updated to use libs/*)
2
+ # Veureu — Identity Encoder (faces, voices, scenarios)
3
+ # -----------------------------------------------------------------------------
4
+ # This script replaces the original `identity_encoding.py` but **reuses**
5
+ # as much as possible the functions already present in `libs/`.
6
+ # It respects the project's path structure (identities/*, scenarios, chroma_db,
7
+ # results) and maintains the classic pipeline:
8
+ # 1) index_faces (ChromaDB)
9
+ # 2) identity_features.csv
10
+ # 3) index_voices (ChromaDB)
11
+ # 4) scenarios_descriptions.csv
12
+ # 5) index_scenarios (ChromaDB)
13
+ # -----------------------------------------------------------------------------
14
+ from __future__ import annotations
15
+ import argparse
16
+ import csv
17
+ import logging
18
+ import sys
19
+ import uuid
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
23
+
24
+ # ============================ LOGGING ========================================
25
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
26
+ log = logging.getLogger("identity_encoding")
27
+
28
+ # ============================ DEPENDENCIES ===================================
29
+ # ChromaDB (persistent)
30
+ try:
31
+ import chromadb
32
+ from chromadb.config import Settings # noqa: F401
33
+ except Exception as e:
34
+ chromadb = None # type: ignore
35
+ log.error("Could not import chromadb: %s", e)
36
+
37
+ from libs.vision_tools_salamandra import FaceAnalyzer
38
+ from collections import Counter
39
+
40
+ # Audio: reuse get_embedding from the existing pipeline
41
+ from libs.audio_tools_ana_2 import VoiceEmbedder
42
+ from libs.vision_tools_salamandra import FaceOfImageEmbedding
43
+
44
+ # Optional
45
+ try:
46
+ import numpy as np
47
+ except Exception:
48
+ np = None # type: ignore
49
+
50
+ # ============================ UTILITIES =====================================
51
+ IMG_EXT = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
52
+ AUD_EXT = {".wav", ".mp3", ".flac", ".m4a", ".ogg"}
53
+
54
+
55
+ def list_files(root: Path, exts: Iterable[str]) -> List[Path]:
56
+ root = Path(root)
57
+ if not root.exists():
58
+ return []
59
+ return [p for p in root.rglob('*') if p.suffix.lower() in exts]
60
+
61
+
62
+ def ensure_chroma(db_dir: Path):
63
+ if chromadb is None:
64
+ raise RuntimeError("chromadb is not installed. pip install chromadb")
65
+ db_dir.mkdir(parents=True, exist_ok=True)
66
+
67
+ # New way to create a persistent client
68
+ client = chromadb.Client(Settings(
69
+ chroma_db_impl="duckdb+parquet",
70
+ persist_directory=str(db_dir)
71
+ ))
72
+ return client
73
+
74
+ # ============================ 1) INDEX FACES =================================
75
+ def build_faces_index(faces_dir: Path, client, collection_name: str = "index_faces",
76
+ deepface_model: str = 'Facenet512', drop: bool = True) -> int:
77
+ # idempotency
78
+ if collection_name in [c.name for c in client.list_collections()] and drop:
79
+ client.delete_collection(name=collection_name)
80
+ col = client.get_or_create_collection(name=collection_name)
81
+
82
+ be = FaceOfImageEmbedding(deepface_model=deepface_model)
83
+ count = 0
84
+ registered_identities = set() # 👈 so names are not repeated
85
+
86
+ for ident_dir in sorted(Path(faces_dir).iterdir() if Path(faces_dir).exists() else []):
87
+ if not ident_dir.is_dir():
88
+ continue
89
+ ident = ident_dir.name
90
+ for img_path in list_files(ident_dir, IMG_EXT):
91
+ embeddings = be.encode_image(img_path)
92
+ if embeddings is None:
93
+ log.warning("No face embedding in %s", img_path)
94
+ continue
95
+
96
+ # Flatten so each embedding is a list of floats
97
+ for e in (embeddings if isinstance(embeddings[0], list) else [embeddings]):
98
+ uid = str(uuid.uuid4())
99
+ col.add(ids=[uid], embeddings=[e], metadatas=[{"identity": ident, "path": str(img_path)}])
100
+ count += 1
101
+ registered_identities.add(ident) # 👈 store the name
102
+
103
+ # Final messages
104
+ print("Finished building the database.")
105
+ print(f"Total embeddings stored: {count}")
106
+ print("Registered identities:")
107
+ for name in sorted(registered_identities):
108
+ print(f" - {name}")
109
+
110
+ log.info("index_faces => %d embeddings", count)
111
+ return count
112
+
113
+ # ===================== 2) IDENTITY FEATURES CSV ==============================
114
+
115
+ def aggregate_face_attributes(faces_dir: Path, out_csv: Path) -> int:
116
+ """
117
+ Processes a per-identity faces directory and generates a CSV with age and gender.
118
+ Uses FaceAnalyzer to extract attributes.
119
+ """
120
+ # Initialize the analyzer
121
+ from libs.vision_tools_salamandra import FaceAnalyzer
122
+ analyzer = FaceAnalyzer()
123
+
124
+ rows: List[Dict[str, Any]] = []
125
+
126
+ faces_dir = Path(faces_dir)
127
+ if not faces_dir.exists() or not faces_dir.is_dir():
128
+ log.error("Faces directory does not exist: %s", faces_dir)
129
+ return 0
130
+
131
+ def most_common(lst, default="unknown"):
132
+ return Counter(lst).most_common(1)[0][0] if lst else default
133
+
134
+ # Iterate over each identity
135
+ for ident_dir in sorted(faces_dir.iterdir()):
136
+ if not ident_dir.is_dir():
137
+ continue
138
+ ident = ident_dir.name
139
+ attrs: List[Dict[str, Any]] = []
140
+
141
+ log.info("Processing identity: %s", ident)
142
+
143
+ for img_path in sorted(list_files(ident_dir, IMG_EXT)):
144
+ try:
145
+ data = analyzer.analyze_image(str(img_path))
146
+ if data:
147
+ attrs.append(data)
148
+ except Exception as e:
149
+ log.warning("Error processing image %s: %s", img_path, e)
150
+
151
+ genders = [a.get("gender", "unknown") for a in attrs]
152
+ ages = [a.get("age", "unknown") for a in attrs]
153
+
154
+ # Optional per-identity context
155
+ context_txt = (faces_dir.parent / "context" / f"{ident}.txt")
156
+ identity_context = context_txt.read_text(encoding="utf-8").strip() if context_txt.exists() else ""
157
+
158
+ rows.append({
159
+ "identity": ident,
160
+ "samples": len(attrs),
161
+ "gender": most_common(genders),
162
+ "age_bucket": most_common(ages),
163
+ "identity_context": identity_context,
164
+ })
165
+
166
+ log.info("Processed %d attributes for %s", len(attrs), ident)
167
+
168
+ # Save the CSV
169
+ out_csv.parent.mkdir(parents=True, exist_ok=True)
170
+ with out_csv.open("w", newline='', encoding="utf-8") as f:
171
+ fieldnames = list(rows[0].keys()) if rows else ["identity", "identity_context"]
172
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
173
+ writer.writeheader()
174
+ writer.writerows(rows)
175
+
176
+ log.info("CSV generated successfully: %s", out_csv)
177
+ return len(rows)
178
+
179
+ # ============================ 3) INDEX VOICES =================================
180
+ from pydub import AudioSegment # add at the top of the file with the other imports
181
+
182
+ def build_voices_index(voices_dir: Path, client, collection_name: str = "index_voices", drop: bool = True) -> int:
183
+ if collection_name in [c.name for c in client.list_collections()] and drop:
184
+ client.delete_collection(name=collection_name)
185
+ col = client.get_or_create_collection(name=collection_name)
186
+
187
+ ve = VoiceEmbedder()
188
+ count = 0
189
+
190
+ for ident_dir in sorted(Path(voices_dir).iterdir() if Path(voices_dir).exists() else []):
191
+ if not ident_dir.is_dir():
192
+ continue
193
+ ident = ident_dir.name
194
+ for wav_path in list_files(ident_dir, AUD_EXT):
195
+ # Try to embed directly
196
+ try:
197
+ emb = ve.embed(wav_path)
198
+ except Exception as e:
199
+ log.warning("Error reading audio %s: %s. Trying to reconvert...", wav_path, e)
200
+ # Automatic reconversion to PCM WAV
201
+ try:
202
+ audio = AudioSegment.from_file(wav_path)
203
+ fixed_path = wav_path.with_name(wav_path.stem + "_fixed.wav")
204
+ audio.export(fixed_path, format="wav")
205
+ log.info("File converted to a compatible WAV: %s", fixed_path)
206
+ emb = ve.embed(fixed_path)
207
+ except Exception as e2:
208
+ log.error("Could not generate an embedding after reconversion for %s: %s", wav_path, e2)
209
+ continue # skip this file
210
+ if emb is None:
211
+ log.warning("No voice embedding in %s", wav_path)
212
+ continue
213
+ uid = str(uuid.uuid4())
214
+ col.add(ids=[uid], embeddings=[emb], metadatas=[{"identity": ident, "path": str(wav_path)}])
215
+ count += 1
216
+
217
+ log.info("index_voices => %d embeddings", count)
218
+ return count
219
+
220
+ # ============================ 4) SCENARIOS ==================================
221
+ @dataclass
222
+ class VisionClient:
223
+ provider: str = "none" # placeholder to plug in an LLM if desired
224
+
225
+ def describe(self, image_path: str, prompt: str) -> str:
226
+ return (f"Automatic description (placeholder) for {Path(image_path).name}. "
227
+ f"{prompt}")
228
+
229
+
230
+ class TextEmbedder:
231
+ """Text embeddings with Sentence-Transformers if available; fallback to TF-IDF."""
232
+ def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
233
+ self.kind = "tfidf"; self.model = None; self.vectorizer = None
234
+ try:
235
+ from sentence_transformers import SentenceTransformer
236
+ self.model = SentenceTransformer(model_name)
237
+ self.kind = "sbert"
238
+ except Exception:
239
+ from sklearn.feature_extraction.text import TfidfVectorizer
240
+ self.vectorizer = TfidfVectorizer(max_features=768)
241
+
242
+ def fit(self, texts: List[str]):
243
+ if self.vectorizer is not None:
244
+ self.vectorizer.fit(texts)
245
+
246
+ def encode(self, texts: List[str]) -> List[List[float]]:
247
+ if self.model is not None:
248
+ arr = self.model.encode(texts, convert_to_numpy=True)
249
+ return arr.astype(float).tolist()
250
+ X = self.vectorizer.transform(texts) if self.vectorizer is not None else None
251
+ return (X.toarray().astype(float).tolist() if X is not None else [[0.0]*128 for _ in texts])
252
+
253
+
254
+ def build_scenarios_descriptions(scenarios_dir: Path, out_csv: Path, vision: VisionClient,
255
+ sample_per_scenario: int = 12) -> Tuple[int, List[Dict[str, Any]]]:
256
+ rows: List[Dict[str, Any]] = []
257
+ for scen_dir in sorted(Path(scenarios_dir).iterdir() if Path(scenarios_dir).exists() else []):
258
+ if not scen_dir.is_dir():
259
+ continue
260
+ scen = scen_dir.name
261
+ descs: List[str] = []
262
+ imgs = list_files(scen_dir, IMG_EXT)[:sample_per_scenario]
263
+ for img in imgs:
264
+ d = vision.describe(str(img), prompt="Describe location, time period, lighting, and atmosphere without mentioning people or time of day.")
265
+ if d:
266
+ descs.append(d)
267
+ if not descs:
268
+ descs = [f"Scenario {scen} (no images)"]
269
+ rows.append({"scenario": scen, "descriptions": " \n".join(descs)})
270
+
271
+ out_csv.parent.mkdir(parents=True, exist_ok=True)
272
+ with out_csv.open("w", newline='', encoding="utf-8") as f:
273
+ w = csv.DictWriter(f, fieldnames=["scenario", "descriptions"])
274
+ w.writeheader(); w.writerows(rows)
275
+ log.info("scenarios_descriptions => %s", out_csv)
276
+ return len(rows), rows
277
+
278
+
279
+ def build_scenarios_index(client, rows: List[Dict[str, Any]], embedder: TextEmbedder,
280
+ collection_name: str = "index_scenarios", drop: bool = True) -> int:
281
+ texts = [r["descriptions"] for r in rows]
282
+ embedder.fit(texts)
283
+ embs = embedder.encode(texts)
284
+
285
+ if collection_name in [c.name for c in client.list_collections()] and drop:
286
+ client.delete_collection(name=collection_name)
287
+ col = client.get_or_create_collection(name=collection_name)
288
+
289
+ for r, e in zip(rows, embs):
290
+ col.add(ids=[r["scenario"]], embeddings=[e], metadatas=[{"scenario": r["scenario"]}])
291
+ log.info("index_scenarios => %d descriptions", len(rows))
292
+ return len(rows)
293
+
294
+ # ================================ CLI ========================================
295
+
296
+ def main():
297
+ ap = argparse.ArgumentParser(description="Veureu — Build identity/scenario indices and CSVs")
298
+ ap.add_argument('--faces_dir', default='identities/faces', help='Root directory of face images per identity')
299
+ ap.add_argument('--voices_dir', default='identities/voices', help='Root directory of voice clips per identity')
300
+ ap.add_argument('--scenarios_dir', default='scenarios', help='Root directory of scenario folders with images')
301
+ ap.add_argument('--db_dir', default='chroma_db', help='ChromaDB persistence directory')
302
+ ap.add_argument('--out_dir', default='results', help='Output directory for CSVs')
303
+ ap.add_argument('--drop_collections', action='store_true', help='Delete collections if they exist before rebuilding')
304
+ ap.add_argument('--deepface_model', default='Facenet512', help='DeepFace model to use as fallback')
305
+ ap.add_argument('--scenario_samples', type=int, default=12, help='Number of images per scenario to describe')
306
+
307
+ args = ap.parse_args()
308
+
309
+ faces_dir = Path(args.faces_dir)
310
+ voices_dir = Path(args.voices_dir)
311
+ print(voices_dir)
312
+ scenarios_dir = Path(args.scenarios_dir)
313
+ out_dir = Path(args.out_dir); out_dir.mkdir(parents=True, exist_ok=True)
314
+
315
+ client = ensure_chroma(Path(args.db_dir))
316
+
317
+ # 1) Faces index
318
+ build_faces_index(faces_dir, client, collection_name="index_faces", deepface_model=args.deepface_model, drop=args.drop_collections)
319
+
320
+ # 2) Identity features CSV
321
+ #id_csv = out_dir / 'identity_features.csv'
322
+ #aggregate_face_attributes(faces_dir, id_csv)
323
+
324
+ # 3) Voices index
325
+ build_voices_index(voices_dir, client, collection_name="index_voices", drop=args.drop_collections)
326
+
327
+ # 4) Scenarios descriptions
328
+ #vision = VisionClient()
329
+ #scen_csv = out_dir / 'scenarios_descriptions.csv'
330
+ #_, scen_rows = build_scenarios_descriptions(scenarios_dir, scen_csv, vision, sample_per_scenario=args.scenario_samples)
331
+
332
+ # 5) Scenarios index
333
+ #embedder = TextEmbedder()
334
+ #build_scenarios_index(client, scen_rows, embedder, collection_name="index_scenarios", drop=args.drop_collections)
335
+
336
+ log.info("✅ Identity encoding completed.")
337
+
338
+
339
+ if __name__ == '__main__' and '--video' not in sys.argv:
340
+ main()
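A programmatic sketch of the same casting-loading flow the CLI drives; the directory layout follows the header comment (one sub-folder per identity under identities/faces and identities/voices), the paths are placeholders, and it assumes the `libs/` modules imported above are available.

```python
# Sketch: build the face and voice indices without going through argparse.
# Assumes libs/ is on PYTHONPATH and chromadb is installed.
from pathlib import Path

from casting_loader import ensure_chroma, build_faces_index, build_voices_index

client = ensure_chroma(Path("chroma_db"))
n_faces = build_faces_index(Path("identities/faces"), client,
                            collection_name="index_faces", drop=True)
n_voices = build_voices_index(Path("identities/voices"), client,
                              collection_name="index_voices", drop=True)
print(f"Indexed {n_faces} face embeddings and {n_voices} voice embeddings")
```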
config.yaml ADDED
@@ -0,0 +1,171 @@
+ # ===========================
+ # Veureu Engine – config.yaml
+ # ===========================
+
+ engine:
+ # Artifact output
+ output_root: "results"
+ # Persistence of the vector indices
+ database:
+ enabled: true
+ persist_directory: "chroma_db"
+ enable_face_recognition: true
+ enable_voice_recognition: true
+ face_collection: "index_faces"
+ voice_collection: "index_voices"
+
+ # Asynchronous jobs (if you implement the queue pattern)
+ jobs:
+ enabled: true
+ max_workers: 1 # Adjust to the Space's resources
+ result_ttl_seconds: 86400 # 1 day
+
+ api:
+ cors_allow_origins: ["*"]
+ # Maximum time (seconds) for a synchronous request (if you use the sync endpoint)
+ sync_timeout_seconds: 3600
+
+ video_processing:
+ # Frame-extraction metadata
+ keyframes:
+ # If your conditional extractor uses thresholds, you can include them here:
+ conditional_extraction:
+ enable: true
+ # example parameters (adjust them to your actual extractor)
+ min_scene_length_seconds: 1.5
+ difference_threshold: 28.0
+
+ frames_per_second:
+ enable: true
+ fps: 1.0 # "analysis frame rate" (1 frame/second by default)
+
+ ocr:
+ engine: "tesseract" # or "easyocr"
+ language_hint: "spa" # if applicable
+ # Only if you use pytesseract:
+ tesseract_cmd: "" # binary path if it is not on PATH
+
+ faces:
+ detector_model: "mtcnn" # example; adjust it to your vision_tools
+ embedding_model: "Facenet512" # used in background_descriptor.FaceOfImageEmbedding
+ min_face_size: 32
+ detection_confidence: 0.85
+
+ ocr_clustering:
+ method: "sequential_similarity"
+ sentence_transformer: "all-MiniLM-L6-v2"
+ similarity_threshold: 0.60 # "number of clusters" implied by the threshold (higher ⇒ fewer clusters)
+
+ audio_processing:
+ # The main ASR will be remote if you select whisper catalan (see models/routing)
+ diarization:
+ enabled: true
+ # example diarization parameters if you use them in audio_tools
+ min_speaker_duration: 0.8
+ max_speakers: 8
+
+ speaker_embedding:
+ enabled: true
+ # threshold for identity assignment in voice_collection
+ speaker_identification:
+ distance_threshold: 0.40
+
+ # If you keep local transcription for other languages/models:
+ local_asr:
+ enabled: false # remote will be used for whisper catalan
+ model: "" # (empty because it is managed via Spaces)
+
+ background_descriptor:
+ # Montage and LLM-description parameters
+ montage:
+ enable: true
+ max_frames: 12 # cap on frames in the collage/description
+ grid: "auto" # or 3x4, etc.
+
+ description:
+ model: "salamandra-vision" # can be "salamandra-vision" or "gpt-4o-mini"
+ max_tokens: 512
+ temperature: 0.2
+ # If identities are detected, they are passed as hints (your pipeline already does this)
+
+ identity:
+ # Temporal-mapping and enrichment rules
+ timeline_mapping:
+ per_second_frames_source: "frames_per_second"
+ attach_faces_to:
+ - "keyframes"
+ - "audio_segments"
+ out_key: "persona"
+
+ narration:
+ model: "salamandra-instruct" # "salamandra-instruct" | "gpt-4o-mini"
+ une_guidelines_path: "UNE_153010.txt"
+ # Timing constraints (for UNE-153010)
+ timing:
+ max_ad_duration_ratio: 0.60 # fraction of the available gap that the AD may occupy
+ min_gap_seconds: 1.20
+ min_ad_seconds: 0.80
+ llm:
+ max_tokens: 1024
+ temperature: 0.2
+
+ models:
+ # High-level model selection per task
+ instruct: "salamandra-instruct" # for NarrationSystem and other text
+ vision: "salamandra-vision" # for describing frames/montages
+ tools: "salamandra-tools" # if you need functions with tool-calling
+ asr: "whisper-catalan" # Catalan ASR
+
+ # Routing: which models run REMOTELY (via other Spaces)
+ routing:
+ use_remote_for:
+ - "salamandra-instruct"
+ - "salamandra-vision"
+ - "salamandra-tools"
+ - "whisper-catalan"
+
+ remote_spaces:
+ # Where to call when models.routing decides "remote"
+ user: "veureu"
+
+ endpoints:
+ # Note: fill in the real URLs when you publish the Spaces.
+ salamandra-instruct:
+ space: "schat"
+ base_url: "https://veureu-schat.hf.space"
+ client: "gradio" # "gradio" or "http"
+ predict_route: "/run/predict" # not needed if you use gradio_client
+
+ salamandra-vision:
+ space: "svision"
+ base_url: "https://veureu-svision.hf.space"
+ client: "gradio"
+ predict_route: "/run/predict"
+
+ salamandra-tools:
+ space: "stools"
+ base_url: "https://veureu-stools.hf.space"
+ client: "gradio"
+ predict_route: "/run/predict"
+
+ whisper-catalan:
+ space: "ars"
+ base_url: "https://veureu-ars.hf.space"
+ client: "gradio"
+ predict_route: "/run/predict"
+
+ # Network and robustness parameters
+ http:
+ timeout_seconds: 120
+ retries: 3
+ backoff_seconds: 2.0
+
+ security:
+ # If you need to pass tokens (e.g., Hub tokens or your own auth)
+ use_hf_token: true
+ hf_token_env: "HF_TOKEN" # name of the environment variable for the token
+ allow_insecure_tls: false
+
+ logging:
+ level: "INFO" # DEBUG | INFO | WARNING | ERROR
+ json: false
identity_manager.py ADDED
@@ -0,0 +1,125 @@
1
+ # =========================
2
+ # File: identity_manager.py
3
+ # =========================
4
+ from __future__ import annotations
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+
7
+ from chromadb.api.types import Collection
8
+
9
+
10
+ class IdentityManager:
11
+ """
12
+ Encapsulates all identity-assignment logic (faces + voices)
13
+ and its projection onto frames, clips and SRT.
14
+ """
15
+
16
+ def __init__(self, face_collection: Optional[Collection] = None, voice_collection: Optional[Collection] = None):
17
+ self.face_collection = face_collection
18
+ self.voice_collection = voice_collection
19
+
20
+ # --------------------------- Faces / Frames ---------------------------
21
+ def assign_faces_to_frames(
22
+ self,
23
+ frames: List[Dict[str, Any]],
24
+ ) -> List[Dict[str, Any]]:
25
+ """
26
+ `frames` is a list of dicts with at least: {image_path, start, end, faces:[{embedding?, bbox?}]}
27
+ Returns the same frames with `faces` enriched with `identity` and `distance` when a DB is available.
28
+ """
29
+ if self.face_collection is None:
30
+ return frames
31
+
32
+ out = []
33
+ for fr in frames:
34
+ faces = fr.get("faces") or []
35
+ enr: List[Dict[str, Any]] = []
36
+ for f in faces:
37
+ emb = f.get("embedding") or f.get("vector")
38
+ if not emb:
39
+ enr.append(f)
40
+ continue
41
+ try:
42
+ q = self.face_collection.query(query_embeddings=[emb], n_results=1, include=["metadatas", "distances"]) # type: ignore
43
+ metas = q.get("metadatas", [[]])[0]
44
+ dists = q.get("distances", [[]])[0]
45
+ if metas:
46
+ md = metas[0] or {}
47
+ f = dict(f)
48
+ f["identity"] = md.get("identity") or md.get("name")
49
+ if dists:
50
+ f["distance"] = float(dists[0])
51
+ except Exception:
52
+ pass
53
+ enr.append(f)
54
+ fr2 = dict(fr)
55
+ fr2["faces"] = enr
56
+ out.append(fr2)
57
+ return out
58
+
59
+ # --------------------------- Voices / Segments ------------------------
60
+ def assign_voices_to_segments(
61
+ self,
62
+ audio_segments: List[Dict[str, Any]],
63
+ distance_threshold: Optional[float] = None,
64
+ ) -> List[Dict[str, Any]]:
65
+ """
66
+ Adds `voice_vecinos` and `voice_identity` to each segment if a voice collection is available.
67
+ """
68
+ if self.voice_collection is None:
69
+ return audio_segments
70
+
71
+ out = []
72
+ for a in audio_segments:
73
+ emb = a.get("voice_embedding")
74
+ if not emb:
75
+ out.append(a)
76
+ continue
77
+ try:
78
+ q = self.voice_collection.query(query_embeddings=[emb], n_results=3, include=["metadatas", "distances"]) # type: ignore
79
+ metas = q.get("metadatas", [[]])[0]
80
+ dists = q.get("distances", [[]])[0]
81
+ vecinos = []
82
+ top_id = None
83
+ top_dist = None
84
+ for m, d in zip(metas, dists):
85
+ name = (m or {}).get("identity") or (m or {}).get("name")
86
+ vecinos.append({"identity": name, "distance": float(d)})
87
+ if top_id is None:
88
+ top_id, top_dist = name, float(d)
89
+ a2 = dict(a)
90
+ a2["voice_vecinos"] = vecinos
91
+ if top_id is not None:
92
+ if distance_threshold is None or (top_dist is not None and top_dist <= distance_threshold):
93
+ a2["voice_identity"] = top_id
94
+ out.append(a2)
95
+ except Exception:
96
+ out.append(a)
97
+ return out
98
+
99
+ # --------------------------- Map to SRT/Timelines ---------------------
100
+ @staticmethod
101
+ def map_identities_over_ranges(
102
+ per_second_frames: List[Dict[str, Any]],
103
+ ranges: List[Dict[str, Any]],
104
+ key: str = "faces",
105
+ out_key: str = "persona",
106
+ ) -> List[Dict[str, Any]]:
107
+ """
108
+ For each time range (keyframes, audio_segments, etc.), aggregates who appears according to the per-second frames.
109
+ """
110
+ out: List[Dict[str, Any]] = []
111
+ for rng in ranges:
112
+ s, e = float(rng.get("start", 0.0)), float(rng.get("end", 0.0))
113
+ present = []
114
+ for fr in per_second_frames:
115
+ fs, fe = float(fr.get("start", 0.0)), float(fr.get("end", 0.0))
116
+ if fe <= s or fs >= e:
117
+ continue
118
+ for f in fr.get(key) or []:
119
+ ident = f.get("identity")
120
+ if ident and ident not in present:
121
+ present.append(ident)
122
+ r2 = dict(rng)
123
+ r2[out_key] = present
124
+ out.append(r2)
125
+ return out
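For reference, a minimal sketch of how these helpers chain together. The frame/segment shapes below are illustrative only (embedding sizes and keys are assumptions); real inputs come from vision_tools and audio_tools, and the Chroma collections would normally be passed in.

    from identity_manager import IdentityManager

    # Illustrative inputs with the minimal keys the methods above read.
    per_second = [{"start": 0.0, "end": 1.0, "faces": [{"embedding": [0.1] * 128}]}]
    segments = [{"start": 0.0, "end": 4.0, "voice_embedding": [0.2] * 192}]

    im = IdentityManager(face_collection=None, voice_collection=None)  # pass Chroma collections when available
    per_second = im.assign_faces_to_frames(per_second)
    segments = im.assign_voices_to_segments(segments, distance_threshold=0.6)
    segments = im.map_identities_over_ranges(per_second, segments, key="faces", out_key="persona")
    print(segments[0]["persona"])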
llm_router.py ADDED
@@ -0,0 +1,67 @@
1
+ # ============================
2
+ # File: llm_router.py
3
+ # ============================
4
+ from __future__ import annotations
5
+ import os  # moved below the __future__ import, which must be the first statement in the module
6
+ from typing import Any, Dict, List, Optional
7
+ from pathlib import Path
8
+ import yaml
9
+
10
+ from remote_clients import InstructClient, VisionClient, ToolsClient, ASRClient
11
+
12
+
13
+ def load_yaml(path: str) -> Dict[str, Any]:
14
+ p = Path(path)
15
+ if not p.exists():
16
+ return {}
17
+ return yaml.safe_load(p.read_text(encoding="utf-8")) or {}
18
+
19
+
20
+ class LLMRouter:
21
+ def __init__(self, cfg: Dict[str, Any]):
22
+ self.cfg = cfg
23
+ self.rem = cfg.get("models", {}).get("routing", {}).get("use_remote_for", [])
24
+ base_user = cfg.get("remote_spaces", {}).get("user", "veureu")
25
+ eps = cfg.get("remote_spaces", {}).get("endpoints", {})
26
+ token_enabled = cfg.get("security", {}).get("use_hf_token", False)
27
+ hf_token = os.getenv(cfg.get("security", {}).get("hf_token_env", "HF_TOKEN")) if token_enabled else None
28
+
29
+ def mk(endpoint_key: str, cls):
30
+ info = eps.get(endpoint_key, {})
31
+ base_url = info.get("base_url") or f"https://{base_user}-{info.get('space')}.hf.space"
32
+ client = cls(base_url=base_url, use_gradio=(info.get("client", "gradio") == "gradio"), hf_token=hf_token,
33
+ timeout=int(cfg.get("remote_spaces", {}).get("http", {}).get("timeout_seconds", 120)))
34
+ return client
35
+
36
+ self.clients = {
37
+ "salamandra-instruct": mk("salamandra-instruct", InstructClient),
38
+ "salamandra-vision": mk("salamandra-vision", VisionClient),
39
+ "salamandra-tools": mk("salamandra-tools", ToolsClient),
40
+ "whisper-catalan": mk("whisper-catalan", ASRClient),
41
+ }
42
+
43
+ # ---- INSTRUCT ----
44
+ def instruct(self, prompt: str, system: Optional[str] = None, model: str = "salamandra-instruct", **kwargs) -> str:
45
+ if model in self.rem:
46
+ return self.clients[model].generate(prompt, system=system, **kwargs) # type: ignore
47
+ # Local fallback (e.g. gpt-4o-mini or gpt-oss through a local API, if one exists)
48
+ # An OpenAI-compatible API could be integrated here.
49
+ raise RuntimeError(f"Modelo local no implementado para: {model}")
50
+
51
+ # ---- VISION ----
52
+ def vision_describe(self, image_paths: List[str], context: Optional[Dict[str, Any]] = None, model: str = "salamandra-vision", **kwargs) -> List[str]:
53
+ if model in self.rem:
54
+ return self.clients[model].describe(image_paths, context=context, **kwargs) # type: ignore
55
+ raise RuntimeError(f"Modelo local no implementado para: {model}")
56
+
57
+ # ---- TOOLS ----
58
+ def chat_with_tools(self, messages: List[Dict[str, str]], tools: Optional[List[Dict[str, Any]]] = None, model: str = "salamandra-tools", **kwargs) -> Dict[str, Any]:
59
+ if model in self.rem:
60
+ return self.clients[model].chat(messages, tools=tools, **kwargs) # type: ignore
61
+ raise RuntimeError(f"Modelo local no implementado para: {model}")
62
+
63
+ # ---- ASR ----
64
+ def asr_transcribe(self, audio_path: str, model: str = "whisper-catalan", **kwargs) -> Dict[str, Any]:
65
+ if model in self.rem:
66
+ return self.clients[model].transcribe(audio_path, **kwargs) # type: ignore
67
+ raise RuntimeError(f"Modelo local no implementado para: {model}")
main_api.py ADDED
@@ -0,0 +1,117 @@
1
+ # ============================
2
+ # PATCH: main_api.py (FastAPI) – refine_narration via remote instruct when configured
3
+ # ============================
4
+ from __future__ import annotations
5
+ from fastapi import FastAPI, UploadFile, File, Form
6
+ from fastapi.responses import JSONResponse
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from pathlib import Path
9
+ import shutil
10
+ import uvicorn
11
+ import json
12
+
13
+ from video_processing import process_video_pipeline
14
+ from casting_loader import ensure_chroma, build_faces_index, build_voices_index
15
+ from narration_system import NarrationSystem
16
+ from llm_router import load_yaml, LLMRouter
17
+
18
+ app = FastAPI(title="Veureu Engine API", version="0.2.0")
19
+ app.add_middleware(
20
+ CORSMiddleware,
21
+ allow_origins=["*"],
22
+ allow_credentials=True,
23
+ allow_methods=["*"],
24
+ allow_headers=["*"],
25
+ )
26
+
27
+ ROOT = Path("/tmp/veureu"); ROOT.mkdir(parents=True, exist_ok=True)
28
+
29
+
30
+ @app.get("/")
31
+ def root():
32
+ return {"ok": True, "service": "veureu-engine"}
33
+
34
+
35
+ @app.post("/process_video")
36
+ async def process_video(
37
+ video_file: UploadFile = File(...),
38
+ config_path: str = Form("config.yaml"),
39
+ out_root: str = Form("results"),
40
+ db_dir: str = Form("chroma_db"),
41
+ ):
42
+ tmp_video = ROOT / video_file.filename
43
+ with tmp_video.open("wb") as f:
44
+ shutil.copyfileobj(video_file.file, f)
45
+
46
+ result = process_video_pipeline(str(tmp_video), config_path=config_path, out_root=out_root, db_dir=db_dir)
47
+ return JSONResponse(result)
48
+
49
+
50
+ @app.post("/load_casting")
51
+ async def load_casting(
52
+ faces_dir: str = Form("identities/faces"),
53
+ voices_dir: str = Form("identities/voices"),
54
+ db_dir: str = Form("chroma_db"),
55
+ drop_collections: bool = Form(False),
56
+ ):
57
+ client = ensure_chroma(Path(db_dir))
58
+ n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
59
+ n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
60
+ return {"ok": True, "faces": n_faces, "voices": n_voices}
61
+
62
+
63
+ @app.post("/refine_narration")
64
+ async def refine_narration(
65
+ dialogues_srt: str = Form(...),
66
+ frame_descriptions_json: str = Form("[]"),
67
+ config_path: str = Form("config.yaml"),
68
+ ):
69
+ cfg = load_yaml(config_path)
70
+ frames = json.loads(frame_descriptions_json)
71
+
72
+ # If the instruct model is configured as remote, use the router; otherwise fall back to the existing NarrationSystem
73
+ model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
74
+ use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
75
+
76
+ if use_remote:
77
+ router = LLMRouter(cfg)
78
+ # Simplified refinement implementation using the remote instruct model.
79
+ # Keep the prompt logic aligned with NarrationSystem if 1:1 behaviour is desired.
80
+ system_msg = (
81
+ "Eres un sistema de audiodescripción que cumple UNE-153010. "
82
+ "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
83
+ "Devuelve JSON con {narrative_text, srt_text}."
84
+ )
85
+ prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
86
+ try:
87
+ txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
88
+ out = {}
89
+ try:
90
+ out = json.loads(txt)
91
+ except Exception:
92
+ out = {"narrative_text": txt, "srt_text": ""}
93
+ return {
94
+ "narrative_text": out.get("narrative_text", ""),
95
+ "srt_text": out.get("srt_text", ""),
96
+ "approved": True,
97
+ "critic_feedback": "",
98
+ }
99
+ except Exception as e:
100
+ # Fall back to the local NarrationSystem if the remote call fails
101
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
102
+ res = ns.run(dialogues_srt, frames)
103
+ return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}
104
+
105
+ # Local path (uses the existing NarrationSystem)
106
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
107
+ out = ns.run(dialogues_srt, frames)
108
+ return {
109
+ "narrative_text": out.narrative_text,
110
+ "srt_text": out.srt_text,
111
+ "approved": out.approved,
112
+ "critic_feedback": out.critic_feedback,
113
+ }
114
+
115
+
116
+ if __name__ == "__main__":
117
+ uvicorn.run(app, host="0.0.0.0", port=7860)
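For reference, a minimal sketch of calling these endpoints once the Space is running; the base URL, paths and sample payload are placeholders.

    import json
    import requests

    BASE = "http://localhost:7860"  # placeholder; use the Space URL in production

    # Build/refresh the face and voice indices.
    r = requests.post(f"{BASE}/load_casting",
                      data={"faces_dir": "identities/faces", "voices_dir": "identities/voices"},
                      timeout=600)
    print(r.json())

    # Refine a narration from an existing dialogues SRT plus frame descriptions.
    payload = {
        "dialogues_srt": "1\n00:00:01,000 --> 00:00:02,500\nHola!\n",
        "frame_descriptions_json": json.dumps([{"timestamp": "00:00:01,000",
                                                "description": "Una dona saluda."}]),
        "config_path": "config.yaml",
    }
    print(requests.post(f"{BASE}/refine_narration", data=payload, timeout=600).json())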
narration_system.py ADDED
@@ -0,0 +1,185 @@
1
+ # narration_system.py
2
+ from __future__ import annotations
3
+ from typing import Dict, List, Any
4
+ from langgraph.graph import StateGraph, END
5
+ from langchain_openai import ChatOpenAI
6
+ from langchain_core.prompts import ChatPromptTemplate
7
+ from dataclasses import dataclass
8
+ import json
9
+ import time
10
+
11
+
12
+ @dataclass
13
+ class NarratorInput:
14
+ dialogues_srt: str
15
+ frame_descriptions: List[Dict[str, Any]] # [{"timestamp": "00:01:23,000", "description": "..."}]
16
+ une_guidelines_path: str
17
+ max_cycles: int = 3
18
+
19
+
20
+ @dataclass
21
+ class NarratorOutput:
22
+ narrative_text: str
23
+ srt_text: str
24
+ critic_feedback: str | None = None
25
+ approved: bool = False
26
+
27
+
28
+ class NarrationSystem:
29
+ """
30
+ LangGraph-based multi-agent system:
31
+ - NarratorNode: generates narration + SRT according to UNE-153010
32
+ - CriticNode: evaluates conformity with UNE and coherence
33
+ - IdentityManagerNode: adjusts character identification if needed
34
+ - BackgroundDescriptorNode: fixes background/scene coherence
35
+ """
36
+
37
+ def __init__(self, model_url: str, une_guidelines_path: str):
38
+ self.model_url = model_url
39
+ self.une_guidelines_path = une_guidelines_path
40
+
41
+ # LLM endpoints (each node could use a different deployment if desired)
42
+ self.narrator_llm = ChatOpenAI(base_url=model_url, model="gpt-4o-mini", temperature=0.6)
43
+ self.critic_llm = ChatOpenAI(base_url=model_url, model="gpt-4o-mini", temperature=0.3)
44
+ self.identity_llm = ChatOpenAI(base_url=model_url, model="gpt-4o-mini", temperature=0.4)
45
+ self.background_llm = ChatOpenAI(base_url=model_url, model="gpt-4o-mini", temperature=0.4)
46
+
47
+ with open(une_guidelines_path, "r", encoding="utf-8") as f:
48
+ self.une_rules = f.read()
49
+
50
+ # Build LangGraph workflow
51
+ self.graph = self.build_graph()
52
+
53
+ # -----------------------------------------------------------
54
+ # LangGraph nodes
55
+ # -----------------------------------------------------------
56
+
57
+ def narrator_node(self, state):
58
+ dialogues = state["dialogues_srt"]
59
+ frames = state["frame_descriptions"]
60
+
61
+ prompt = ChatPromptTemplate.from_template("""
62
+ Eres un narrador de audiodescripciones según la norma UNE-153010.
63
+ Combina coherentemente los diálogos del siguiente SRT con las descripciones de escena dadas.
64
+
65
+ Sigue estas pautas:
66
+ - Genera una narración libre que integre ambos tipos de información.
67
+ - Evita redundancias o descripciones triviales.
68
+ - Limita la duración de las audiodescripciones para que quepan entre los diálogos.
69
+ - Devuelve **dos bloques**:
70
+ 1️⃣ `NARRATION_TEXT`: narración libre completa en texto continuo.
71
+ 2️⃣ `UNE_SRT`: subtítulos con los diálogos y las audiodescripciones UNE.
72
+
73
+ ## DIÁLOGOS SRT
74
+ {dialogues}
75
+
76
+ ## DESCRIPCIONES DE FRAMES
77
+ {frames}
78
+ """)
79
+
80
+ response = self.narrator_llm.invoke(prompt.format(dialogues=dialogues, frames=json.dumps(frames, ensure_ascii=False)))
81
+ return {"narration": response.content, "critic_feedback": None, "approved": False}
82
+
83
+ def critic_node(self, state):
84
+ narration = state["narration"]
85
+ prompt = ChatPromptTemplate.from_template("""
86
+ Actúa como un revisor experto en audiodescripción conforme a la norma UNE-153010.
87
+ Evalúa el siguiente texto y SRT generados, detectando:
88
+ - Incoherencias en asignación de personajes.
89
+ - Errores en la identificación de escenarios.
90
+ - Desviaciones respecto a la norma UNE-153010.
91
+ - Incoherencias narrativas generales.
92
+
93
+ Devuelve:
94
+ - "APPROVED" si el resultado es conforme.
95
+ - En caso contrario, una lista JSON con observaciones clasificadas en:
96
+ - "characters"
97
+ - "scenes"
98
+ - "norma"
99
+ - "coherence"
100
+
101
+ ## NORMA UNE-153010
102
+ {une_rules}
103
+
104
+ ## TEXTO Y SRT A EVALUAR
105
+ {narration}
106
+ """)
107
+
108
+ response = self.critic_llm.invoke(prompt.format(une_rules=self.une_rules, narration=narration))
109
+ text = response.content.strip()
110
+
111
+ if "APPROVED" in text.upper():
112
+ return {"critic_feedback": None, "approved": True}
113
+ return {"critic_feedback": text, "approved": False}
114
+
115
+ def identity_node(self, state):
116
+ fb = state.get("critic_feedback", "")
117
+ narration = state["narration"]
118
+ prompt = ChatPromptTemplate.from_template("""
119
+ El siguiente feedback señala incoherencias en personajes o diálogos.
120
+ Corrige únicamente esos aspectos manteniendo el resto igual.
121
+
122
+ ## FEEDBACK
123
+ {fb}
124
+
125
+ ## TEXTO ORIGINAL
126
+ {narration}
127
+ """)
128
+ response = self.identity_llm.invoke(prompt.format(fb=fb, narration=narration))
129
+ return {"narration": response.content}
130
+
131
+ def background_node(self, state):
132
+ fb = state.get("critic_feedback", "")
133
+ narration = state["narration"]
134
+ prompt = ChatPromptTemplate.from_template("""
135
+ El siguiente feedback señala incoherencias en escenarios o contexto visual.
136
+ Ajusta las descripciones de fondo manteniendo el estilo y duración UNE.
137
+
138
+ ## FEEDBACK
139
+ {fb}
140
+
141
+ ## TEXTO ORIGINAL
142
+ {narration}
143
+ """)
144
+ response = self.background_llm.invoke(prompt.format(fb=fb, narration=narration))
145
+ return {"narration": response.content}
146
+
147
+ # -----------------------------------------------------------
148
+ # Graph assembly
149
+ # -----------------------------------------------------------
150
+
151
+ def build_graph(self):
152
+ g = StateGraph()
153
+ g.add_node("NarratorNode", self.narrator_node)
154
+ g.add_node("CriticNode", self.critic_node)
155
+ g.add_node("IdentityManagerNode", self.identity_node)
156
+ g.add_node("BackgroundDescriptorNode", self.background_node)
157
+
158
+ g.set_entry_point("NarratorNode")
159
+ g.add_edge("NarratorNode", "CriticNode")
160
+ g.add_conditional_edges(
161
+ "CriticNode",
162
+ lambda state: "done" if state.get("approved") else "retry",
163
+ {
164
+ "done": END,
165
+ "retry": "IdentityManagerNode",
166
+ },
167
+ )
168
+ g.add_edge("IdentityManagerNode", "BackgroundDescriptorNode")
169
+ g.add_edge("BackgroundDescriptorNode", "CriticNode")
170
+
171
+ return g.compile()
172
+
173
+ # -----------------------------------------------------------
174
+ # Run loop
175
+ # -----------------------------------------------------------
176
+
177
+ def run(self, dialogues_srt: str, frame_descriptions: List[Dict[str, Any]], max_cycles: int = 3) -> NarratorOutput:
178
+ state = {"dialogues_srt": dialogues_srt, "frame_descriptions": frame_descriptions}
179
+ result = self.graph.invoke(state)
180
+ return NarratorOutput(
181
+ narrative_text=result.get("narration", ""),
182
+ srt_text=result.get("narration", ""), # could be parsed separately if model emits dual block
183
+ critic_feedback=result.get("critic_feedback"),
184
+ approved=result.get("approved", False),
185
+ )
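A minimal sketch of driving this graph end to end. The model URL, UNE guidelines file and sample inputs are placeholders, and an OpenAI-compatible endpoint reachable by ChatOpenAI is assumed.

    # Hypothetical usage; assumes an OpenAI-compatible endpoint and a local UNE_153010.txt file.
    ns = NarrationSystem(model_url="http://localhost:8000/v1", une_guidelines_path="UNE_153010.txt")

    dialogues_srt = "1\n00:00:01,000 --> 00:00:02,500\n- Bon dia!\n"
    frames = [{"timestamp": "00:00:00,500", "description": "Una dona entra en una cuina assolellada."}]

    result = ns.run(dialogues_srt, frames)
    print(result.approved)
    print(result.narrative_text)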
remote_clients.py ADDED
@@ -0,0 +1,96 @@
1
+ # ============================
2
+ # File: remote_clients.py
3
+ # ============================
4
+ from __future__ import annotations
5
+ from typing import Any, Dict, List, Optional
6
+ import os, json, time
7
+ import requests
8
+ from tenacity import retry, stop_after_attempt, wait_exponential
9
+
10
+ try:
11
+ from gradio_client import Client as GradioClient
12
+ except Exception: # pragma: no cover
13
+ GradioClient = None # type: ignore
14
+
15
+
16
+ class BaseRemoteClient:
17
+ def __init__(self, base_url: str, use_gradio: bool = True, hf_token: Optional[str] = None, timeout: int = 120):
18
+ self.base_url = base_url.rstrip("/")
19
+ self.use_gradio = use_gradio and GradioClient is not None
20
+ self.hf_token = hf_token or os.getenv("HF_TOKEN")
21
+ self.timeout = timeout
22
+ self._client = None
23
+ if self.use_gradio:
24
+ # GradioClient accepts the base_url of a public or private Space
25
+ headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else None
26
+ self._client = GradioClient(self.base_url, hf_token=self.hf_token, headers=headers)
27
+
28
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=8))
29
+ def _post_json(self, route: str, payload: Dict[str, Any]) -> Dict[str, Any]:
30
+ url = f"{self.base_url}{route}"
31
+ headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else {}
32
+ r = requests.post(url, json=payload, headers=headers, timeout=self.timeout)
33
+ r.raise_for_status()
34
+ return r.json()
35
+
36
+
37
+ class InstructClient(BaseRemoteClient):
38
+ """Cliente para Space de texto-instrucción (schat)."""
39
+ def generate(self, prompt: str, system: Optional[str] = None, **kwargs) -> str:
40
+ if self.use_gradio and self._client:
41
+ # Assumes a Gradio interface with a single text input that returns text.
42
+ # Adjust "predict" and its parameters to the actual Space (inputs/outputs).
43
+ out = self._client.predict(prompt, api_name="/predict")
44
+ return str(out)
45
+ # Generic HTTP fallback
46
+ data = {"prompt": prompt, "system": system, **kwargs}
47
+ res = self._post_json("/generate", data)
48
+ return res.get("text", "")
49
+
50
+
51
+ class VisionClient(BaseRemoteClient):
52
+ """Cliente para Space de visión (svision)."""
53
+ def describe(self, image_paths: List[str], context: Optional[Dict[str, Any]] = None, **kwargs) -> List[str]:
54
+ if self.use_gradio and self._client:
55
+ # Many Spaces expose a Gradio endpoint that accepts a list of images plus a JSON text field.
56
+ out = self._client.predict(image_paths, json.dumps(context or {}), api_name="/predict")
57
+ # Expected to return one description per image
58
+ if isinstance(out, str):
59
+ try:
60
+ return json.loads(out)
61
+ except Exception:
62
+ return [out]
63
+ return list(out)
64
+ data = {"images": image_paths, "context": context or {}, **kwargs}
65
+ res = self._post_json("/describe", data)
66
+ return res.get("descriptions", [])
67
+
68
+
69
+ class ToolsClient(BaseRemoteClient):
70
+ """Cliente para Space con tool-calling (stools)."""
71
+ def chat(self, messages: List[Dict[str, str]], tools: Optional[List[Dict[str, Any]]] = None, **kwargs) -> Dict[str, Any]:
72
+ if self.use_gradio and self._client:
73
+ out = self._client.predict(json.dumps(messages), json.dumps(tools or []), api_name="/predict")
74
+ if isinstance(out, str):
75
+ try:
76
+ return json.loads(out)
77
+ except Exception:
78
+ return {"text": out}
79
+ return out
80
+ data = {"messages": messages, "tools": tools or [], **kwargs}
81
+ return self._post_json("/chat", data)
82
+
83
+
84
+ class ASRClient(BaseRemoteClient):
85
+ """Cliente para Space de ASR catalán (ars)."""
86
+ def transcribe(self, audio_path: str, **kwargs) -> Dict[str, Any]:
87
+ if self.use_gradio and self._client:
88
+ out = self._client.predict(audio_path, api_name="/predict")
89
+ if isinstance(out, str):
90
+ return {"text": out}
91
+ return out
92
+ headers = {"Authorization": f"Bearer {self.hf_token}"} if self.hf_token else {}
93
+ with open(audio_path, "rb") as fh:  # ensure the file handle is closed after the upload
94
+ r = requests.post(f"{self.base_url}/transcribe", files={"file": fh}, data=kwargs, headers=headers, timeout=self.timeout)
95
+ r.raise_for_status()
96
+ return r.json()
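A minimal sketch of using these clients directly, outside the router. The Space URLs are placeholders, and the route/payload shapes follow the assumptions stated in the comments above (a `/predict` Gradio endpoint and a `/generate` HTTP route), not a confirmed Space API.

    # Hypothetical direct usage; base_url values are placeholder Spaces.
    asr = ASRClient(base_url="https://veureu-whisper-catalan.hf.space", use_gradio=True)
    result = asr.transcribe("clip.wav")
    print(result.get("text", ""))

    instruct = InstructClient(base_url="https://veureu-salamandra-instruct.hf.space", use_gradio=False)
    print(instruct.generate("Resumeix el text en una frase.", system="Ets un assistent concís."))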
requirements.txt CHANGED
@@ -1,6 +1,40 @@
1
- fastapi==0.115.0
 
2
  uvicorn[standard]==0.30.6
3
- pydantic==2.9.2
4
  python-multipart==0.0.9
5
- requests==2.32.3
6
- huggingface_hub==0.25.2
1
+ # API
2
+ fastapi==0.114.2
3
  uvicorn[standard]==0.30.6
 
4
  python-multipart==0.0.9
5
+ pydantic>=2.7,<3
6
+ PyYAML>=6.0
7
+ requests>=2.32
8
+ gradio_client>=0.16.0 # to call Spaces that expose a Gradio interface
9
+
10
+ # Core utils
11
+ numpy>=1.26
12
+ Pillow>=10.4
13
+ opencv-python-headless==4.10.0.84
14
+
15
+ # OCR (pick according to vision_tools)
16
+ pytesseract>=0.3 # if Tesseract is used
17
+ easyocr>=1.7 # if EasyOCR is preferred
18
+
19
+ # Audio / video (adjust according to audio_tools)
20
+ librosa>=0.10
21
+ soundfile>=0.12
22
+ pydub>=0.25
23
+ ffmpeg-python>=0.2
24
+ # (note: the ffmpeg binary is not installed via pip; if needed, use Docker or a runtime with ffmpeg installed)
25
+
26
+ # Lightweight ML for OCR clustering
27
+ scikit-learn>=1.5
28
+ sentence-transformers>=3.0
29
+ transformers>=4.44
30
+ # CPU Torch (for sentence-transformers). On CPU Spaces, install the CPU variant:
31
+ torch>=2.3,<3
32
+
33
+ # Vector DB
34
+ chromadb>=0.5.4
35
+
36
+ # (Optional) moviepy, if vision_tools uses it
37
+ moviepy>=2.0
38
+
39
+ # (Optional) robustness helpers
40
+ tenacity>=8.2
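Note that other files in this commit import packages that are not pinned above (among them langgraph and langchain-openai in narration_system.py, and scenedetect, torchaudio, pyannote.audio, speechbrain and deepface/face_recognition in vision_tools.py). A hedged sketch of the missing entries, left unpinned since suitable versions depend on the deployment:

    # Needed by narration_system.py
    langgraph
    langchain-openai
    # Needed by vision_tools.py (scene detection, face backends)
    scenedetect
    deepface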
scripts/client_example.py ADDED
@@ -0,0 +1,33 @@
1
+ # =====================================
2
+ # File: client_example.py (opcional)
3
+ # =====================================
4
+ import json
+ import requests
+ from pathlib import Path
5
+
6
+ class VeureuEngineClient:
7
+ def __init__(self, base_url: str):
8
+ self.base = base_url.rstrip("/")
9
+
10
+ def process_video(self, video_path: str, **kwargs):
11
+ with open(video_path, "rb") as f:
12
+ files = {"video_file": (Path(video_path).name, f, "video/mp4")}
13
+ data = {"config_path": kwargs.get("config_path", "config_veureu.yaml"),
14
+ "out_root": kwargs.get("out_root", "results"),
15
+ "db_dir": kwargs.get("db_dir", "chroma_db")}
16
+ r = requests.post(f"{self.base}/process_video", files=files, data=data, timeout=3600)
17
+ r.raise_for_status()
18
+ return r.json()
19
+
20
+ def load_casting(self, faces_dir: str, voices_dir: str, db_dir: str = "chroma_db", drop_collections: bool = False):
21
+ data = {"faces_dir": faces_dir, "voices_dir": voices_dir, "db_dir": db_dir, "drop_collections": str(drop_collections)}
22
+ r = requests.post(f"{self.base}/load_casting", data=data, timeout=600)
23
+ r.raise_for_status(); return r.json()
24
+
25
+ def refine_narration(self, dialogues_srt: str, frame_descriptions: list, config_path: str = "config.yaml"):
26
+ # The /refine_narration endpoint expects dialogues_srt, frame_descriptions_json and config_path (see main_api.py)
+ data = {
27
+ "dialogues_srt": dialogues_srt,
28
+ "frame_descriptions_json": json.dumps(frame_descriptions, ensure_ascii=False),
29
+ "config_path": config_path,
31
+ }
32
+ r = requests.post(f"{self.base}/refine_narration", data=data, timeout=600)
33
+ r.raise_for_status(); return r.json()
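A minimal usage sketch of this example client; the Space URL, media file and directories are placeholders.

    # Hypothetical usage of the client above.
    client = VeureuEngineClient("https://veureu-veureu-engine.hf.space")

    casting = client.load_casting("identities/faces", "identities/voices")
    print(casting)

    result = client.process_video("sample.mp4", out_root="results", db_dir="chroma_db")
    print(result["files"]["analysis_path"])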
video_processing.py ADDED
@@ -0,0 +1,141 @@
1
+ # ==================================
2
+ # File: video_processing_refactor.py
3
+ # (drop-in replacement for process_video_pipeline in video_processing.py)
4
+ # ==================================
5
+ from __future__ import annotations
6
+ from typing import Any, Dict, List, Optional
7
+ from pathlib import Path
8
+ import json
9
+ import cv2
10
+ import yaml
11
+ import logging
12
+
13
+ from chromadb.config import Settings
14
+ import chromadb
15
+
16
+ from audio_tools import process_audio_for_video
17
+ from background_descriptor import build_keyframes_and_per_second, describe_keyframes_with_llm
18
+ from identity_manager import IdentityManager
19
+
20
+
21
+ log = logging.getLogger("video_processing")
22
+ if not log.handlers:
23
+ h = logging.StreamHandler(); h.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
24
+ log.addHandler(h)
25
+ log.setLevel(logging.INFO)
26
+
27
+
28
+ def _ensure_dir(p: Path) -> Path:
29
+ p.mkdir(parents=True, exist_ok=True)
30
+ return p
31
+
32
+
33
+ def _ensure_chroma(db_dir: str | Path):
34
+ _ensure_dir(Path(db_dir))
35
+ # chromadb >= 0.4 removed the legacy "duckdb+parquet" Settings backend; with the pinned chromadb>=0.5.4 use PersistentClient
36
+ return chromadb.PersistentClient(path=str(db_dir), settings=Settings(anonymized_telemetry=False))
40
+
41
+
42
+ def load_config(path: str) -> Dict[str, Any]:
43
+ p = Path(path)
44
+ if not p.exists():
45
+ return {}
46
+ return yaml.safe_load(p.read_text(encoding="utf-8")) or {}
47
+
48
+
49
+ def process_video_pipeline(
50
+ video_path: str,
51
+ *,
52
+ config_path: str = "config_veureu.yaml",
53
+ out_root: str = "results",
54
+ db_dir: str = "chroma_db",
55
+ ) -> Dict[str, Any]:
56
+ cfg = load_config(config_path)
57
+ out_dir = _ensure_dir(Path(out_root) / Path(video_path).stem)
58
+
59
+ # Video metadata
60
+ cap = cv2.VideoCapture(str(video_path))
61
+ if not cap.isOpened():
62
+ raise RuntimeError(f"Cannot open video: {video_path}")
63
+ fps = float(cap.get(cv2.CAP_PROP_FPS)) or 25.0
64
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) or 0
65
+ duration = (total_frames / fps) if total_frames > 0 else 0.0
66
+ cap.release()
67
+
68
+ # Optional Chroma DB
69
+ face_col = voice_col = None
70
+ if cfg.get("database", {}).get("enabled", True):
71
+ client = _ensure_chroma(cfg.get("database", {}).get("persist_directory", db_dir))
72
+ if cfg.get("database", {}).get("enable_face_recognition", True):
73
+ try:
74
+ face_col = client.get_collection(cfg.get("database", {}).get("face_collection", "index_faces"))
75
+ except Exception:
76
+ face_col = None
77
+ if cfg.get("database", {}).get("enable_voice_recognition", True):
78
+ try:
79
+ voice_col = client.get_collection(cfg.get("database", {}).get("voice_collection", "index_voices"))
80
+ except Exception:
81
+ voice_col = None
82
+
83
+ # 1) Background descriptor (frames, OCR, descriptions)
84
+ keyframes, per_second, _ = build_keyframes_and_per_second(video_path, out_dir, cfg, face_collection=face_col)
85
+
86
+ # Adjust each keyframe's `end` using the total duration
87
+ for i in range(len(keyframes)):
88
+ if i < len(keyframes) - 1:
89
+ keyframes[i]["end"] = keyframes[i + 1]["start"]
90
+ else:
91
+ keyframes[i]["end"] = round(duration, 2)
92
+
93
+ # 2) LLM description
94
+ face_identities = {f.get("identity") for fr in per_second for f in (fr.get("faces") or []) if f.get("identity")}
95
+ keyframes, montage_path = describe_keyframes_with_llm(keyframes, out_dir, face_identities=face_identities, config_path=config_path)
96
+
97
+ # 3) Audio pipeline
98
+ audio_segments, srt_unmodified_path, full_transcription = process_audio_for_video(video_path=str(video_path), out_dir=out_dir, cfg=cfg, voice_collection=voice_col)
99
+
100
+ # 4) Identity manager: enriquecer frames y clips
101
+ im = IdentityManager(face_collection=face_col, voice_collection=voice_col)
102
+ per_second = im.assign_faces_to_frames(per_second)
103
+ keyframes = im.assign_faces_to_frames(keyframes)
104
+ audio_segments = im.assign_voices_to_segments(audio_segments, distance_threshold=cfg.get("voice_processing", {}).get("speaker_identification", {}).get("distance_threshold"))
105
+
106
+ # 5) Map identities onto time ranges
107
+ keyframes = im.map_identities_over_ranges(per_second, keyframes, key="faces", out_key="persona")
108
+ audio_segments = im.map_identities_over_ranges(per_second, audio_segments, key="faces", out_key="persona")
109
+
110
+ # 6) Export analysis.json
111
+ frames_analysis = [{
112
+ "frame_number": fr.get("id"),
113
+ "start": fr.get("start"),
114
+ "end": fr.get("end"),
115
+ "ocr": fr.get("ocr", ""),
116
+ "persona": fr.get("persona", []),
117
+ "description": fr.get("description", ""),
118
+ } for fr in keyframes]
119
+
120
+ analysis = {
121
+ "frames": frames_analysis,
122
+ "audio_segments": [{k: v for k, v in seg.items() if k != "voice_embedding"} for seg in audio_segments],
123
+ "full_transcription": full_transcription,
124
+ }
125
+ analysis_path = out_dir / f"{Path(video_path).stem}_analysis.json"
126
+ analysis_path.write_text(json.dumps(analysis, indent=2, ensure_ascii=False), encoding="utf-8")
127
+
128
+ return {
129
+ "output_dir": str(out_dir),
130
+ "files": {
131
+ "montage_path": montage_path,
132
+ "srt_path": srt_unmodified_path,
133
+ "analysis_path": str(analysis_path),
134
+ },
135
+ "stats": {
136
+ "duration_seconds": duration,
137
+ "total_frames": total_frames,
138
+ "frames_processed": len(keyframes),
139
+ "audio_segments_processed": len(audio_segments),
140
+ },
141
+ }
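A minimal sketch of running this pipeline directly from Python; the video path is a placeholder and the casting indices are assumed to already exist under `chroma_db`.

    # Hypothetical direct invocation of the pipeline above.
    if __name__ == "__main__":
        summary = process_video_pipeline(
            "sample.mp4",
            config_path="config_veureu.yaml",
            out_root="results",
            db_dir="chroma_db",
        )
        print(summary["stats"])
        print(summary["files"]["analysis_path"])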
vision_tools.py ADDED
@@ -0,0 +1,573 @@
1
+ # vision_tools.py
2
+ # -----------------------------------------------------------------------------
3
+ # Veureu — VISION utilities (self-contained)
4
+ # - Image processing and analysis
5
+ # - Object detection and recognition
6
+ # - Face detection and recognition
7
+ # - Scene description
8
+ # - Montage sequence analysis
9
+ # -----------------------------------------------------------------------------
10
+ from __future__ import annotations
11
+
12
+
13
+ import os
14
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
15
+
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Optional, Tuple
19
+
20
+ import json
21
+ import logging
22
+ import math
23
+ import os
24
+ import shlex
25
+ import subprocess
26
+
27
+ import numpy as np
28
+ import torch
29
+ import torchaudio
30
+ import torchaudio.transforms as T
31
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
32
+ from pyannote.audio import Pipeline as PyannotePipeline
33
+ from speechbrain.pretrained import SpeakerRecognition
34
+ from pydub import AudioSegment
35
+ from sklearn.cluster import KMeans
36
+ from sklearn.metrics import silhouette_score
37
+ from scenedetect import VideoManager, SceneManager
38
+ from scenedetect.detectors import ContentDetector
39
+
40
+ import os, base64, requests, subprocess, contextlib, time
41
+
42
+ from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
43
+ from PIL import Image
44
+
45
+ from libs.audio_tools_ana_2 import process_audio_for_video
46
+
47
+ import cv2
48
+
49
+ try:
50
+ import face_recognition # type: ignore
51
+ except Exception:
52
+ face_recognition = None # type: ignore
53
+
54
+ try:
55
+ # Utility wrapper for DeepFace
56
+ from libs.face_utils import FaceRecognizer as DFRecognizer
57
+ except Exception:
58
+ try:
59
+ from face_utils import FaceRecognizer as DFRecognizer
60
+ except Exception:
61
+ DFRecognizer = None # type: ignore
62
+
63
+ try:
64
+ from deepface import DeepFace
65
+ except ImportError:
66
+ DeepFace = None
67
+
68
+ import easyocr
69
+
70
+ # -------------------------------- Logging ------------------------------------
71
+ log = logging.getLogger("audio_tools")
72
+ if not log.handlers:
73
+ h = logging.StreamHandler()
74
+ h.setFormatter(logging.Formatter("[%(levelname)s] %(message)s"))
75
+ log.addHandler(h)
76
+ log.setLevel(logging.INFO)
77
+
78
+ # ============================ UTILS ===========================================
79
+ def load_config(path: str = "configs/config_veureu.yaml") -> Dict[str, Any]:
80
+ p = Path(path)
81
+ if not p.exists():
82
+ log.warning("Config file not found: %s (using defaults)", path)
83
+ return {}
84
+ try:
85
+ import yaml
86
+ cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
87
+ cfg["__path__"] = str(p)
88
+ return cfg
89
+ except Exception as e:
90
+ log.error("Failed to read YAML config: %s", e)
91
+ return {}
92
+
93
+ # ---------------------------- IMAGE EMBEDDING ----------------------------------
94
+ class FaceOfImageEmbedding:
95
+ """Preferred backend: `face_recognition`; fallback: DeepFace via libs.face_utils."""
96
+ def __init__(self, deepface_model: str = 'Facenet512'):
97
+ self.use_fr = face_recognition is not None
98
+ self.df = None
99
+ if not self.use_fr and DFRecognizer is not None:
100
+ try:
101
+ self.df = DFRecognizer(model_name=deepface_model)
102
+ log.info("Using DeepFace (%s) as face embedding backend.", deepface_model)
103
+ except Exception as e:
104
+ log.warning("Failed to initialize DeepFace: %s", e)
105
+ elif self.use_fr:
106
+ log.info("Using face_recognition as face embedding backend.")
107
+ else:
108
+ log.error("No face embedding backend available.")
109
+
110
+ def encode_image(self, image_path: Path) -> Optional[List[float]]:
111
+ import numpy as np
112
+ try:
113
+ if self.use_fr:
114
+ img = face_recognition.load_image_file(str(image_path)) # type: ignore
115
+ encs = face_recognition.face_encodings(img)
116
+ if encs:
117
+ # Normalize each embedding to unit norm
118
+ embeddings = [(e / np.linalg.norm(e)).astype(float).tolist() for e in encs]
119
+ return embeddings
120
+ return None
121
+
122
+ if self.df is not None:
123
+ emb = self.df.get_face_embedding_from_path(str(image_path))
124
+ if emb is None:
125
+ return None
126
+ # Convert to a numpy array and normalize
127
+ emb = np.array(emb, dtype=float)
128
+ emb = emb / np.linalg.norm(emb)
129
+ return emb.tolist()
130
+
131
+ except Exception as e:
132
+ log.debug("Fallo embedding cara %s: %s", image_path, e)
133
+
134
+ return None
135
+
136
+ class FaceAnalyzer:
137
+ """Wrapper sencillo para DeepFace que obtiene edad y género de una imagen."""
138
+ def __init__(self, actions=None):
139
+ if actions is None:
140
+ actions = ["age", "gender"]
141
+ self.actions = actions
142
+
143
+ def analyze_image(self, img_path: str) -> Optional[Dict[str, Any]]:
144
+ try:
145
+ result = DeepFace.analyze(img_path=img_path, actions=self.actions)
146
+
147
+ # If DeepFace returns a list (multiple faces), take the first one
148
+ if isinstance(result, list) and len(result) > 0:
149
+ result = result[0]
150
+
151
+ # Now we can safely access 'age' and 'dominant_gender'
152
+ return {
153
+ "age": result.get("age", "unknown"),
154
+ "gender": result.get("dominant_gender", "unknown")
155
+ }
156
+
157
+ except Exception as e:
158
+ log.warning("No se pudo analizar la imagen %s: %s", img_path, e)
159
+ return None
160
+
161
+ # ----------------------------------- FUNCTIONS -------------------------------------
162
+ def map_identities_per_second(frames_per_second, intervals):
163
+ for seg in intervals:
164
+ seg_start = seg["start"]
165
+ seg_end = seg["end"]
166
+
167
+ # collect identities from the frames within the segment range
168
+ identities = []
169
+ for f in frames_per_second:
170
+ if seg_start <= f["start"] <= seg_end:
171
+ for face in f.get("faces", []):
172
+ identities.append(face)
173
+
174
+ # count occurrences
175
+ seg["counts"] = dict(Counter(identities))
176
+
177
+ return intervals
178
+
179
+ def _split_montage(img: np.ndarray, n: int, cfg: Dict[str, Any]) -> List[np.ndarray]:
180
+ vd = cfg.get('vision_describer', {})
181
+ montage_cfg = vd.get('montage', {})
182
+ mode = montage_cfg.get('split_mode', 'horizontal') # 'horizontal'|'vertical'|'grid'
183
+
184
+ h, w = img.shape[:2]
185
+ tiles: List[np.ndarray] = []
186
+
187
+ if mode == 'vertical':
188
+ tile_h = h // n
189
+ for i in range(n):
190
+ y0 = i * tile_h; y1 = h if i == n-1 else (i+1) * tile_h
191
+ tiles.append(img[y0:y1, 0:w])
192
+ return tiles
193
+
194
+ if mode == 'grid':
195
+ rows = int(montage_cfg.get('rows', 1) or 1)
196
+ cols = int(montage_cfg.get('cols', n) or n)
197
+ assert rows * cols >= n, "grid rows*cols must be >= n"
198
+ tile_h = h // rows; tile_w = w // cols
199
+ k = 0
200
+ for r in range(rows):
201
+ for c in range(cols):
202
+ if k >= n: break
203
+ y0, y1 = r*tile_h, h if (r==rows-1) else (r+1)*tile_h
204
+ x0, x1 = c*tile_w, w if (c==cols-1) else (c+1)*tile_w
205
+ tiles.append(img[y0:y1, x0:x1]); k += 1
206
+ return tiles
207
+
208
+ tile_w = w // n
209
+ for i in range(n):
210
+ x0 = i * tile_w; x1 = w if i == n-1 else (i+1) * tile_w
211
+ tiles.append(img[0:h, x0:x1])
212
+ return tiles
213
+
214
+ def generar_montage(frame_paths: List[str], output_dir: str) -> str:
215
+ output_path = Path(output_dir)
216
+ output_path.mkdir(parents=True, exist_ok=True)
217
+ montage_path = ""
218
+
219
+ if frame_paths:
220
+ imgs = [cv2.imread(kf) for kf in frame_paths if os.path.exists(kf)]
221
+ imgs = [img for img in imgs if img is not None]
222
+ print(f"Se encontraron {len(imgs)} imágenes para el montaje.")
223
+
224
+ if imgs:
225
+ h = max(img.shape[0] for img in imgs) # maximum height
226
+ imgs_resized = [cv2.resize(img, (int(img.shape[1]*h/img.shape[0]), h)) for img in imgs]
227
+ montage = cv2.hconcat(imgs_resized)
228
+ montage_path = os.path.join(output_dir, "keyframes_montage.jpg")
229
+ print(f"Guardando montaje en: {montage_path}")
230
+ cv2.imwrite(montage_path, montage)
231
+ print("Montaje guardado.")
232
+ else:
233
+ print("No se encontraron imágenes válidas para el montaje.")
234
+
235
+ return montage_path
236
+
237
+ def describe_montage_sequence(
238
+ montage_path: str,
239
+ n: int,
240
+ informacion,
241
+ face_identities,
242
+ *,
243
+ config_path: str = 'configs/config_veureu.yaml'
244
+ ) -> List[str]:
245
+ """Describe each sub-image of a montage.
246
+
247
+ Inputs
248
+ ------
249
+ montage_path: str
250
+ Path to a composite image made of n sub-images placed sequentially.
251
+ n: int
252
+ Number of sub-images to split and describe.
253
+ config_path: str
254
+ Path to YAML with 'vision_describer' configuration (provider and params).
255
+
256
+ Returns
257
+ -------
258
+ List[str]: one description per sub-image.
259
+ """
260
+
261
+ path_model = "BSC-LT/salamandra-7b-vision"
262
+
263
+ processor = AutoProcessor.from_pretrained(path_model)
264
+ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
265
+ path_model,
266
+ torch_dtype=torch.float16,
267
+ low_cpu_mem_usage=True
268
+ ).to("cuda")
269
+
270
+ img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
271
+ if img is None:
272
+ raise RuntimeError(f"No se puede leer la imagen: {montage_path}")
273
+
274
+ cfg = load_config(config_path)
275
+ tiles = _split_montage(img, n, cfg)
276
+ if len(tiles) < n:
277
+ raise RuntimeError(f"Se produjeron {len(tiles)} tiles, se esperaban {n}")
278
+
279
+ # Convert each tile to a PIL Image
280
+ tile_images = [Image.fromarray(cv2.cvtColor(t, cv2.COLOR_BGR2RGB)) for t in tiles]
281
+
282
+ sys_prompt = (
283
+ "Ets un expert en narrativa visual. "
284
+ "Descriu la imatge de manera molt breu i senzilla, en català, "
285
+ "explicant només l’acció principal que s’hi veu. "
286
+ "Respon amb una sola frase curta (10–20 paraules com a màxim), "
287
+ "sense afegir detalls innecessaris ni descriure l’entorn."
288
+ )
289
+
290
+ all_results = []
291
+
292
+ for i in range(len(tile_images)):
293
+ batch = [tile_images[i]] # list with a single tile
294
+
295
+ conversation = [
296
+ {"role": "system", "content": sys_prompt},
297
+ {"role": "user", "content": [
298
+ {"type": "image", "image": batch[0]},
299
+ {"type": "text", "text": (
300
+ f"Descriu la imatge de manera molt breu i senzilla, en català. ")}
301
+ ]}
302
+ ]
303
+ prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
304
+
305
+ inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
306
+ for k, v in inputs.items():
307
+ if v.dtype.is_floating_point:
308
+ inputs[k] = v.to("cuda", torch.float16)
309
+ else:
310
+ inputs[k] = v.to("cuda")
311
+
312
+ output = model.generate(**inputs, max_new_tokens=1024)
313
+ text = processor.decode(output[0], skip_special_tokens=True)
314
+ lines = text.split("\n")
315
+
316
+ desc = ""
317
+ for j, line in enumerate(lines):  # separate index to avoid shadowing the outer tile loop variable
318
+ if line.lower().startswith(" assistant"):
319
+ desc = "\n".join(lines[j+1:]).strip()
320
+ break
321
+
322
+ all_results.append(desc)
323
+ torch.cuda.empty_cache()
324
+
325
+ return all_results
326
+
327
+ # --------------------------- IMAGES EXTRACTION -----------------------------
328
+ def keyframe_conditional_extraction_ana(
329
+ video_path,
330
+ output_dir,
331
+ threshold=30.0,
332
+ offset_frames=10
333
+ ):
334
+ """
335
+ Detects scene changes in a video, saves one frame per change, and
336
+ returns intervals with start and end times based on the keyframe timestamps
337
+ (the montage itself is built separately by `generar_montage`).
338
+ """
339
+ if not os.path.exists(output_dir):
340
+ os.makedirs(output_dir)
341
+
342
+ video_manager = VideoManager([video_path])
343
+ scene_manager = SceneManager()
344
+ scene_manager.add_detector(ContentDetector(threshold=threshold))
345
+
346
+ video_manager.start()
347
+ scene_manager.detect_scenes(video_manager)
348
+
349
+ scene_list = scene_manager.get_scene_list()
350
+
351
+ cap = cv2.VideoCapture(video_path)
352
+ fps = cap.get(cv2.CAP_PROP_FPS)
353
+ total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
354
+ video_duration = total_frames / fps
355
+
356
+ keyframes = []
357
+ for i, (start_time, end_time) in enumerate(scene_list):
358
+ frame_number = int(start_time.get_frames()) + offset_frames
359
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
360
+ ret, frame = cap.read()
361
+ if ret:
362
+ ts = frame_number / fps
363
+ frame_path = os.path.join(output_dir, f"scene_{i+1:03d}.jpg")
364
+ cv2.imwrite(frame_path, frame)
365
+ keyframes.append({
366
+ "index": i+1,
367
+ "time": round(ts, 2),
368
+ "path": frame_path
369
+ })
370
+
371
+ cap.release()
372
+ video_manager.release()
373
+
374
+ # Build intervals with start and end times
375
+ intervals = []
376
+ for i, kf in enumerate(keyframes):
377
+ start = kf["time"]
378
+ if i < len(keyframes) - 1:
379
+ end = keyframes[i+1]["time"]
380
+ else:
381
+ end = video_duration # last scene runs to the end of the video
382
+ intervals.append({
383
+ "index": kf["index"],
384
+ "start": start,
385
+ "end": round(end, 2),
386
+ "path": kf["path"]
387
+ })
388
+
389
+ return intervals
390
+
391
+ def keyframe_every_second(
392
+ video_path: str,
393
+ output_dir: str = ".",
394
+ max_frames: Optional[int] = 10000,
395
+ ) -> List[dict]:
396
+ """
397
+ Extracts one frame per second of the video.
398
+
399
+ Returns:
400
+ List[dict]: each element is {"index", "start", "end", "path"}
401
+ """
402
+ out_dir = Path(output_dir)
403
+ out_dir.mkdir(parents=True, exist_ok=True)
404
+
405
+ cap = cv2.VideoCapture(str(video_path))
406
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
407
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
408
+ duration = total_frames / fps
409
+
410
+ frames: List[dict] = []
411
+ idx = 0
412
+ sec = 0.0
413
+
414
+ while sec <= duration:
415
+ frame_number = int(sec * fps)
416
+ cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
417
+ ret, frame = cap.read()
418
+ if not ret:
419
+ break
420
+
421
+ timestamp = frame_number / fps
422
+ frame_path = out_dir / f"frame_per_second{idx:03d}.jpg"
423
+ cv2.imwrite(str(frame_path), frame)
424
+
425
+ frames.append({
426
+ "index": idx + 1,
427
+ "start": round(timestamp, 2),
428
+ "end": None, # lo completamos después
429
+ "path": str(frame_path),
430
+ })
431
+
432
+ idx += 1
433
+ sec += 1.0
434
+
435
+ if max_frames and idx >= max_frames:
436
+ break
437
+
438
+ cap.release()
439
+
440
+ # Fill each "end" with the start of the next frame
441
+ for i in range(len(frames)):
442
+ if i < len(frames) - 1:
443
+ frames[i]["end"] = frames[i+1]["start"]
444
+ else:
445
+ frames[i]["end"] = round(duration, 2)
446
+
447
+ return frames
448
+
449
+ from collections import Counter, defaultdict
450
+
451
+ # --------------------------- FRAMES PROCESSING -----------------------------
452
+ def process_frames(
453
+ frames: List[dict], # cada elemento es {"index", "start", "end", "path"}
454
+ config: dict,
455
+ face_col=None,
456
+ embedding_model=None,
457
+ ) -> List[dict]:
458
+ """
459
+ Processes keyframes:
460
+ - Detects faces
461
+ - Generates embeddings with FaceOfImageEmbedding
462
+ - Optionally compares against face_col (top-3 KNN)
463
+ - Optionally runs OCR
464
+ """
465
+
466
+ frame_results = []
467
+
468
+ # Create embedding_model if one is not provided
469
+ if embedding_model is None:
470
+ from libs.vision_tools import FaceOfImageEmbedding
471
+ embedding_model = FaceOfImageEmbedding()
472
+
473
+ for idx, frame in enumerate(frames):
474
+ frame_path = frame["path"]
475
+
476
+ try:
477
+ raw_faces = embedding_model.encode_image(Path(frame_path))
478
+ except Exception as e:
479
+ print(f"Error procesando {frame_path}: {e}")
480
+ raw_faces = None
481
+
482
+ faces = []
483
+ if raw_faces is not None:
484
+ if isinstance(raw_faces[0], list): # múltiples
485
+ for e in raw_faces:
486
+ faces.append({"embedding": e})
487
+ else: # uno solo
488
+ faces.append({"embedding": raw_faces})
489
+
490
+ faces_detected = []
491
+ for f in faces:
492
+ embedding = f.get("embedding")
493
+ identity = "Unknown"
494
+ knn = []
495
+
496
+ if face_col is not None and embedding is not None:
497
+ try:
498
+ num_embeddings = face_col.count()
499
+ if num_embeddings < 1:
500
+ knn = []
501
+ identity = "Unknown"
502
+
503
+ else:
504
+ n_results = min(3, num_embeddings)
505
+ q = face_col.query(
506
+ query_embeddings=[embedding],
507
+ n_results=n_results,
508
+ include=["metadatas", "distances"]
509
+ )
510
+
511
+ knn = []
512
+ metas = q.get("metadatas", [[]])[0]
513
+ dists = q.get("distances", [[]])[0]
514
+ for meta, dist in zip(metas, dists):
515
+ person_id = meta.get("identity", "Unknown") if isinstance(meta, dict) else "Unknown"
516
+ knn.append({"identity": person_id, "distance": float(dist)})
517
+
518
+ if knn and knn[0]["distance"] < 0.6:
519
+ identity = knn[0]["identity"]
520
+ else:
521
+ identity = "Unknown"
522
+
523
+ except Exception as e:
524
+ print(f"Face KNN failed: {e}")
525
+ knn = []
526
+ identity = "Unknown"
527
+
528
+ faces_detected.append(identity)
529
+
530
+ ocr_text_easyocr = ""  # default so the field is defined even if OCR fails or is disabled
+ use_easyocr = True
531
+ if use_easyocr:
532
+ try:
533
+ # Note: building the Reader per frame is expensive; it could be created once outside the loop
+ reader = easyocr.Reader(['en', 'es'], gpu=True) # set gpu=False if no GPU is available
534
+ results = reader.readtext(frame_path)
535
+ ocr_text_easyocr = " ".join([text for _, text, _ in results]).strip()
536
+
537
+ except Exception as e:
538
+ print(f"OCR error: {e}")
539
+
540
+ frame_results.append({
541
+ "id": frame["index"],
542
+ "start": frame["start"],
543
+ "end": frame["end"],
544
+ "image_path": frame_path,
545
+ "faces": faces_detected,
546
+ "ocr": ocr_text_easyocr,
547
+ })
548
+
549
+ return frame_results
550
+
551
+ if __name__ == "__main__":
552
+ import argparse
553
+ ap = argparse.ArgumentParser(description="Veureu — Audio tools (self-contained)")
554
+ ap.add_argument("--video", required=True)
555
+ ap.add_argument("--out", default="results")
556
+ ap.add_argument("--config", default="configs/config_veureu.yaml")
557
+ args = ap.parse_args()
558
+
559
+ # Lightweight config loader (only for sample run)
560
+ import yaml
561
+ cfg = {}
562
+ p = Path(args.config)
563
+ if p.exists():
564
+ cfg = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
565
+
566
+ out_dir = Path(args.out) / Path(args.video).stem
567
+ out_dir.mkdir(parents=True, exist_ok=True)
568
+
569
+ segs, srt = process_audio_for_video(args.video, out_dir, cfg, voice_collection=None)
570
+ print(json.dumps({
571
+ "segments": len(segs),
572
+ "srt": srt
573
+ }, indent=2, ensure_ascii=False))
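As a complement to the audio-oriented sample run above, a minimal sketch of exercising the vision helpers defined in this file. Paths are placeholders, no Chroma face collection is assumed, and the description step requires a GPU because it loads the Salamandra vision model on CUDA.

    # Hypothetical vision-side walkthrough of the helpers above.
    frames = keyframe_every_second("sample.mp4", output_dir="results/sample/frames", max_frames=60)
    processed = process_frames(frames, config={}, face_col=None)  # faces + OCR per frame
    intervals = keyframe_conditional_extraction_ana("sample.mp4", "results/sample/keyframes")
    montage = generar_montage([kf["path"] for kf in intervals], "results/sample")
    descriptions = describe_montage_sequence(montage, len(intervals), None, None)
    for interval, desc in zip(intervals, descriptions):
        print(interval["start"], interval["end"], desc)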