VeuReu committed
Commit ed7696e (verified)
Parent: 02fa79a

Upload 4 files
Files changed (3):
  1. api.py +41 -4
  2. audio_tools.py +39 -8
  3. llm_router.py +35 -4
api.py CHANGED
@@ -13,7 +13,7 @@ from enum import Enum
 import os
 
 from video_processing import process_video_pipeline
-from audio_tools import process_audio_for_video
+from audio_tools import process_audio_for_video, extract_audio_ffmpeg, embed_voice_segments
 from casting_loader import ensure_chroma, build_faces_index, build_voices_index
 from narration_system import NarrationSystem
 from llm_router import load_yaml, LLMRouter
@@ -181,8 +181,8 @@ def process_video_job(job_id: str):
         epsilon=epsilon,
         min_cluster_size=min_cluster_size,
         video_name=video_name,
-        start_offset_sec=5.0,
-        extract_every_sec=0.5
+        start_offset_sec=0.5,
+        extract_every_sec=0.25
     )
 
     print(f"[{job_id}] DEBUG - result completo: {result}")
@@ -231,11 +231,47 @@ def process_video_job(job_id: str):
     # Audio processing: diarization, ASR and voice embeddings
     try:
         cfg = load_yaml("config.yaml")
-        audio_segments, srt_unmod, full_txt = process_audio_for_video(video_path, base, cfg, voice_collection=None)
+        audio_segments, srt_unmod, full_txt, diar_info, connection_logs = process_audio_for_video(video_path, base, cfg, voice_collection=None)
+        # Forward connection events to the engine console
+        try:
+            for ev in (connection_logs or []):
+                msg = ev.get("message") if isinstance(ev, dict) else None
+                if msg:
+                    print(f"[{job_id}] {msg}")
+        except Exception:
+            pass
     except Exception as e_audio:
         import traceback
         print(f"[{job_id}] WARN - Audio pipeline failed: {e_audio}\n{traceback.format_exc()}")
         audio_segments, srt_unmod, full_txt = [], None, ""
+        diar_info = {"diarization_ok": False, "error": str(e_audio)}
+        connection_logs = []
+
+    # Fallback: if no audio segments were produced, create a minimal one from the full audio
+    if not audio_segments:
+        try:
+            from pathlib import Path as _P
+            from pydub import AudioSegment as _AS
+            wav_out = extract_audio_ffmpeg(video_path, base / f"{_P(video_path).stem}.wav", sr=16000)
+            audio = _AS.from_wav(wav_out)
+            clips_dir = base / "clips"
+            clips_dir.mkdir(parents=True, exist_ok=True)
+            cp = clips_dir / "segment_000.wav"
+            audio.export(cp, format="wav")
+            emb_list = embed_voice_segments([str(cp)])
+            audio_segments = [{
+                "segment": 0,
+                "start": 0.0,
+                "end": float(len(audio) / 1000.0),
+                "speaker": "SPEAKER_00",
+                "text": "",
+                "voice_embedding": emb_list[0] if emb_list else [],
+                "clip_path": str(cp),
+                "lang": "ca",
+                "lang_prob": 1.0,
+            }]
+        except Exception as _efb:
+            print(f"[{job_id}] WARN - Audio minimal fallback failed: {_efb}")
 
     # Voice clustering (DBSCAN over valid embeddings)
     from sklearn.cluster import DBSCAN
@@ -266,6 +302,7 @@ def process_video_job(job_id: str):
         "full_transcription": full_txt,
         "voice_labels": v_labels,
         "num_voice_embeddings": len(voice_embeddings),
+        "diarization_info": diar_info,
     }
     job["status"] = JobStatus.DONE
 
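The new five-value return from process_audio_for_video is easier to review with concrete shapes in mind. The sketch below is illustrative only: the keys mirror those written by diarize_audio in audio_tools.py, while job_id and the sample values are invented.

from typing import Any, Dict, List

# Hypothetical sample values; real ones come from process_audio_for_video().
diar_info: Dict[str, Any] = {
    "diarization_ok": True,   # False when the pyannote pipeline could not run
    "error": "",              # exception text when diarization_ok is False
    "token_source": "PYANNOTE_TOKEN",  # or "hf_token_env" / "none"
}
connection_logs: List[Dict[str, Any]] = [
    {"service": "pyannote", "phase": "connect", "message": "Connecting to pyannote server..."},
    {"service": "pyannote", "phase": "done", "message": "Response from pyannote received in 12.34 s"},
]

# process_video_job simply forwards each event's message to the engine console:
job_id = "job-0001"  # placeholder id
for ev in connection_logs or []:
    msg = ev.get("message") if isinstance(ev, dict) else None
    if msg:
        print(f"[{job_id}] {msg}")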
 
audio_tools.py CHANGED
@@ -139,21 +139,36 @@ def diarize_audio(
     min_segment_duration: float = 20.0,
     max_segment_duration: float = 50.0,
     hf_token_env: str | None = None,
-) -> Tuple[List[str], List[Dict[str, Any]]]:
-    """Diarization with pyannote and clip export with pydub."""
+) -> Tuple[List[str], List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]]:
+    """Diarization with pyannote and clip export with pydub.
+    Returns (clip_paths, segments, diar_info, connection_logs); diar_info includes diarization_ok and an optional error.
+    """
     from pydub import AudioSegment
     audio = AudioSegment.from_wav(wav_path)
     duration = len(audio) / 1000.0
 
     diarization = None
+    connection_logs: List[Dict[str, Any]] = []
+    diar_info: Dict[str, Any] = {"diarization_ok": True, "error": "", "token_source": ""}
     try:
+        # For pyannote, use PYANNOTE_TOKEN exclusively (or an explicitly provided token)
+        _env_token = os.getenv("PYANNOTE_TOKEN")
+        _token = hf_token_env or _env_token
+        diar_info["token_source"] = "hf_token_env" if hf_token_env else ("PYANNOTE_TOKEN" if _env_token else "none")
+        import time as _t
+        t0 = _t.time()
         pipeline = Pipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
-            use_auth_token=(hf_token_env or os.getenv("HF_TOKEN"))
+            use_auth_token=_token
         )
+        connection_logs.append({"service": "pyannote", "phase": "connect", "message": "Connecting to pyannote server..."})
         diarization = pipeline(wav_path)
+        dt = _t.time() - t0
+        connection_logs.append({"service": "pyannote", "phase": "done", "message": f"Response from pyannote received in {dt:.2f} s"})
     except Exception as e:
         log.warning(f"Diarization unavailable, using single full segment fallback: {e}")
+        diar_info.update({"diarization_ok": False, "error": str(e)})
+        connection_logs.append({"service": "pyannote", "phase": "error", "message": f"pyannote error: {str(e)}"})
 
     clips_dir = (base_dir / clips_folder)
     clips_dir.mkdir(parents=True, exist_ok=True)
@@ -203,11 +218,17 @@ def diarize_audio(
     if not segments:
         cp = clips_dir / "segment_000.wav"
         audio.export(cp, format="wav")
-        return [str(cp)], [{"start": 0.0, "end": duration, "speaker": "SPEAKER_00"}]
+        # Not necessarily an error; this can also happen because of post-filtering thresholds.
+        if diar_info.get("error"):
+            # already marked
+            pass
+        else:
+            diar_info["reason"] = "no_segments_after_filter"
+        return [str(cp)], [{"start": 0.0, "end": duration, "speaker": "SPEAKER_00"}], diar_info, connection_logs
 
     pairs = sorted(zip(clip_paths, segments), key=lambda x: x[1]["start"])
     clip_paths, segments = [p[0] for p in pairs], [p[1] for p in pairs]
-    return clip_paths, segments
+    return clip_paths, segments, diar_info, connection_logs
 
 # ------------------------------ Voice embeddings -----------------------------
 
@@ -395,7 +416,7 @@ def process_audio_for_video(
     out_dir: Path,
     cfg: Dict[str, Any],
     voice_collection=None,
-) -> Tuple[List[Dict[str, Any]], Optional[str], str]:
+) -> Tuple[List[Dict[str, Any]], Optional[str], str, Dict[str, Any], List[Dict[str, Any]]]:
     """
     Audio pipeline: FFmpeg -> diarization -> remote ASR (full + clips) -> embeddings -> speaker-ID -> SRT.
     Returns (audio_segments, srt_path or None, full_transcription_text).
@@ -409,21 +430,31 @@ def process_audio_for_video(
     diar_cfg = audio_cfg.get("diarization", {})
     min_dur = float(diar_cfg.get("min_segment_duration", 20.0))
     max_dur = float(diar_cfg.get("max_segment_duration", 50.0))
-    clip_paths, diar_segs = diarize_audio(wav_path, out_dir, "clips", min_dur, max_dur)
+    clip_paths, diar_segs, diar_info, connection_logs = diarize_audio(wav_path, out_dir, "clips", min_dur, max_dur)
     log.info("Clips de audio generados.")
 
     full_transcription = ""
     asr_section = cfg.get("asr", {})
     if asr_section.get("enable_full_transcription", True):
         log.info("Transcripción completa (remota, Space 'asr')...")
+        import time as _t
+        t0 = _t.time()
+        connection_logs.append({"service": "asr", "phase": "connect", "message": "Connecting to ASR space..."})
         full_res = transcribe_audio_remote(wav_path, cfg)
+        dt = _t.time() - t0
+        connection_logs.append({"service": "asr", "phase": "done", "message": f"Response from ASR space received in {dt:.2f} s"})
         full_transcription = full_res.get("text", "") or ""
         log.info("Transcripción completa finalizada.")
 
     log.info("Transcripción por clip (remota, Space 'asr')...")
     trans: List[str] = []
     for cp in clip_paths:
+        import time as _t
+        t0 = _t.time()
+        connection_logs.append({"service": "asr", "phase": "connect", "message": "Transcribing clip via ASR space..."})
         res = transcribe_audio_remote(cp, cfg)
+        dt = _t.time() - t0
+        connection_logs.append({"service": "asr", "phase": "done", "message": f"Clip transcribed in {dt:.2f} s"})
         trans.append(res.get("text", ""))
 
     log.info("Se han transcrito todos los clips.")
@@ -467,4 +498,4 @@ def process_audio_for_video(
         log.warning(f"SRT generation failed: {e}")
         srt_unmodified_path = None
 
-    return audio_segments, srt_unmodified_path, full_transcription
+    return audio_segments, srt_unmodified_path, full_transcription, diar_info, connection_logs
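
The connect/time/append pattern around the remote calls above is repeated three times (pyannote, full-file ASR, per-clip ASR). A small context manager could factor it out; the sketch below is a hypothetical refactor, not part of this commit, and log_remote_call is an invented name.

import time
from contextlib import contextmanager
from typing import Any, Dict, Iterator, List

@contextmanager
def log_remote_call(connection_logs: List[Dict[str, Any]], service: str,
                    connect_msg: str, done_fmt: str) -> Iterator[None]:
    # Record the "connect" event, time the wrapped block, then record "done".
    connection_logs.append({"service": service, "phase": "connect", "message": connect_msg})
    t0 = time.time()
    try:
        yield
    finally:
        dt = time.time() - t0
        connection_logs.append({"service": service, "phase": "done", "message": done_fmt.format(dt=dt)})

# Usage mirroring the full-transcription call above (transcribe_audio_remote assumed to exist):
# with log_remote_call(connection_logs, "asr", "Connecting to ASR space...",
#                      "Response from ASR space received in {dt:.2f} s"):
#     full_res = transcribe_audio_remote(wav_path, cfg)
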
llm_router.py CHANGED
@@ -6,6 +6,7 @@ import os
 import yaml
 
 from remote_clients import InstructClient, VisionClient, ToolsClient, ASRClient
+import time
 
 def load_yaml(path: str) -> Dict[str, Any]:
     p = Path(path)
@@ -36,26 +37,56 @@ class LLMRouter:
             "whisper-catalan": mk("whisper-catalan", ASRClient),
         }
 
+        self.service_names = {
+            "salamandra-instruct": "schat",
+            "salamandra-vision": "svision",
+            "salamandra-tools": "stools",
+            "whisper-catalan": "asr",
+        }
+
+    def _log_connect(self, model_key: str, phase: str, elapsed: float | None = None):
+        svc = self.service_names.get(model_key, model_key)
+        if phase == "connect":
+            print(f"[LLMRouter] Connecting to {svc} space...")
+        elif phase == "done":
+            print(f"[LLMRouter] Response from {svc} space received in {elapsed:.2f} s")
+
     # ---- INSTRUCT ----
     def instruct(self, prompt: str, system: Optional[str] = None, model: str = "salamandra-instruct", **kwargs) -> str:
         if model in self.rem:
-            return self.clients[model].generate(prompt, system=system, **kwargs)  # type: ignore
+            self._log_connect(model, "connect")
+            t0 = time.time()
+            out = self.clients[model].generate(prompt, system=system, **kwargs)  # type: ignore
+            self._log_connect(model, "done", time.time() - t0)
+            return out
         raise RuntimeError(f"Modelo local no implementado para: {model}")
 
     # ---- VISION ----
     def vision_describe(self, image_paths: List[str], context: Optional[Dict[str, Any]] = None, model: str = "salamandra-vision", **kwargs) -> List[str]:
         if model in self.rem:
-            return self.clients[model].describe(image_paths, context=context, **kwargs)  # type: ignore
+            self._log_connect(model, "connect")
+            t0 = time.time()
+            out = self.clients[model].describe(image_paths, context=context, **kwargs)  # type: ignore
+            self._log_connect(model, "done", time.time() - t0)
+            return out
         raise RuntimeError(f"Modelo local no implementado para: {model}")
 
     # ---- TOOLS ----
     def chat_with_tools(self, messages: List[Dict[str, str]], tools: Optional[List[Dict[str, Any]]] = None, model: str = "salamandra-tools", **kwargs) -> Dict[str, Any]:
         if model in self.rem:
-            return self.clients[model].chat(messages, tools=tools, **kwargs)  # type: ignore
+            self._log_connect(model, "connect")
+            t0 = time.time()
+            out = self.clients[model].chat(messages, tools=tools, **kwargs)  # type: ignore
+            self._log_connect(model, "done", time.time() - t0)
+            return out
         raise RuntimeError(f"Modelo local no implementado para: {model}")
 
     # ---- ASR ----
     def asr_transcribe(self, audio_path: str, model: str = "whisper-catalan", **kwargs) -> Dict[str, Any]:
         if model in self.rem:
-            return self.clients[model].transcribe(audio_path, **kwargs)  # type: ignore
+            self._log_connect(model, "connect")
+            t0 = time.time()
+            out = self.clients[model].transcribe(audio_path, **kwargs)  # type: ignore
+            self._log_connect(model, "done", time.time() - t0)
+            return out
         raise RuntimeError(f"Modelo local no implementado para: {model}")
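
With the new _log_connect hook, every routed remote call now prints a connect line and a timed response line using the short service names (schat, svision, stools, asr). A rough usage sketch follows; it assumes config.yaml declares the remote Spaces and that LLMRouter takes the loaded config in its constructor, and the prompt and timing shown are invented.

from llm_router import load_yaml, LLMRouter

cfg = load_yaml("config.yaml")
router = LLMRouter(cfg)  # constructor signature assumed, not shown in this diff

text = router.instruct("Summarize the clip in one sentence.", model="salamandra-instruct")
# Expected console output (timing value illustrative):
#   [LLMRouter] Connecting to schat space...
#   [LLMRouter] Response from schat space received in 3.21 s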