VeuReu committed on
Commit 6990f06 · verified · 1 Parent(s): 9d94fc7

Upload 2 files

Files changed (2):
  1. api.py +121 -124
  2. pipelines/audiodescription.py +147 -0
api.py CHANGED
@@ -1,5 +1,3 @@
- from pipelines.audiodescription import generate as ad_generate
-
  from __future__ import annotations
  from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks, HTTPException
  from fastapi.responses import JSONResponse, FileResponse
@@ -20,6 +18,8 @@ from narration_system import NarrationSystem
  from llm_router import load_yaml, LLMRouter
  from character_detection import detect_characters_from_video

  app = FastAPI(title="Veureu Engine API", version="0.2.0")
  app.add_middleware(
  CORSMiddleware,
@@ -221,125 +221,122 @@ def process_video_job(job_id: str):
  "characters": characters,
  "num_characters": len(characters),
  "analysis_path": analysis_path,
- "base_dir": str(base)
- }
- job["status"] = JobStatus.DONE
-
- print(f"[{job_id}] DEBUG - job['results'] guardado: {job['results']}")
-
- except Exception as e_detect:
- # Si falla la detección, intentar modo fallback
- import traceback
- print(f"[{job_id}] ✗ Error en detección: {e_detect}")
- print(f"[{job_id}] Traceback: {traceback.format_exc()}")
- print(f"[{job_id}] Usando modo fallback (carpetas vacías)")
-
- # Crear carpetas básicas como fallback
- for sub in ("sources", "faces", "voices", "backgrounds"):
- (base / sub).mkdir(parents=True, exist_ok=True)
-
- # Guardar resultados de fallback y luego marcar como completado
- job["results"] = {
- "characters": [],
- "num_characters": 0,
- "temp_dirs": {
- "sources": str(base / "sources"),
- "faces": str(base / "faces"),
- "voices": str(base / "voices"),
- "backgrounds": str(base / "backgrounds"),
- },
- "warning": f"Detección falló, usando modo fallback: {str(e_detect)}"
- }
- job["status"] = JobStatus.DONE
-
- print(f"[{job_id}] ✓ Job completado exitosamente")
-
- except Exception as e:
- print(f"[{job_id}] ✗ Error en el procesamiento: {e}")
- jobs[job_id]["status"] = JobStatus.FAILED
- jobs[job_id]["error"] = str(e)
-
- @app.post("/generate_audiodescription")
- async def generate_audiodescription(video: UploadFile = File(...)):
- try:
- import uuid
- job_id = str(uuid.uuid4())
- vid_name = video.filename or f"video_{job_id}.mp4"
- base = BASE_TEMP_DIR / Path(vid_name).stem
- base.mkdir(parents=True, exist_ok=True)
- # Save temp mp4
- video_path = base / vid_name
- with open(video_path, "wb") as f:
- f.write(await video.read())
-
- # Run MVP pipeline
- result = ad_generate(str(video_path), base)
-
- return {
- "status": "done",
- "results": {
- "une_srt": result.get("une_srt", ""),
- "free_text": result.get("free_text", ""),
- "artifacts": result.get("artifacts", {}),
- },
- }
- except Exception as e:
- import traceback
- print(f"/generate_audiodescription error: {e}\n{traceback.format_exc()}")
- raise HTTPException(status_code=500, detail=str(e))
-
- @app.post("/load_casting")
- async def load_casting(
- faces_dir: str = Form("identities/faces"),
- voices_dir: str = Form("identities/voices"),
- db_dir: str = Form("chroma_db"),
- drop_collections: bool = Form(False),
- ):
- client = ensure_chroma(Path(db_dir))
- n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
- n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
- return {"ok": True, "faces": n_faces, "voices": n_voices}
-
- @app.post("/refine_narration")
- async def refine_narration(
- dialogues_srt: str = Form(...),
- frame_descriptions_json: str = Form("[]"),
- config_path: str = Form("config.yaml"),
- ):
- cfg = load_yaml(config_path)
- frames = json.loads(frame_descriptions_json)
- model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
- use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
-
- if use_remote:
- router = LLMRouter(cfg)
- system_msg = (
- "Eres un sistema de audiodescripción que cumple UNE-153010. "
- "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
- "Devuelve JSON con {narrative_text, srt_text}."
- )
- prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
- try:
- txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
- out = {}
- try:
- out = json.loads(txt)
- except Exception:
- out = {"narrative_text": txt, "srt_text": ""}
- return {
- "narrative_text": out.get("narrative_text", ""),
- "srt_text": out.get("srt_text", ""),
- "approved": True,
- "critic_feedback": "",
- }
- except Exception:
- ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("narration_une_guidelines_path", "UNE_153010.txt"))
- res = ns.run(dialogues_srt, frames)
- return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}
-
- ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
- out = ns.run(dialogues_srt, frames)
- return {"narrative_text": out.narrative_text, "srt_text": out.srt_text, "approved": out.approved, "critic_feedback": out.critic_feedback}
-
- if __name__ == "__main__":
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
  from __future__ import annotations
  from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks, HTTPException
  from fastapi.responses import JSONResponse, FileResponse

  from llm_router import load_yaml, LLMRouter
  from character_detection import detect_characters_from_video

+ from pipelines.audiodescription import generate as ad_generate
+
  app = FastAPI(title="Veureu Engine API", version="0.2.0")
  app.add_middleware(
  CORSMiddleware,

  "characters": characters,
  "num_characters": len(characters),
  "analysis_path": analysis_path,
+ "base_dir": str(base)
+ }
+ job["status"] = JobStatus.DONE
+
+ print(f"[{job_id}] DEBUG - job['results'] guardado: {job['results']}")
+
+ except Exception as e_detect:
+ # Si falla la detección, intentar modo fallback
+ import traceback
+ print(f"[{job_id}] ✗ Error en detección: {e_detect}")
+ print(f"[{job_id}] Traceback: {traceback.format_exc()}")
+ print(f"[{job_id}] Usando modo fallback (carpetas vacías)")
+
+ # Crear carpetas básicas como fallback
+ for sub in ("sources", "faces", "voices", "backgrounds"):
+ (base / sub).mkdir(parents=True, exist_ok=True)
+
+ # Guardar resultados de fallback y luego marcar como completado
+ job["results"] = {
+ "characters": [],
+ "num_characters": 0,
+ "temp_dirs": {
+ "sources": str(base / "sources"),
+ "faces": str(base / "faces"),
+ "voices": str(base / "voices"),
+ "backgrounds": str(base / "backgrounds"),
+ },
+ "warning": f"Detección falló, usando modo fallback: {str(e_detect)}"
+ }
+ job["status"] = JobStatus.DONE
+
+ print(f"[{job_id}] ✓ Job completado exitosamente")
+
+
+ @app.post("/generate_audiodescription")
+ async def generate_audiodescription(video: UploadFile = File(...)):
+ try:
+ import uuid
+ job_id = str(uuid.uuid4())
+ vid_name = video.filename or f"video_{job_id}.mp4"
+ base = TEMP_ROOT / Path(vid_name).stem
+
+ base.mkdir(parents=True, exist_ok=True)
+ # Save temp mp4
+ video_path = base / vid_name
+ with open(video_path, "wb") as f:
+ f.write(await video.read())
+
+ # Run MVP pipeline
+ result = ad_generate(str(video_path), base)
+
+ return {
+ "status": "done",
+ "results": {
+ "une_srt": result.get("une_srt", ""),
+ "free_text": result.get("free_text", ""),
+ "artifacts": result.get("artifacts", {}),
+ },
+ }
+ except Exception as e:
+ import traceback
+ print(f"/generate_audiodescription error: {e}\n{traceback.format_exc()}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/load_casting")
+ async def load_casting(
+ faces_dir: str = Form("identities/faces"),
+ voices_dir: str = Form("identities/voices"),
+ db_dir: str = Form("chroma_db"),
+ drop_collections: bool = Form(False),
+ ):
+ client = ensure_chroma(Path(db_dir))
+ n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
+ n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
+ return {"ok": True, "faces": n_faces, "voices": n_voices}
+
+ @app.post("/refine_narration")
+ async def refine_narration(
+ dialogues_srt: str = Form(...),
+ frame_descriptions_json: str = Form("[]"),
+ config_path: str = Form("config.yaml"),
+ ):
+ cfg = load_yaml(config_path)
+ frames = json.loads(frame_descriptions_json)
+ model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
+ use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
+
+ if use_remote:
+ router = LLMRouter(cfg)
+ system_msg = (
+ "Eres un sistema de audiodescripción que cumple UNE-153010. "
+ "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
+ "Devuelve JSON con {narrative_text, srt_text}."
+ )
+ prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
+ try:
+ txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
+ out = {}
+ try:
+ out = json.loads(txt)
+ except Exception:
+ out = {"narrative_text": txt, "srt_text": ""}
+ return {
+ "narrative_text": out.get("narrative_text", ""),
+ "srt_text": out.get("srt_text", ""),
+ "approved": True,
+ "critic_feedback": "",
+ }
+ except Exception:
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("narration_une_guidelines_path", "UNE_153010.txt"))
+ res = ns.run(dialogues_srt, frames)
+ return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}
+
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
+ out = ns.run(dialogues_srt, frames)
+ return {"narrative_text": out.narrative_text, "srt_text": out.srt_text, "approved": out.approved, "critic_feedback": out.critic_feedback}
+
+ if __name__ == "__main__":
+ uvicorn.run(app, host="0.0.0.0", port=7860)
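
For reference, the new /generate_audiodescription endpoint accepts a single multipart upload under the form field "video" and responds with the UNE SRT, the free-text transcript, and artifact paths. Below is a minimal client sketch, assuming the engine is reachable on localhost:7860 (the port used in the __main__ block above); the file name sample.mp4 and the use of the requests library are illustrative, not part of this commit.

import requests

API_URL = "http://localhost:7860"  # assumed local deployment of the Veureu Engine API

with open("sample.mp4", "rb") as f:  # placeholder video file
    resp = requests.post(
        f"{API_URL}/generate_audiodescription",
        files={"video": ("sample.mp4", f, "video/mp4")},
    )
resp.raise_for_status()

payload = resp.json()
results = payload.get("results", {})
print(payload.get("status"))         # "done" on success
print(results.get("une_srt", ""))    # SRT built from diarized segments
print(results.get("free_text", ""))  # full ASR transcript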
 
 
 
pipelines/audiodescription.py ADDED
@@ -0,0 +1,147 @@
+ from __future__ import annotations
+
+ import os
+ import shlex
+ import subprocess
+ from pathlib import Path
+ from typing import Dict, Any, List, Tuple, Optional
+
+ # Minimal, robust MVP audio-only pipeline
+ # - Extract audio with ffmpeg
+ # - Diarize with pyannote (if HF token available); otherwise, fallback: single segment over full duration
+ # - ASR with Whisper (AINA if available optional). To keep footprint reasonable and robust,
+ # we'll default to a lightweight faster-whisper if present; otherwise, return empty text.
+ # - Generate basic SRT from segments and ASR texts.
+
+
+ def extract_audio_ffmpeg(video_path: str, audio_out: Path, sr: int = 16000, mono: bool = True) -> str:
+     audio_out.parent.mkdir(parents=True, exist_ok=True)
+     cmd = f'ffmpeg -y -i "{video_path}" -vn {"-ac 1" if mono else ""} -ar {sr} -f wav "{audio_out}"'
+     subprocess.run(shlex.split(cmd), check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     return str(audio_out)
+
+
+ def _get_video_duration_seconds(video_path: str) -> float:
+     try:
+         # Use ffprobe to get duration
+         cmd = f'ffprobe -v error -select_streams v:0 -show_entries stream=duration -of default=nw=1 "{video_path}"'
+         out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.DEVNULL).decode("utf-8", errors="ignore")
+         for line in out.splitlines():
+             if line.startswith("duration="):
+                 try:
+                     return float(line.split("=", 1)[1])
+                 except Exception:
+                     pass
+     except Exception:
+         pass
+     return 0.0
+
+
+ def diarize_audio(wav_path: str, base_dir: Path, hf_token_env: str | None = None) -> Tuple[List[Dict[str, Any]], List[str]]:
+     """Returns segments [{'start','end','speaker'}] and dummy clip_paths (not used in MVP)."""
+     segments: List[Dict[str, Any]] = []
+     clip_paths: List[str] = []
+     # Prefer PYANNOTE_TOKEN if provided; fallback to explicit env name, then HF_TOKEN
+     token = os.getenv("PYANNOTE_TOKEN") or (os.getenv(hf_token_env) if hf_token_env else os.getenv("HF_TOKEN"))
+     try:
+         if token:
+             from pyannote.audio import Pipeline  # type: ignore
+             pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=token)
+             diarization = pipeline(wav_path)
+             # Collect segments
+             # We don't export individual clips in MVP; just timestamps.
+             for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
+                 segments.append({
+                     "start": float(getattr(turn, "start", 0.0) or 0.0),
+                     "end": float(getattr(turn, "end", 0.0) or 0.0),
+                     "speaker": str(speaker) if speaker is not None else f"SPEAKER_{i:02d}",
+                 })
+         else:
+             # Fallback: single segment using full duration
+             # Caller must provide video path to compute exact duration; as we only have wav, skip precise duration
+             # and fallback to 0..0 (UI tolerates).
+             segments.append({"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"})
+     except Exception:
+         # Robust fallback
+         segments.append({"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"})
+     # Sort by start
+     segments = sorted(segments, key=lambda s: s.get("start", 0.0))
+     return segments, clip_paths
+
+
+ def _fmt_srt_time(seconds: float) -> str:
+     h = int(seconds // 3600)
+     m = int((seconds % 3600) // 60)
+     s = int(seconds % 60)
+     ms = int(round((seconds - int(seconds)) * 1000))
+     return f"{h:02}:{m:02}:{s:02},{ms:03}"
+
+
+ def _generate_srt(segments: List[Dict[str, Any]], texts: List[str]) -> str:
+     n = min(len(segments), len(texts))
+     lines: List[str] = []
+     for i in range(n):
+         seg = segments[i]
+         text = (texts[i] or "").strip()
+         start = float(seg.get("start", 0.0))
+         end = float(seg.get("end", max(start + 2.0, start)))
+         speaker = seg.get("speaker")
+         if speaker:
+             text = f"[{speaker}]: {text}" if text else f"[{speaker}]"
+         lines.append(str(i + 1))
+         lines.append(f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}")
+         lines.append(text)
+         lines.append("")
+     return "\n".join(lines).strip() + "\n"
+
+
+ def asr_transcribe_wav_simple(wav_path: str) -> str:
+     """Very robust ASR stub: try faster-whisper small if present; otherwise return empty text.
+     Intended for MVP in Spaces without heavy GPU. """
+     try:
+         from faster_whisper import WhisperModel  # type: ignore
+         model = WhisperModel("Systran/faster-whisper-small", device="cpu")
+         # Short transcript without timestamps
+         segments, info = model.transcribe(wav_path, vad_filter=True, without_timestamps=True, language=None)
+         text = " ".join(seg.text.strip() for seg in segments if getattr(seg, "text", None))
+         return text.strip()
+     except Exception:
+         # As last resort, empty text
+         return ""
+
+
+ def generate(video_path: str, out_dir: Path) -> Dict[str, Any]:
+     """End-to-end MVP that returns {'une_srt','free_text','artifacts':{...}}."""
+     out_dir.mkdir(parents=True, exist_ok=True)
+     wav_path = extract_audio_ffmpeg(video_path, out_dir / f"{Path(video_path).stem}.wav")
+
+     # Diarization (robust)
+     segments, _ = diarize_audio(wav_path, out_dir, hf_token_env="HF_TOKEN")
+
+     # ASR (for MVP: single transcript of full audio to use as 'free_text')
+     free_text = asr_transcribe_wav_simple(wav_path)
+
+     # Build per-segment 'texts' using a simple split of free_text if we have multiple segments
+     if not segments:
+         segments = [{"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"}]
+     texts: List[str] = []
+     if len(segments) <= 1:
+         texts = [free_text]
+     else:
+         # Naive split into N parts by words
+         words = free_text.split()
+         chunk = max(1, len(words) // len(segments))
+         for i in range(len(segments)):
+             start_idx = i * chunk
+             end_idx = (i + 1) * chunk if i < len(segments) - 1 else len(words)
+             texts.append(" ".join(words[start_idx:end_idx]))
+
+     une_srt = _generate_srt(segments, texts)
+
+     return {
+         "une_srt": une_srt,
+         "free_text": free_text,
+         "artifacts": {
+             "wav_path": str(wav_path),
+         },
+     }
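
The generate() function above chains these steps: ffmpeg audio extraction, optional pyannote diarization, optional faster-whisper ASR, and SRT assembly via _generate_srt, which numbers each segment, formats timestamps as HH:MM:SS,mmm with _fmt_srt_time, and prefixes each cue with its speaker label. A small sketch of driving the pipeline directly, assuming ffmpeg is installed and the repository root is on PYTHONPATH; the paths sample.mp4 and ./out are illustrative placeholders.

from pathlib import Path

from pipelines.audiodescription import generate, _generate_srt

# End-to-end run (pyannote and faster-whisper are optional; the pipeline degrades gracefully without them).
result = generate("sample.mp4", Path("./out"))  # illustrative input video and output directory
print(result["artifacts"]["wav_path"])          # extracted 16 kHz mono WAV
print(result["une_srt"])                        # SRT text, one cue per diarized segment

# SRT assembly on its own: one segment plus its text yields
#   1
#   00:00:00,000 --> 00:00:02,500
#   [SPEAKER_00]: Hola, bienvenidos.
print(_generate_srt(
    [{"start": 0.0, "end": 2.5, "speaker": "SPEAKER_00"}],
    ["Hola, bienvenidos."],
))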