VeuReu committed on
Commit 6990f06 · verified · 1 Parent(s): 9d94fc7

Upload 2 files

Files changed (2):
  1. api.py +121 -124
  2. pipelines/audiodescription.py +147 -0
api.py CHANGED
@@ -1,5 +1,3 @@
- from pipelines.audiodescription import generate as ad_generate
-
  from __future__ import annotations
  from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks, HTTPException
  from fastapi.responses import JSONResponse, FileResponse
@@ -20,6 +18,8 @@ from narration_system import NarrationSystem
  from llm_router import load_yaml, LLMRouter
  from character_detection import detect_characters_from_video

  app = FastAPI(title="Veureu Engine API", version="0.2.0")
  app.add_middleware(
  CORSMiddleware,
@@ -221,125 +221,122 @@ def process_video_job(job_id: str):
  "characters": characters,
  "num_characters": len(characters),
  "analysis_path": analysis_path,
- "base_dir": str(base)
- }
- job["status"] = JobStatus.DONE
-
- print(f"[{job_id}] DEBUG - job['results'] guardado: {job['results']}")
-
- except Exception as e_detect:
- # Si falla la detección, intentar modo fallback
- import traceback
- print(f"[{job_id}] ✗ Error en detección: {e_detect}")
- print(f"[{job_id}] Traceback: {traceback.format_exc()}")
- print(f"[{job_id}] Usando modo fallback (carpetas vacías)")
-
- # Crear carpetas básicas como fallback
- for sub in ("sources", "faces", "voices", "backgrounds"):
- (base / sub).mkdir(parents=True, exist_ok=True)
-
- # Guardar resultados de fallback y luego marcar como completado
- job["results"] = {
- "characters": [],
- "num_characters": 0,
- "temp_dirs": {
- "sources": str(base / "sources"),
- "faces": str(base / "faces"),
- "voices": str(base / "voices"),
- "backgrounds": str(base / "backgrounds"),
- },
- "warning": f"Detección falló, usando modo fallback: {str(e_detect)}"
- }
- job["status"] = JobStatus.DONE
-
- print(f"[{job_id}] ✓ Job completado exitosamente")
-
- except Exception as e:
- print(f"[{job_id}] ✗ Error en el procesamiento: {e}")
- jobs[job_id]["status"] = JobStatus.FAILED
- jobs[job_id]["error"] = str(e)
-
- @app.post("/generate_audiodescription")
- async def generate_audiodescription(video: UploadFile = File(...)):
- try:
- import uuid
- job_id = str(uuid.uuid4())
- vid_name = video.filename or f"video_{job_id}.mp4"
- base = BASE_TEMP_DIR / Path(vid_name).stem
- base.mkdir(parents=True, exist_ok=True)
- # Save temp mp4
- video_path = base / vid_name
- with open(video_path, "wb") as f:
- f.write(await video.read())
-
- # Run MVP pipeline
- result = ad_generate(str(video_path), base)
-
- return {
- "status": "done",
- "results": {
- "une_srt": result.get("une_srt", ""),
- "free_text": result.get("free_text", ""),
- "artifacts": result.get("artifacts", {}),
- },
- }
- except Exception as e:
- import traceback
- print(f"/generate_audiodescription error: {e}\n{traceback.format_exc()}")
- raise HTTPException(status_code=500, detail=str(e))
-
- @app.post("/load_casting")
- async def load_casting(
- faces_dir: str = Form("identities/faces"),
- voices_dir: str = Form("identities/voices"),
- db_dir: str = Form("chroma_db"),
- drop_collections: bool = Form(False),
- ):
- client = ensure_chroma(Path(db_dir))
- n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
- n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
- return {"ok": True, "faces": n_faces, "voices": n_voices}
-
- @app.post("/refine_narration")
- async def refine_narration(
- dialogues_srt: str = Form(...),
- frame_descriptions_json: str = Form("[]"),
- config_path: str = Form("config.yaml"),
- ):
- cfg = load_yaml(config_path)
- frames = json.loads(frame_descriptions_json)
- model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
- use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
-
- if use_remote:
- router = LLMRouter(cfg)
- system_msg = (
- "Eres un sistema de audiodescripción que cumple UNE-153010. "
- "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
- "Devuelve JSON con {narrative_text, srt_text}."
- )
- prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
- try:
- txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
- out = {}
- try:
- out = json.loads(txt)
- except Exception:
- out = {"narrative_text": txt, "srt_text": ""}
- return {
- "narrative_text": out.get("narrative_text", ""),
- "srt_text": out.get("srt_text", ""),
- "approved": True,
- "critic_feedback": "",
- }
- except Exception:
- ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("narration_une_guidelines_path", "UNE_153010.txt"))
- res = ns.run(dialogues_srt, frames)
- return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}
-
- ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
- out = ns.run(dialogues_srt, frames)
- return {"narrative_text": out.narrative_text, "srt_text": out.srt_text, "approved": out.approved, "critic_feedback": out.critic_feedback}
-
- if __name__ == "__main__":
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
  from __future__ import annotations
  from fastapi import FastAPI, UploadFile, File, Form, BackgroundTasks, HTTPException
  from fastapi.responses import JSONResponse, FileResponse

  from llm_router import load_yaml, LLMRouter
  from character_detection import detect_characters_from_video

+ from pipelines.audiodescription import generate as ad_generate
+
  app = FastAPI(title="Veureu Engine API", version="0.2.0")
  app.add_middleware(
  CORSMiddleware,

  "characters": characters,
  "num_characters": len(characters),
  "analysis_path": analysis_path,
+ "base_dir": str(base)
+ }
+ job["status"] = JobStatus.DONE
+
+ print(f"[{job_id}] DEBUG - job['results'] guardado: {job['results']}")
+
+ except Exception as e_detect:
+ # Si falla la detección, intentar modo fallback
+ import traceback
+ print(f"[{job_id}] ✗ Error en detección: {e_detect}")
+ print(f"[{job_id}] Traceback: {traceback.format_exc()}")
+ print(f"[{job_id}] Usando modo fallback (carpetas vacías)")
+
+ # Crear carpetas básicas como fallback
+ for sub in ("sources", "faces", "voices", "backgrounds"):
+ (base / sub).mkdir(parents=True, exist_ok=True)
+
+ # Guardar resultados de fallback y luego marcar como completado
+ job["results"] = {
+ "characters": [],
+ "num_characters": 0,
+ "temp_dirs": {
+ "sources": str(base / "sources"),
+ "faces": str(base / "faces"),
+ "voices": str(base / "voices"),
+ "backgrounds": str(base / "backgrounds"),
+ },
+ "warning": f"Detección falló, usando modo fallback: {str(e_detect)}"
+ }
+ job["status"] = JobStatus.DONE
+
+ print(f"[{job_id}] ✓ Job completado exitosamente")
+
+
+ @app.post("/generate_audiodescription")
+ async def generate_audiodescription(video: UploadFile = File(...)):
+ try:
+ import uuid
+ job_id = str(uuid.uuid4())
+ vid_name = video.filename or f"video_{job_id}.mp4"
+ base = TEMP_ROOT / Path(vid_name).stem
+
+ base.mkdir(parents=True, exist_ok=True)
+ # Save temp mp4
+ video_path = base / vid_name
+ with open(video_path, "wb") as f:
+ f.write(await video.read())
+
+ # Run MVP pipeline
+ result = ad_generate(str(video_path), base)
+
+ return {
+ "status": "done",
+ "results": {
+ "une_srt": result.get("une_srt", ""),
+ "free_text": result.get("free_text", ""),
+ "artifacts": result.get("artifacts", {}),
+ },
+ }
+ except Exception as e:
+ import traceback
+ print(f"/generate_audiodescription error: {e}\n{traceback.format_exc()}")
+ raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/load_casting")
+ async def load_casting(
+ faces_dir: str = Form("identities/faces"),
+ voices_dir: str = Form("identities/voices"),
+ db_dir: str = Form("chroma_db"),
+ drop_collections: bool = Form(False),
+ ):
+ client = ensure_chroma(Path(db_dir))
+ n_faces = build_faces_index(Path(faces_dir), client, collection_name="index_faces", drop=drop_collections)
+ n_voices = build_voices_index(Path(voices_dir), client, collection_name="index_voices", drop=drop_collections)
+ return {"ok": True, "faces": n_faces, "voices": n_voices}
+
+ @app.post("/refine_narration")
+ async def refine_narration(
+ dialogues_srt: str = Form(...),
+ frame_descriptions_json: str = Form("[]"),
+ config_path: str = Form("config.yaml"),
+ ):
+ cfg = load_yaml(config_path)
+ frames = json.loads(frame_descriptions_json)
+ model_name = cfg.get("narration", {}).get("model", "salamandra-instruct")
+ use_remote = model_name in (cfg.get("models", {}).get("routing", {}).get("use_remote_for", []))
+
+ if use_remote:
+ router = LLMRouter(cfg)
+ system_msg = (
+ "Eres un sistema de audiodescripción que cumple UNE-153010. "
+ "Fusiona diálogos del SRT con descripciones concisas en los huecos, evitando redundancias. "
+ "Devuelve JSON con {narrative_text, srt_text}."
+ )
+ prompt = json.dumps({"dialogues_srt": dialogues_srt, "frames": frames, "rules": cfg.get("narration", {})}, ensure_ascii=False)
+ try:
+ txt = router.instruct(prompt=prompt, system=system_msg, model=model_name)
+ out = {}
+ try:
+ out = json.loads(txt)
+ except Exception:
+ out = {"narrative_text": txt, "srt_text": ""}
+ return {
+ "narrative_text": out.get("narrative_text", ""),
+ "srt_text": out.get("srt_text", ""),
+ "approved": True,
+ "critic_feedback": "",
+ }
+ except Exception:
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("narration_une_guidelines_path", "UNE_153010.txt"))
+ res = ns.run(dialogues_srt, frames)
+ return {"narrative_text": res.narrative_text, "srt_text": res.srt_text, "approved": res.approved, "critic_feedback": res.critic_feedback}
+
+ ns = NarrationSystem(model_url=None, une_guidelines_path=cfg.get("narration", {}).get("une_guidelines_path", "UNE_153010.txt"))
+ out = ns.run(dialogues_srt, frames)
+ return {"narrative_text": out.narrative_text, "srt_text": out.srt_text, "approved": out.approved, "critic_feedback": out.critic_feedback}
+
+ if __name__ == "__main__":
+ uvicorn.run(app, host="0.0.0.0", port=7860)
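
For reference, the new /generate_audiodescription endpoint accepts a single multipart upload under the form field "video" and responds with the UNE SRT, the free-text transcript, and artifact paths. Below is a minimal client sketch, assuming the engine is reachable on localhost:7860 (the port used in the __main__ block above); the file name sample.mp4 and the use of the requests library are illustrative, not part of this commit.

import requests

API_URL = "http://localhost:7860"  # assumed local deployment of the Veureu Engine API

with open("sample.mp4", "rb") as f:  # placeholder video file
    resp = requests.post(
        f"{API_URL}/generate_audiodescription",
        files={"video": ("sample.mp4", f, "video/mp4")},
    )
resp.raise_for_status()

payload = resp.json()
results = payload.get("results", {})
print(payload.get("status"))         # "done" on success
print(results.get("une_srt", ""))    # SRT built from diarized segments
print(results.get("free_text", ""))  # full ASR transcript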
 
 
 
pipelines/audiodescription.py ADDED
@@ -0,0 +1,147 @@
+ from __future__ import annotations
+
+ import os
+ import shlex
+ import subprocess
+ from pathlib import Path
+ from typing import Dict, Any, List, Tuple, Optional
+
+ # Minimal, robust MVP audio-only pipeline
+ # - Extract audio with ffmpeg
+ # - Diarize with pyannote (if HF token available); otherwise, fallback: single segment over full duration
+ # - ASR with Whisper (AINA if available optional). To keep footprint reasonable and robust,
+ # we'll default to a lightweight faster-whisper if present; otherwise, return empty text.
+ # - Generate basic SRT from segments and ASR texts.
+
+
+ def extract_audio_ffmpeg(video_path: str, audio_out: Path, sr: int = 16000, mono: bool = True) -> str:
+     audio_out.parent.mkdir(parents=True, exist_ok=True)
+     cmd = f'ffmpeg -y -i "{video_path}" -vn {"-ac 1" if mono else ""} -ar {sr} -f wav "{audio_out}"'
+     subprocess.run(shlex.split(cmd), check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     return str(audio_out)
+
+
+ def _get_video_duration_seconds(video_path: str) -> float:
+     try:
+         # Use ffprobe to get duration
+         cmd = f'ffprobe -v error -select_streams v:0 -show_entries stream=duration -of default=nw=1 "{video_path}"'
+         out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.DEVNULL).decode("utf-8", errors="ignore")
+         for line in out.splitlines():
+             if line.startswith("duration="):
+                 try:
+                     return float(line.split("=", 1)[1])
+                 except Exception:
+                     pass
+     except Exception:
+         pass
+     return 0.0
+
+
+ def diarize_audio(wav_path: str, base_dir: Path, hf_token_env: str | None = None) -> Tuple[List[Dict[str, Any]], List[str]]:
+     """Returns segments [{'start','end','speaker'}] and dummy clip_paths (not used in MVP)."""
+     segments: List[Dict[str, Any]] = []
+     clip_paths: List[str] = []
+     # Prefer PYANNOTE_TOKEN if provided; fallback to explicit env name, then HF_TOKEN
+     token = os.getenv("PYANNOTE_TOKEN") or (os.getenv(hf_token_env) if hf_token_env else os.getenv("HF_TOKEN"))
+     try:
+         if token:
+             from pyannote.audio import Pipeline  # type: ignore
+             pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token=token)
+             diarization = pipeline(wav_path)
+             # Collect segments
+             # We don't export individual clips in MVP; just timestamps.
+             for i, (turn, _, speaker) in enumerate(diarization.itertracks(yield_label=True)):
+                 segments.append({
+                     "start": float(getattr(turn, "start", 0.0) or 0.0),
+                     "end": float(getattr(turn, "end", 0.0) or 0.0),
+                     "speaker": str(speaker) if speaker is not None else f"SPEAKER_{i:02d}",
+                 })
+         else:
+             # Fallback: single segment using full duration
+             # Caller must provide video path to compute exact duration; as we only have wav, skip precise duration
+             # and fallback to 0..0 (UI tolerates).
+             segments.append({"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"})
+     except Exception:
+         # Robust fallback
+         segments.append({"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"})
+     # Sort by start
+     segments = sorted(segments, key=lambda s: s.get("start", 0.0))
+     return segments, clip_paths
+
+
+ def _fmt_srt_time(seconds: float) -> str:
+     h = int(seconds // 3600)
+     m = int((seconds % 3600) // 60)
+     s = int(seconds % 60)
+     ms = int(round((seconds - int(seconds)) * 1000))
+     return f"{h:02}:{m:02}:{s:02},{ms:03}"
+
+
+ def _generate_srt(segments: List[Dict[str, Any]], texts: List[str]) -> str:
+     n = min(len(segments), len(texts))
+     lines: List[str] = []
+     for i in range(n):
+         seg = segments[i]
+         text = (texts[i] or "").strip()
+         start = float(seg.get("start", 0.0))
+         end = float(seg.get("end", max(start + 2.0, start)))
+         speaker = seg.get("speaker")
+         if speaker:
+             text = f"[{speaker}]: {text}" if text else f"[{speaker}]"
+         lines.append(str(i + 1))
+         lines.append(f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}")
+         lines.append(text)
+         lines.append("")
+     return "\n".join(lines).strip() + "\n"
+
+
+ def asr_transcribe_wav_simple(wav_path: str) -> str:
+     """Very robust ASR stub: try faster-whisper small if present; otherwise return empty text.
+     Intended for MVP in Spaces without heavy GPU. """
+     try:
+         from faster_whisper import WhisperModel  # type: ignore
+         model = WhisperModel("Systran/faster-whisper-small", device="cpu")
+         # Short transcript without timestamps
+         segments, info = model.transcribe(wav_path, vad_filter=True, without_timestamps=True, language=None)
+         text = " ".join(seg.text.strip() for seg in segments if getattr(seg, "text", None))
+         return text.strip()
+     except Exception:
+         # As last resort, empty text
+         return ""
+
+
+ def generate(video_path: str, out_dir: Path) -> Dict[str, Any]:
+     """End-to-end MVP that returns {'une_srt','free_text','artifacts':{...}}."""
+     out_dir.mkdir(parents=True, exist_ok=True)
+     wav_path = extract_audio_ffmpeg(video_path, out_dir / f"{Path(video_path).stem}.wav")
+
+     # Diarization (robust)
+     segments, _ = diarize_audio(wav_path, out_dir, hf_token_env="HF_TOKEN")
+
+     # ASR (for MVP: single transcript of full audio to use as 'free_text')
+     free_text = asr_transcribe_wav_simple(wav_path)
+
+     # Build per-segment 'texts' using a simple split of free_text if we have multiple segments
+     if not segments:
+         segments = [{"start": 0.0, "end": 0.0, "speaker": "SPEAKER_00"}]
+     texts: List[str] = []
+     if len(segments) <= 1:
+         texts = [free_text]
+     else:
+         # Naive split into N parts by words
+         words = free_text.split()
+         chunk = max(1, len(words) // len(segments))
+         for i in range(len(segments)):
+             start_idx = i * chunk
+             end_idx = (i + 1) * chunk if i < len(segments) - 1 else len(words)
+             texts.append(" ".join(words[start_idx:end_idx]))
+
+     une_srt = _generate_srt(segments, texts)
+
+     return {
+         "une_srt": une_srt,
+         "free_text": free_text,
+         "artifacts": {
+             "wav_path": str(wav_path),
+         },
+     }
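
The generate() function above chains these steps: ffmpeg audio extraction, optional pyannote diarization, optional faster-whisper ASR, and SRT assembly via _generate_srt, which numbers each segment, formats timestamps as HH:MM:SS,mmm with _fmt_srt_time, and prefixes each cue with its speaker label. A small sketch of driving the pipeline directly, assuming ffmpeg is installed and the repository root is on PYTHONPATH; the paths sample.mp4 and ./out are illustrative placeholders.

from pathlib import Path

from pipelines.audiodescription import generate, _generate_srt

# End-to-end run (pyannote and faster-whisper are optional; the pipeline degrades gracefully without them).
result = generate("sample.mp4", Path("./out"))  # illustrative input video and output directory
print(result["artifacts"]["wav_path"])          # extracted 16 kHz mono WAV
print(result["une_srt"])                        # SRT text, one cue per diarized segment

# SRT assembly on its own: one segment plus its text yields
#   1
#   00:00:00,000 --> 00:00:02,500
#   [SPEAKER_00]: Hola, bienvenidos.
print(_generate_srt(
    [{"start": 0.0, "end": 2.5, "speaker": "SPEAKER_00"}],
    ["Hola, bienvenidos."],
))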