Upload 2 files

- audio_tools.py  +141 -8
- config.yaml  +3 -0

audio_tools.py
CHANGED
@@ -2,7 +2,7 @@
 # -----------------------------------------------------------------------------
 # Veureu — AUDIO utilities (orchestrator w/ remote ASR)
 # - FFmpeg extraction (WAV)
-# - Diarization (pyannote)
+# - Diarization (pyannote or silence-based fallback) [local]
 # - Voice embeddings (SpeechBrain ECAPA) [local]
 # - Speaker identification (KMeans + ChromaDB optional) [local]
 # - ASR: delegated to HF Space `veureu/asr` (faster-whisper-large-v3-ca-3catparla)

@@ -35,8 +35,13 @@ except Exception:

 import soundfile as sf

-# Pyannote for diarization (local)
-from pyannote.audio import Pipeline
+# Pyannote for diarization (local) - optional
+try:
+    from pyannote.audio import Pipeline
+    HAS_PYANNOTE = True
+except Exception:
+    Pipeline = None  # type: ignore
+    HAS_PYANNOTE = False

 # Speaker embeddings (local)
 from speechbrain.inference.speaker import SpeakerRecognition  # v1.0+

@@ -143,6 +148,93 @@ def transcribe_audio_remote(audio_path: str | Path, cfg: Dict[str, Any]) -> Dict

 # -------------------------------- Diarization --------------------------------

+def diarize_audio_silence_based(
+    wav_path: str,
+    base_dir: Path,
+    clips_folder: str = "clips",
+    min_segment_duration: float = 20.0,
+    max_segment_duration: float = 50.0,
+    silence_thresh: int = -40,
+    min_silence_len: int = 500,
+) -> Tuple[List[str], List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]]:
+    """Segmentation based on silence detection (alternative to pyannote).
+    Returns (clip_paths, segments, info, connection_logs) in the same format as diarize_audio.
+    """
+    from pydub import AudioSegment
+    from pydub.silence import detect_nonsilent
+
+    audio = AudioSegment.from_wav(wav_path)
+    duration = len(audio) / 1000.0
+
+    # Detect non-silent chunks
+    nonsilent_ranges = detect_nonsilent(
+        audio,
+        min_silence_len=min_silence_len,
+        silence_thresh=silence_thresh
+    )
+
+    clips_dir = (base_dir / clips_folder)
+    clips_dir.mkdir(parents=True, exist_ok=True)
+    clip_paths: List[str] = []
+    segments: List[Dict[str, Any]] = []
+
+    for idx, (start_ms, end_ms) in enumerate(nonsilent_ranges):
+        start = start_ms / 1000.0
+        end = end_ms / 1000.0
+        seg_dur = end - start
+
+        # Filter by minimum duration
+        if seg_dur < min_segment_duration:
+            continue
+
+        # Split long segments
+        if seg_dur > max_segment_duration:
+            n = int(math.ceil(seg_dur / max_segment_duration))
+            sub_d = seg_dur / n
+            for j in range(n):
+                s = start + j * sub_d
+                e = min(end, start + (j + 1) * sub_d)
+                if e <= s:
+                    continue
+                clip = audio[int(s * 1000):int(e * 1000)]
+                cp = clips_dir / f"segment_{idx:03d}_{j:02d}.wav"
+                clip.export(cp, format="wav")
+                segments.append({"start": s, "end": e, "speaker": "UNKNOWN"})
+                clip_paths.append(str(cp))
+        else:
+            clip = audio[start_ms:end_ms]
+            cp = clips_dir / f"segment_{idx:03d}.wav"
+            clip.export(cp, format="wav")
+            segments.append({"start": start, "end": end, "speaker": "UNKNOWN"})
+            clip_paths.append(str(cp))
+
+    # Fallback: if no segments survived the filters, use the full audio
+    if not segments:
+        cp = clips_dir / "segment_000.wav"
+        audio.export(cp, format="wav")
+        return (
+            [str(cp)],
+            [{"start": 0.0, "end": duration, "speaker": "UNKNOWN"}],
+            {"diarization_ok": False, "error": "no_segments_after_silence_filter", "token_source": "silence-based"},
+            [{"service": "silence-detection", "phase": "done", "message": "Segmentation by silence completed"}]
+        )
+
+    diar_info = {
+        "diarization_ok": True,
+        "error": "",
+        "token_source": "silence-based",
+        "method": "silence-detection",
+        "num_segments": len(segments)
+    }
+    connection_logs = [{
+        "service": "silence-detection",
+        "phase": "done",
+        "message": f"Segmented audio into {len(segments)} clips based on silence"
+    }]
+
+    return clip_paths, segments, diar_info, connection_logs
+
+
 def diarize_audio(
     wav_path: str,
     base_dir: Path,

@@ -150,10 +242,33 @@ def diarize_audio(
     min_segment_duration: float = 20.0,
     max_segment_duration: float = 50.0,
     hf_token_env: str | None = None,
+    use_silence_fallback: bool = True,
+    force_silence_only: bool = False,
+    silence_thresh: int = -40,
+    min_silence_len: int = 500,
 ) -> Tuple[List[str], List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]]:
-    """Diarization with pyannote and clip export with pydub.
+    """Diarization with pyannote (or silence-based fallback) and clip export with pydub.
+
+    Args:
+        force_silence_only: If True, skip pyannote and use silence-based segmentation directly.
+        use_silence_fallback: If True and pyannote fails, use silence-based segmentation.
+        silence_thresh: dBFS threshold for silence detection (default -40).
+        min_silence_len: Minimum silence length in milliseconds (default 500).
+
     Returns (clip_paths, segments, info, connection_logs) where info includes diarization_ok and optional error.
     """
+    # If silence-only is forced or pyannote is unavailable, use silence-based segmentation directly
+    if force_silence_only or not HAS_PYANNOTE:
+        if not HAS_PYANNOTE:
+            log.info("pyannote not available, using silence-based segmentation")
+        else:
+            log.info("Using silence-based segmentation (forced)")
+        return diarize_audio_silence_based(
+            wav_path, base_dir, clips_folder,
+            min_segment_duration, max_segment_duration,
+            silence_thresh, min_silence_len
+        )
+
     from pydub import AudioSegment
     audio = AudioSegment.from_wav(wav_path)
     duration = len(audio) / 1000.0

@@ -177,9 +292,18 @@ def diarize_audio(
         dt = _t.time() - t0
         connection_logs.append({"service": "pyannote", "phase": "done", "message": f"Response from pyannote received in {dt:.2f} s"})
     except Exception as e:
-        log.warning(f"Diarization unavailable
+        log.warning(f"Diarization unavailable: {e}")
         diar_info.update({"diarization_ok": False, "error": str(e)})
         connection_logs.append({"service": "pyannote", "phase": "error", "message": f"pyannote error: {str(e)}"})
+
+        # Try silence-based segmentation as fallback
+        if use_silence_fallback:
+            log.info("Attempting silence-based segmentation as fallback...")
+            return diarize_audio_silence_based(
+                wav_path, base_dir, clips_folder,
+                min_segment_duration, max_segment_duration,
+                silence_thresh, min_silence_len
+            )

     clips_dir = (base_dir / clips_folder)
     clips_dir.mkdir(parents=True, exist_ok=True)

@@ -449,9 +573,18 @@ def process_audio_for_video(
     log.info("Audio extracted")

     diar_cfg = audio_cfg.get("diarization", {})
-    min_dur = float(diar_cfg.get("min_segment_duration",
-    max_dur = float(diar_cfg.get("max_segment_duration",
-
+    min_dur = float(diar_cfg.get("min_segment_duration", 0.5))
+    max_dur = float(diar_cfg.get("max_segment_duration", 10.0))
+    force_silence = bool(diar_cfg.get("force_silence_only", True))  # Default to silence-based
+    silence_thresh = int(diar_cfg.get("silence_thresh", -40))
+    min_silence_len = int(diar_cfg.get("min_silence_len", 500))
+
+    clip_paths, diar_segs, diar_info, connection_logs = diarize_audio(
+        wav_path, out_dir, "clips", min_dur, max_dur,
+        force_silence_only=force_silence,
+        silence_thresh=silence_thresh,
+        min_silence_len=min_silence_len
+    )
     log.info("Audio clips generated.")

     full_transcription = ""
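The new keyword arguments keep existing call sites working (pyannote remains the default path) while letting callers opt into the silence-based path explicitly. A minimal sketch of exercising it, assuming this module is importable as `audio_tools` and an `audio.wav` file exists — both names are placeholders, not part of this commit:

from pathlib import Path

from audio_tools import diarize_audio  # placeholder import path for this module

# Force the silence-based path: no pyannote model and no HF token required.
clip_paths, segments, info, logs = diarize_audio(
    wav_path="audio.wav",
    base_dir=Path("work"),
    min_segment_duration=0.5,   # keep short speech bursts (matches config.yaml)
    max_segment_duration=10.0,
    force_silence_only=True,
    silence_thresh=-40,         # anything quieter than -40 dBFS counts as silence
    min_silence_len=500,        # a pause must last >= 500 ms to split segments
)
print(info.get("method"), len(segments), "segments")

Note that every segment comes back with speaker "UNKNOWN": silence detection only segments, it does not identify speakers, so speaker identification still relies on the downstream embedding/KMeans stage.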
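Segment quality in the silence-based path is governed entirely by pydub's `detect_nonsilent`, which returns `[start_ms, end_ms]` pairs for every non-silent stretch. A quick way to calibrate the two knobs on real material before committing values to config.yaml (the input filename is a placeholder):

from pydub import AudioSegment
from pydub.silence import detect_nonsilent

audio = AudioSegment.from_wav("audio.wav")  # placeholder input

# Sweep the two knobs: a higher (less negative) silence_thresh and a shorter
# min_silence_len both yield more, smaller raw segments before any
# min/max-duration filtering in diarize_audio_silence_based().
for thresh, min_len in [(-45, 700), (-40, 500), (-35, 300)]:
    ranges = detect_nonsilent(audio, min_silence_len=min_len, silence_thresh=thresh)
    total = sum(e - s for s, e in ranges) / 1000.0
    print(f"thresh={thresh} dBFS, min_silence={min_len} ms -> "
          f"{len(ranges)} raw segments, {total:.1f} s of speech")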
config.yaml
CHANGED

@@ -55,8 +55,11 @@ audio_processing:

   diarization:
     enabled: true
+    force_silence_only: true  # Use silence-based segmentation (no pyannote)
     min_segment_duration: 0.5  # in seconds (short clips)
     max_segment_duration: 10.0
+    silence_thresh: -40  # dBFS threshold for silence detection
+    min_silence_len: 500  # milliseconds

   enable_voice_embeddings: true  # SpeechBrain ECAPA
   speaker_embedding:
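The new keys mirror the Python-side defaults, so a config without them still resolves to the silence-based path (force_silence_only defaults to True in process_audio_for_video). A sketch of how they are read back, using the same `get` defaults as the code above and assuming the top-level section is `audio_processing:` as the hunk context suggests (the load path is a placeholder):

import yaml

with open("config.yaml", encoding="utf-8") as fh:
    cfg = yaml.safe_load(fh)

# Same lookup chain and defaults as process_audio_for_video in this commit.
diar_cfg = cfg.get("audio_processing", {}).get("diarization", {})
force_silence = bool(diar_cfg.get("force_silence_only", True))
silence_thresh = int(diar_cfg.get("silence_thresh", -40))
min_silence_len = int(diar_cfg.get("min_silence_len", 500))
print(force_silence, silence_thresh, min_silence_len)  # -> True -40 500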