VeuReu committed (verified)
Commit aa81525 · Parent: 87c8f64

Upload 2 files

Files changed (2):
  1. audio_tools.py +141 -8
  2. config.yaml +3 -0
audio_tools.py CHANGED

```diff
@@ -2,7 +2,7 @@
 # -----------------------------------------------------------------------------
 # Veureu — AUDIO utilities (orchestrator w/ remote ASR)
 # - FFmpeg extraction (WAV)
-# - Diarization (pyannote) [local]
+# - Diarization (pyannote or silence-based fallback) [local]
 # - Voice embeddings (SpeechBrain ECAPA) [local]
 # - Speaker identification (KMeans + ChromaDB optional) [local]
 # - ASR: delegated to HF Space `veureu/asr` (faster-whisper-large-v3-ca-3catparla)
@@ -35,8 +35,13 @@ except Exception:
 
 import soundfile as sf
 
-# Pyannote for diarization (local)
-from pyannote.audio import Pipeline
+# Pyannote for diarization (local) - optional
+try:
+    from pyannote.audio import Pipeline
+    HAS_PYANNOTE = True
+except Exception:
+    Pipeline = None  # type: ignore
+    HAS_PYANNOTE = False
 
 # Speaker embeddings (local)
 from speechbrain.inference.speaker import SpeakerRecognition  # v1.0+
@@ -143,6 +148,93 @@ def transcribe_audio_remote(audio_path: str | Path, cfg: Dict[str, Any]) -> Dict
 
 # -------------------------------- Diarization --------------------------------
 
+def diarize_audio_silence_based(
+    wav_path: str,
+    base_dir: Path,
+    clips_folder: str = "clips",
+    min_segment_duration: float = 20.0,
+    max_segment_duration: float = 50.0,
+    silence_thresh: int = -40,
+    min_silence_len: int = 500,
+) -> Tuple[List[str], List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]]:
+    """Segmentation based on silence detection (alternative to pyannote).
+    Returns (clip_paths, segments, info, connection_logs) in same format as diarize_audio.
+    """
+    from pydub import AudioSegment
+    from pydub.silence import detect_nonsilent
+
+    audio = AudioSegment.from_wav(wav_path)
+    duration = len(audio) / 1000.0
+
+    # Detect non-silent chunks
+    nonsilent_ranges = detect_nonsilent(
+        audio,
+        min_silence_len=min_silence_len,
+        silence_thresh=silence_thresh
+    )
+
+    clips_dir = (base_dir / clips_folder)
+    clips_dir.mkdir(parents=True, exist_ok=True)
+    clip_paths: List[str] = []
+    segments: List[Dict[str, Any]] = []
+
+    for idx, (start_ms, end_ms) in enumerate(nonsilent_ranges):
+        start = start_ms / 1000.0
+        end = end_ms / 1000.0
+        seg_dur = end - start
+
+        # Filter by minimum duration
+        if seg_dur < min_segment_duration:
+            continue
+
+        # Split long segments
+        if seg_dur > max_segment_duration:
+            n = int(math.ceil(seg_dur / max_segment_duration))
+            sub_d = seg_dur / n
+            for j in range(n):
+                s = start + j * sub_d
+                e = min(end, start + (j + 1) * sub_d)
+                if e <= s:
+                    continue
+                clip = audio[int(s * 1000):int(e * 1000)]
+                cp = clips_dir / f"segment_{idx:03d}_{j:02d}.wav"
+                clip.export(cp, format="wav")
+                segments.append({"start": s, "end": e, "speaker": "UNKNOWN"})
+                clip_paths.append(str(cp))
+        else:
+            clip = audio[start_ms:end_ms]
+            cp = clips_dir / f"segment_{idx:03d}.wav"
+            clip.export(cp, format="wav")
+            segments.append({"start": start, "end": end, "speaker": "UNKNOWN"})
+            clip_paths.append(str(cp))
+
+    # Fallback: if no segments, use full audio
+    if not segments:
+        cp = clips_dir / "segment_000.wav"
+        audio.export(cp, format="wav")
+        return (
+            [str(cp)],
+            [{"start": 0.0, "end": duration, "speaker": "UNKNOWN"}],
+            {"diarization_ok": False, "error": "no_segments_after_silence_filter", "token_source": "silence-based"},
+            [{"service": "silence-detection", "phase": "done", "message": "Segmentation by silence completed"}]
+        )
+
+    diar_info = {
+        "diarization_ok": True,
+        "error": "",
+        "token_source": "silence-based",
+        "method": "silence-detection",
+        "num_segments": len(segments)
+    }
+    connection_logs = [{
+        "service": "silence-detection",
+        "phase": "done",
+        "message": f"Segmented audio into {len(segments)} clips based on silence"
+    }]
+
+    return clip_paths, segments, diar_info, connection_logs
+
+
 def diarize_audio(
     wav_path: str,
     base_dir: Path,
@@ -150,10 +242,33 @@ def diarize_audio(
     min_segment_duration: float = 20.0,
     max_segment_duration: float = 50.0,
     hf_token_env: str | None = None,
+    use_silence_fallback: bool = True,
+    force_silence_only: bool = False,
+    silence_thresh: int = -40,
+    min_silence_len: int = 500,
 ) -> Tuple[List[str], List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]]:
-    """Diarization with pyannote and clip export with pydub.
+    """Diarization with pyannote (or silence-based fallback) and clip export with pydub.
+
+    Args:
+        force_silence_only: If True, skip pyannote and use silence-based segmentation directly.
+        use_silence_fallback: If True and pyannote fails, use silence-based segmentation.
+        silence_thresh: dBFS threshold for silence detection (default -40).
+        min_silence_len: Minimum silence length in milliseconds (default 500).
+
     Returns (clip_paths, segments, info) where info includes diarization_ok and optional error.
     """
+    # If forced to use silence-only or pyannote not available, use silence-based directly
+    if force_silence_only or not HAS_PYANNOTE:
+        if not HAS_PYANNOTE:
+            log.info("pyannote not available, using silence-based segmentation")
+        else:
+            log.info("Using silence-based segmentation (forced)")
+        return diarize_audio_silence_based(
+            wav_path, base_dir, clips_folder,
+            min_segment_duration, max_segment_duration,
+            silence_thresh, min_silence_len
+        )
+
     from pydub import AudioSegment
     audio = AudioSegment.from_wav(wav_path)
     duration = len(audio) / 1000.0
@@ -177,9 +292,18 @@ def diarize_audio(
         dt = _t.time() - t0
         connection_logs.append({"service": "pyannote", "phase": "done", "message": f"Response from pyannote received in {dt:.2f} s"})
     except Exception as e:
-        log.warning(f"Diarization unavailable, using single full segment fallback: {e}")
+        log.warning(f"Diarization unavailable: {e}")
         diar_info.update({"diarization_ok": False, "error": str(e)})
         connection_logs.append({"service": "pyannote", "phase": "error", "message": f"pyannote error: {str(e)}"})
+
+        # Try silence-based segmentation as fallback
+        if use_silence_fallback:
+            log.info("Attempting silence-based segmentation as fallback...")
+            return diarize_audio_silence_based(
+                wav_path, base_dir, clips_folder,
+                min_segment_duration, max_segment_duration,
+                silence_thresh, min_silence_len
+            )
 
     clips_dir = (base_dir / clips_folder)
     clips_dir.mkdir(parents=True, exist_ok=True)
@@ -449,9 +573,18 @@ def process_audio_for_video(
     log.info("Audio extraído")
 
     diar_cfg = audio_cfg.get("diarization", {})
-    min_dur = float(diar_cfg.get("min_segment_duration", 20.0))
-    max_dur = float(diar_cfg.get("max_segment_duration", 50.0))
-    clip_paths, diar_segs, diar_info, connection_logs = diarize_audio(wav_path, out_dir, "clips", min_dur, max_dur)
+    min_dur = float(diar_cfg.get("min_segment_duration", 0.5))
+    max_dur = float(diar_cfg.get("max_segment_duration", 10.0))
+    force_silence = bool(diar_cfg.get("force_silence_only", True))  # Default to silence-based
+    silence_thresh = int(diar_cfg.get("silence_thresh", -40))
+    min_silence_len = int(diar_cfg.get("min_silence_len", 500))
+
+    clip_paths, diar_segs, diar_info, connection_logs = diarize_audio(
+        wav_path, out_dir, "clips", min_dur, max_dur,
+        force_silence_only=force_silence,
+        silence_thresh=silence_thresh,
+        min_silence_len=min_silence_len
+    )
    log.info("Clips de audio generados.")
 
     full_transcription = ""
```
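The new silence-based path rests entirely on pydub's `detect_nonsilent`, which returns `[start_ms, end_ms]` pairs for regions louder than `silence_thresh` dBFS that are separated by at least `min_silence_len` ms of silence. A minimal sketch for sanity-checking those two knobs in isolation (the input file name is hypothetical):

```python
# Minimal sketch, assuming pydub is installed and "sample.wav" (hypothetical)
# is a local WAV file; uses the defaults this commit introduces (-40 dBFS, 500 ms).
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

audio = AudioSegment.from_wav("sample.wav")

# Each pair marks a non-silent region, in milliseconds.
for start_ms, end_ms in detect_nonsilent(audio, min_silence_len=500, silence_thresh=-40):
    print(f"speech: {start_ms / 1000:.2f}s - {end_ms / 1000:.2f}s")
```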
 
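Taken together, the changes give callers three ways into `diarize_audio`: pyannote as before, automatic fallback when pyannote fails, or silence-only via the new flag. A usage sketch (paths hypothetical, assuming `audio_tools` is importable) passing the same keyword arguments that `process_audio_for_video` now forwards:

```python
# Usage sketch; "session.wav" and "out/" are hypothetical, and the keyword
# arguments mirror the diarization defaults read from config.yaml.
from pathlib import Path

from audio_tools import diarize_audio

clip_paths, segments, info, logs = diarize_audio(
    "session.wav",             # WAV previously extracted with FFmpeg
    Path("out"),               # base dir; clips land in out/clips/
    "clips",
    min_segment_duration=0.5,
    max_segment_duration=10.0,
    force_silence_only=True,   # skip pyannote entirely
    silence_thresh=-40,
    min_silence_len=500,
)
print(info.get("method"), len(clip_paths))  # "silence-detection", number of clips
```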
config.yaml CHANGED

```diff
@@ -55,8 +55,11 @@ audio_processing:
 
   diarization:
     enabled: true
+    force_silence_only: true      # Use silence-based segmentation (no pyannote)
     min_segment_duration: 0.5     # en segundos (clips cortos)
     max_segment_duration: 10.0
+    silence_thresh: -40           # dBFS threshold for silence detection
+    min_silence_len: 500          # milliseconds
 
   enable_voice_embeddings: true   # SpeechBrain ECAPA
   speaker_embedding:
```
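The three new keys sit under `audio_processing.diarization`, matching the `diar_cfg.get(...)` reads added to `process_audio_for_video`. A short sketch of loading them with PyYAML (the config path is assumed):

```python
# Sketch, assuming PyYAML and the repo's config.yaml; mirrors the
# diar_cfg.get(...) defaults added in this commit.
import yaml

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)

diar_cfg = cfg.get("audio_processing", {}).get("diarization", {})
force_silence = bool(diar_cfg.get("force_silence_only", True))
silence_thresh = int(diar_cfg.get("silence_thresh", -40))    # dBFS
min_silence_len = int(diar_cfg.get("min_silence_len", 500))  # ms
```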