#!/usr/bin/env python3 """ Gradio English-accent detector (N random × 8-second slices) """ from __future__ import annotations import argparse, random, tempfile from collections import Counter from pathlib import Path import torch import torchaudio import gradio as gr from speechbrain.inference.classifiers import EncoderClassifier from yt_dlp import YoutubeDL # ─────────────── Model setup ─────────────── ACCENT_MODEL_ID = "Jzuluaga/accent-id-commonaccent_ecapa" LANG_MODEL_ID = "speechbrain/lang-id-voxlingua107-ecapa" # Force CPU DEVICE = "cpu" accent_clf = EncoderClassifier.from_hparams( source=ACCENT_MODEL_ID, run_opts={"device": DEVICE} ) lang_clf = EncoderClassifier.from_hparams( source=LANG_MODEL_ID, run_opts={"device": DEVICE} ) # ─────────────── Helpers ─────────────── def sec_to_hms(sec: int) -> str: h = sec // 3600 m = (sec % 3600) // 60 s = sec % 60 return f"{h:02d}:{m:02d}:{s:02d}" def download_audio(url: str, out_path: Path) -> Path: """ Download best audio only via yt_dlp Python API. Returns the actual file saved (could be .m4a, .webm, etc.). """ opts = { "format": "bestaudio/best", "outtmpl": str(out_path.with_suffix(".%(ext)s")), "postprocessors": [], "quiet": True, } with YoutubeDL(opts) as ydl: info = ydl.extract_info(url, download=True) filename = ydl.prepare_filename(info) return Path(filename) def extract_wav(src: Path, dst: Path, start: int, dur: int = 8) -> None: target_sr = 16000 offset = start * target_sr frames = dur * target_sr wav, orig_sr = torchaudio.load(str(src), frame_offset=offset, num_frames=frames) if orig_sr != target_sr: wav = torchaudio.transforms.Resample(orig_sr, target_sr)(wav) torchaudio.save(str(dst), wav, target_sr, encoding="PCM_S", bits_per_sample=16) def pick_random_offsets(total_s: int, n: int) -> list[int]: max_start = total_s - 8 pool = list(range(max_start + 1)) if n > len(pool): n = len(pool) return random.sample(pool, n) # ─────────────── Classification ─────────────── def classify_language(wav: Path) -> tuple[str, float]: sig = lang_clf.load_audio(str(wav)) _, log_p, _, label = lang_clf.classify_batch(sig) return label[0], float(log_p.exp().item()) * 100 def classify_accent(wav: Path) -> tuple[str, float]: sig = accent_clf.load_audio(str(wav)) _, log_p, _, label = accent_clf.classify_batch(sig) return label[0], float(log_p.item()) * 100 # ─────────────── Core pipeline ─────────────── def analyse(url: str, n_samples: int) -> dict: if not url: return {"error": "Please provide a video URL."} if n_samples < 1: return {"error": "Number of samples must be at least 1."} with tempfile.TemporaryDirectory() as td: td = Path(td) # 1) Download audio audio_file = td / "audio" audio_file = download_audio(url, audio_file) # 2) Read metadata for total seconds info = torchaudio.info(str(audio_file)) total_s = int(info.num_frames / info.sample_rate) if total_s < 8: return {"error": "Audio shorter than 8 seconds."} # 3) Language check on middle slice mid_start = max(0, total_s // 2 - 4) lang_wav = td / "lang_check.wav" extract_wav(audio_file, lang_wav, start=mid_start) lang, lang_conf = classify_language(lang_wav) if not lang.lower().startswith("en"): return { "language": lang, "language_confidence": round(lang_conf, 1), "summary": f"Detected {lang} ({lang_conf:.1f}%). Accent skipped." } # 4) Random accent slices offsets = pick_random_offsets(total_s, n_samples) per_clip = [] for i, start in enumerate(sorted(offsets)): clip_wav = td / f"clip_{i}.wav" extract_wav(audio_file, clip_wav, start=start) acc, conf = classify_accent(clip_wav) per_clip.append({ "clip": i, "start": sec_to_hms(start), "end": sec_to_hms(start + 8), "accent": acc, "confidence": round(conf, 1), }) # 5) Majority-vote aggregation labels = [c["accent"] for c in per_clip] majority, count = Counter(labels).most_common(1)[0] avg_conf = round( sum(c["confidence"] for c in per_clip if c["accent"] == majority) / count, 1 ) return { "language": "English", "language_confidence": round(lang_conf, 1), "accent_overall": majority, "overall_confidence_avg": avg_conf, "per_clip": per_clip, "summary": ( f"English detected. Overall accent = {majority} " f"(≈{avg_conf}% on {count}/{n_samples} slices)." ) } # ─────────────── Gradio UI ─────────────── def app(): with gr.Blocks(title="Random-slice English Accent Classifier") as demo: gr.Markdown( "### 🎙️ English-Accent Detector (random 8-s slices)\n" "- Paste a public video/YouTube/Loom URL\n" "- Choose how many random 8 s samples to analyse" ) url = gr.Text(label="Video URL (public)") nclip = gr.Slider(1, 10, value=4, step=1, label="Number of random 8 s samples") btn = gr.Button("Analyse") out = gr.JSON(label="Result") btn.click(analyse, inputs=[url, nclip], outputs=out) return demo if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--share", action="store_true", help="Enable public share link") args = parser.parse_args() app().launch(share=args.share)