karesaeedff commited on
Commit
f245e46
·
verified ·
1 Parent(s): cdce1be

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +9 -13
  2. app.py +87 -0
  3. requirements.txt +8 -0
README.md CHANGED
@@ -1,13 +1,9 @@
1
- ---
2
- title: Singing Segment Detector
3
- emoji: 🚀
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 5.49.1
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # 🎤 Singing Segment Detector
2
+
3
+ 这是一个基于 Hugging Face + Gradio 的 AI 工具,
4
+ 可以自动识别长音频中的唱歌片段并输出时间戳。
5
+
6
+ ### 使用方法
7
+ 1. 上传从视频中提取的音频(例如 ffmpeg 抽取的 WAV)
8
+ 2. 点击“开始分析”
9
+ 3. 查看唱歌片段时间戳 JSON
 
 
 
 
app.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
+ from transformers import pipeline
6
+ from tqdm import tqdm
7
+ import tempfile
8
+ import json
9
+ import soundfile as sf
10
+
11
# ==== Parameters ====
SAMPLE_RATE = 8000  # downsample to save compute
                    # NOTE(review): most pretrained audio models expect 16 kHz input —
                    # confirm 8 kHz does not degrade classification accuracy.
WINDOW = 5          # length of each analysis window (seconds)
STEP = 2            # hop between consecutive windows (seconds); < WINDOW, so windows overlap
MUSIC_THRESHOLD = 0.4   # minimum "music" score for a window to count as singing
VOICE_THRESHOLD = 0.3   # minimum "speech"/voice score for a window to count as singing
MIN_SING_DURATION = 8   # merged segments shorter than this (seconds) are discarded

# ==== Model initialization (weights are downloaded on first run) ====
music_pipe = pipeline("audio-classification", model="AI-Music-Detection/ai_music_detection_large_60s")
# NOTE(review): "superb/hubert-large-superb-sid" is a speaker-identification model;
# its output labels are speaker IDs, so filtering for 'speech' in the label (done in
# detect_singing) may never match anything — verify this model choice.
voice_pipe = pipeline("audio-classification", model="superb/hubert-large-superb-sid")
22
+
23
def detect_singing(audio_path):
    """Detect singing segments in an audio file and return their timestamps.

    Slides a WINDOW-second window over the audio in STEP-second hops,
    scores each window with the music and voice classification pipelines,
    keeps windows that pass both thresholds, merges overlapping/adjacent
    windows, and drops segments shorter than MIN_SING_DURATION.

    Args:
        audio_path: path to an audio file readable by librosa.

    Returns:
        List of ``(start, end)`` tuples in seconds (floats); empty list
        when nothing passes both thresholds.
    """
    wav, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
    duration = len(wav) / SAMPLE_RATE

    # Window start offsets. Unlike `np.arange(0, duration - WINDOW, STEP)`,
    # this still yields one (shorter) window for clips under WINDOW seconds,
    # which previously produced no analysis windows at all.
    if duration <= WINDOW:
        starts = [0.0]
    else:
        starts = np.arange(0, duration - WINDOW, STEP)

    hits = []
    for start in starts:
        end = min(start + WINDOW, duration)
        snippet = wav[int(start * SAMPLE_RATE):int(end * SAMPLE_RATE)]

        # The audio-classification pipeline accepts a raw numpy array only as
        # a dict carrying its sampling rate; `pipe(snippet, sampling_rate=...)`
        # is not part of the pipeline __call__ signature and raises.
        sample = {"raw": snippet, "sampling_rate": SAMPLE_RATE}

        # Probability that this window contains music.
        music_pred = music_pipe(sample)
        music_score = max([p["score"] for p in music_pred if "music" in p["label"].lower()] or [0])

        # Probability that this window contains voice activity.
        # NOTE(review): depends on the voice model actually emitting a label
        # containing "speech" — see the note at model initialization.
        voice_pred = voice_pipe(sample)
        voice_score = max([p["score"] for p in voice_pred if "speech" in p["label"].lower()] or [0])

        if music_score > MUSIC_THRESHOLD and voice_score > VOICE_THRESHOLD:
            hits.append((float(start), float(end)))

    # Merge overlapping/adjacent windows (STEP < WINDOW, so hits overlap).
    # Using max() keeps the merge correct even if a later window ends
    # earlier than the current merged end.
    merged = []
    for seg_start, seg_end in hits:
        if not merged or seg_start > merged[-1][1]:
            merged.append([seg_start, seg_end])
        else:
            merged[-1][1] = max(merged[-1][1], seg_end)

    # Discard fragments too short to be a real singing passage.
    return [(s, e) for s, e in merged if e - s >= MIN_SING_DURATION]
55
+
56
+
57
def analyze_audio(file):
    """Gradio callback: run singing detection on an uploaded audio file.

    Args:
        file: filepath string supplied by ``gr.Audio(type="filepath")``,
            or ``None`` when nothing was uploaded. A file-like object with
            a ``.name`` attribute is also accepted for compatibility.

    Returns:
        ``(status_message, json_string)`` tuple; ``json_string`` is ``None``
        only when no file was provided.
    """
    import os

    if file is None:
        return "请上传音频文件", None

    # gr.Audio(type="filepath") delivers a plain path string; the previous
    # `file.name` access only worked for file-like objects and raised
    # AttributeError on strings.
    audio_path = file if isinstance(file, str) else file.name

    tmp_path = None
    try:
        # Re-encode the upload to a known-good WAV at the analysis rate.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
            data, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
            sf.write(tmp_path, data, sr)
        segments = detect_singing(tmp_path)
    finally:
        # delete=False means we own the temp file; remove it so repeated
        # analyses don't leak files in the temp directory.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    if not segments:
        return "未检测到明显唱歌片段", json.dumps([], indent=2)

    json_output = json.dumps(
        [{"start": s, "end": e, "duration": round(e - s, 2)} for s, e in segments],
        indent=2,
    )
    return f"检测到 {len(segments)} 段唱歌片段", json_output
76
+
77
+
78
# ==== Gradio UI ====
# Component creation order inside the Blocks context determines the page
# layout, so the statements below are order-sensitive.
with gr.Blocks(title="🎵 Singing Segment Detector") as demo:
    gr.Markdown("# 🎤 自动识别唱歌片段 (Hugging Face Space)\n上传音频文件,返回检测到的唱歌时间段 JSON。")
    # NOTE(review): type="filepath" should hand the callback a path string,
    # while analyze_audio reads `file.name` — verify these agree.
    audio_in = gr.Audio(type="filepath", label="上传音频文件(从视频提取后)")
    btn = gr.Button("开始分析")
    status = gr.Textbox(label="分析结果", interactive=False)
    json_out = gr.Code(label="唱歌片段时间戳(JSON)", language="json")
    # Wire the button: one audio input, two outputs (status text + JSON view).
    btn.click(fn=analyze_audio, inputs=[audio_in], outputs=[status, json_out])

# Start the Space's web server (blocking call).
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ librosa
3
+ torch
4
+ torchaudio
5
+ transformers
6
+ numpy
7
+ tqdm
8
+ soundfile