"""Audio classification and caption generation app.

Detects vocals with Silero VAD, tags and classifies audio with Whisper-AT,
optionally generates a caption via the OpenAI API, and serves a Gradio UI.
"""

import os
import sys
import subprocess
import tempfile
import argparse
import time
import re
import gradio as gr
from collections import defaultdict
from typing import Dict, List, Tuple, Optional

import numpy as np
# If NumPy 2.x is installed, force a downgrade to 1.x and restart the
# interpreter so the compatible version is the one actually loaded.
if hasattr(np, '__version__') and np.__version__.startswith('2.'):
    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy<2.0.0", "--force-reinstall"])
    os.execv(sys.executable, [sys.executable] + sys.argv)

import torch
import torchaudio
import openai
# Load environment variables from a local .env file when python-dotenv is available.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass

# whisper-at is heavy, so it is imported lazily inside classify_with_tagging().
whisper = None
# Tags that indicate the presence of a human voice.
VOCAL_TAGS = {
    "Singing", "Speech", "Choir", "Female singing", "Male singing",
    "Chant", "Yodeling", "Shout", "Bellow", "Rapping", "Narration",
    "Child singing", "Vocal music", "Opera", "A capella", "Voice",
    "Male speech, man speaking", "Female speech, woman speaking",
    "Child speech, kid speaking", "Conversation", "Narration, monologue",
    "Babbling", "Speech synthesizer", "Whoop", "Yell", "Battle cry",
    "Children shouting", "Screaming", "Whispering", "Mantra",
    "Synthetic singing", "Humming", "Whistling", "Beatboxing",
    "Gospel music", "Lullaby", "Groan", "Grunt"
}

# Tags treated as unambiguous evidence of spoken-word content.
DEFINITIVE_SPEECH_TAGS = {
    "Male speech, man speaking", "Female speech, woman speaking",
    "Child speech, kid speaking", "Conversation", "Narration, monologue"
}

# Tags that indicate musical instruments.
INSTRUMENTAL_TAGS = {
    "Piano", "Electric piano", "Keyboard (musical)", "Synthesizer", "Organ",
    "Electronic organ", "Harpsichord", "Guitar", "Bass guitar", "Drums", "Violin",
    "Trumpet", "Flute", "Saxophone", "Plucked string instrument", "Electric guitar",
    "Acoustic guitar", "Steel guitar, slide guitar", "Banjo", "Sitar", "Mandolin",
    "Ukulele", "Hammond organ", "Percussion", "Drum kit", "Drum machine", "Drum",
    "Snare drum", "Bass drum", "Timpani", "Tabla", "Cymbal", "Hi-hat", "Tambourine",
    "Marimba, xylophone", "Vibraphone", "Brass instrument", "French horn", "Trombone",
    "Bowed string instrument", "String section", "Violin, fiddle", "Cello", "Double bass",
    "Wind instrument, woodwind instrument", "Clarinet", "Harp", "Harmonica", "Accordion"
}
# Genre, style, and mood tags used to describe the track and seed the caption prompt.
GENRE_TAGS = {
    # Core genres
    "Pop music", "Rock music", "Jazz", "Classical music", "Electronic music",
    "Blues", "Country", "Folk music", "Reggae", "Funk", "Soul music",
    "Rhythm and blues", "Gospel music", "Opera", "Hip hop music",

    # Electronic styles
    "House music", "Techno", "Dubstep", "Drum and bass", "Electronica",
    "Electronic dance music", "Ambient music", "Trance music",

    # Rock styles
    "Heavy metal", "Punk rock", "Grunge", "Progressive rock", "Rock and roll",
    "Psychedelic rock",

    # World and traditional styles
    "Music of Latin America", "Salsa music", "Flamenco", "Music of Africa",
    "Afrobeat", "Music of Asia", "Carnatic music", "Music of Bollywood",
    "Middle Eastern music", "Traditional music",

    # Other styles and contexts
    "Swing music", "Bluegrass", "Ska", "Disco", "New-age music",
    "Independent music", "Christian music", "Soundtrack music",
    "Theme music", "Video game music", "Dance music", "Wedding music",
    "Christmas music", "Music for children",

    # Mood descriptors
    "Happy music", "Funny music", "Sad music", "Tender music",
    "Exciting music", "Angry music", "Scary music",

    # Vocal styles
    "A capella", "Vocal music", "Choir", "Chant", "Mantra", "Lullaby",
    "Beatboxing", "Rapping", "Yodeling"
}

class AudioCaptionGenerator:
    def __init__(self, api_key: str):
        """
        Initialize the caption generator with an OpenAI API key.

        Args:
            api_key (str): Your OpenAI API key
        """
        self.client = openai.OpenAI(api_key=api_key)

    def create_caption_prompt(self, classification: str, genres: List[str]) -> str:
        """
        Create a prompt for OpenAI to generate an audio caption.

        Args:
            classification (str): Type of audio (Speech/Vocal, Song, Instrumental)
            genres (List[str]): List of genre tags

        Returns:
            str: Formatted prompt for OpenAI
        """
        genre_list = ", ".join(genres) if genres else "Various"

        prompt = f"""Create a descriptive and engaging caption for an audio track with the following characteristics:
Classification: {classification}
Genres: {genre_list}
Please write a caption that:
1. Describes the audio in an engaging way
2. Incorporates the key genres naturally
3. Matches the classification type (vocal/instrumental/speech)
4. Is suitable for social media or music platforms
5. Is concise but descriptive (1-2 sentences)
Caption:"""

        return prompt
    def generate_caption(self, classification: str, genres: List[str],
                         model: str = "gpt-3.5-turbo", temperature: float = 0.7) -> Dict:
        """
        Generate a caption for the audio based on classification and genres.

        Args:
            classification (str): Audio classification
            genres (List[str]): List of detected genres
            model (str): OpenAI model to use
            temperature (float): Creativity level (0.0 to 1.0)

        Returns:
            Dict: Contains original data, prompt, and generated caption
        """
        try:
            prompt = self.create_caption_prompt(classification, genres)

            response = self.client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a creative music and audio content writer who creates engaging captions for audio tracks."},
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
                max_tokens=150
            )

            caption = response.choices[0].message.content.strip()

            return {
                "success": True,
                "classification": classification,
                "genres": genres,
                "prompt": prompt,
                "caption": caption,
                "model_used": model
            }

        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "classification": classification,
                "genres": genres,
                "caption": None
            }
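
# A minimal sketch of using AudioCaptionGenerator outside the Gradio UI; the
# environment variable name and the example tags are illustrative assumptions.
#
#   generator = AudioCaptionGenerator(api_key=os.environ["OPENAI_API_KEY"])
#   result = generator.generate_caption("Song", ["Pop music", "Electronic music"])
#   if result["success"]:
#       print(result["caption"])
#   else:
#       print("Caption generation failed:", result["error"])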

def load_vad_model():
    """Load the Silero VAD model and its helper utilities."""
    try:
        model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                      model='silero_vad',
                                      force_reload=False,
                                      trust_repo=True)
        # utils is a tuple; its first element is the get_speech_timestamps helper.
        get_speech_timestamps = utils[0]
        return model, get_speech_timestamps
    except Exception as e:
        print(f"Error loading VAD model: {e}")
        raise

def convert_audio_with_ffmpeg(input_path, output_path):
    """Convert an audio file to 16 kHz mono WAV using ffmpeg."""
    try:
        cmd = [
            'ffmpeg', '-i', input_path,
            '-ar', '16000',   # resample to 16 kHz
            '-ac', '1',       # downmix to mono
            '-y',             # overwrite the output file if it exists
            output_path
        ]
        subprocess.run(cmd, check=True, capture_output=True, text=True)
        return True
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        print(f"FFmpeg conversion failed: {e}")
        return False

def detect_vocals_vad(file_path, model, get_speech_timestamps):
    """Detect vocals using VAD and return detection results."""
    if not os.path.exists(file_path):
        return None, f"Error: Audio file '{file_path}' not found."

    waveform = None
    sample_rate = None
    temp_file = None

    try:
        # Try to load the file directly; fall back to an ffmpeg conversion if
        # torchaudio cannot decode the container or codec.
        try:
            waveform, sample_rate = torchaudio.load(file_path)
        except Exception as e1:
            print(f"Direct loading failed: {e1}")
            temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
            temp_file.close()

            if convert_audio_with_ffmpeg(file_path, temp_file.name):
                try:
                    waveform, sample_rate = torchaudio.load(temp_file.name)
                except Exception as e2:
                    return None, f"Error loading converted audio: {str(e2)}"
            else:
                return None, f"Failed to convert audio file: {str(e1)}"

        # Downmix multi-channel audio to mono.
        if len(waveform.shape) > 1 and waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Silero VAD expects 16 kHz input.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
            sample_rate = 16000

        # The VAD helper expects a 1-D tensor.
        if len(waveform.shape) > 1:
            waveform = waveform.squeeze()

        speech_timestamps = get_speech_timestamps(waveform, model, threshold=0.5, sampling_rate=16000)

        total_duration_seconds = len(waveform) / sample_rate
        speech_duration = sum([t['end'] - t['start'] for t in speech_timestamps]) / sample_rate
        speech_percentage = (speech_duration / total_duration_seconds) * 100 if total_duration_seconds > 0 else 0

        # Require more than 1% speech to avoid flagging brief noise as vocals.
        vocal_detected = len(speech_timestamps) > 0 and speech_percentage > 1.0

        return {
            'vocal_detected': vocal_detected,
            'speech_percentage': round(speech_percentage, 2),
            'total_duration': round(total_duration_seconds, 2),
            'speech_duration': round(speech_duration, 2)
        }, None

    except Exception as e:
        return None, f"Error processing audio: {str(e)}"

    finally:
        # Clean up the temporary WAV file created for the ffmpeg fallback.
        if temp_file and os.path.exists(temp_file.name):
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass
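
# A minimal sketch of running the VAD stage on its own; "example.mp3" is an
# illustrative path, not a file shipped with this script.
#
#   vad, get_ts = load_vad_model()
#   result, error = detect_vocals_vad("example.mp3", vad, get_ts)
#   if error is None:
#       print(result["vocal_detected"], result["speech_percentage"], "% speech")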

def extract_genre_tags(top_tags):
    """Extract genre tags from the detected top tags."""
    detected_genres = []
    for tag in top_tags:
        if tag in GENRE_TAGS:
            detected_genres.append(tag)
    return detected_genres


def extract_instrumental_tags(top_tags):
    """Extract instrumental tags from the detected top tags."""
    detected_instruments = []
    for tag in top_tags:
        if tag in INSTRUMENTAL_TAGS:
            detected_instruments.append(tag)
    return detected_instruments

def classify_audio_tags(top_tags):
    """Classify audio based on the detected tags."""
    # Definitive speech tags always win: the content counts as vocal even if
    # instruments are also detected.
    has_definitive_speech = any(tag in DEFINITIVE_SPEECH_TAGS for tag in top_tags)
    if has_definitive_speech:
        return "Vocal"

    has_vocal = any(tag in VOCAL_TAGS for tag in top_tags)
    has_instrumental = any(tag in INSTRUMENTAL_TAGS for tag in top_tags)

    if has_vocal and not has_instrumental:
        return "Vocal"
    elif has_instrumental and not has_vocal:
        return "Instrumental"
    elif has_vocal and has_instrumental:
        return "Song"
    else:
        return "Unknown"
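
# A small worked example of the rules above (the tag values are drawn from the
# sets defined earlier in this file):
#
#   classify_audio_tags(["Singing", "Electric guitar"])  # -> "Song"
#   classify_audio_tags(["Piano", "Drum kit"])           # -> "Instrumental"
#   classify_audio_tags(["Conversation", "Piano"])       # -> "Vocal" (definitive speech wins)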

def classify_with_tagging(audio_path, model_size="small"):
    """Classify audio using Whisper-AT tagging."""
    global whisper

    # Import whisper-at lazily so the rest of the app works without it.
    if whisper is None:
        try:
            import whisper_at as whisper
        except ImportError:
            return "Error: whisper-at not installed. Please install it first.", [], [], []

    audio_tagging_time_resolution = 4.8  # seconds per tagging segment

    try:
        model = whisper.load_model(model_size)
        result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)

        audio_tag_result = whisper.parse_at_label(
            result,
            language='en',
            top_k=15,
            p_threshold=-5
        )

        all_tags_set = set()
        tag_freq = defaultdict(int)

        # Count how many segments each tag appears in.
        for segment in audio_tag_result:
            for tag, score in segment['audio tags']:
                all_tags_set.add(tag)
                tag_freq[tag] += 1

        # Keep only tags that appear in more than one segment.
        top_tags = [tag for tag, freq in tag_freq.items() if freq > 1]

        genre_tags = extract_genre_tags(top_tags)
        instrumental_tags = extract_instrumental_tags(top_tags)

        classification = classify_audio_tags(top_tags)

        return classification, genre_tags, instrumental_tags, top_tags

    except Exception as e:
        return f"Error in tagging: {str(e)}", [], [], []
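
# A minimal sketch of the tagging stage on its own; "example.mp3" is an
# illustrative path and whisper-at is assumed to be installed.
#
#   label, genres, instruments, tags = classify_with_tagging("example.mp3", model_size="small")
#   print(label)        # e.g. "Song", "Vocal", "Instrumental", or "Unknown"
#   print(genres)       # genre tags such as ["Pop music"]
#   print(instruments)  # instrument tags such as ["Drum kit", "Bass guitar"]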

def is_vocal_classification(classification):
    """Check whether the classification indicates vocal/speech content."""
    vocal_keywords = ["vocal", "speech", "song"]
    return any(keyword in classification.lower() for keyword in vocal_keywords)


# Lazily-initialised Silero VAD model shared across requests.
vad_model = None
get_speech_timestamps = None


def initialize_models():
    """Initialize the VAD model once."""
    global vad_model, get_speech_timestamps
    if vad_model is None:
        try:
            print("Loading VAD model...")
            vad_model, get_speech_timestamps = load_vad_model()
            print("VAD model loaded successfully!")
        except Exception as e:
            raise RuntimeError(f"Error loading VAD model: {e}") from e

def process_audio_file(audio_file, model_size, vad_only, openai_key, generate_caption, caption_model):
    """Process an uploaded audio file and return the formatted results."""
    try:
        initialize_models()

        if audio_file is None:
            return "❌ Please upload an audio file.", "", "", "", ""

        audio_path = audio_file

        # Stage 1: voice activity detection.
        vad_result, vad_error = detect_vocals_vad(audio_path, vad_model, get_speech_timestamps)

        if vad_error:
            return f"❌ {vad_error}", "", "", "", ""

        vad_info = f"""📊 **VAD Analysis Results:**
• Vocal Detected: {'✅ Yes' if vad_result['vocal_detected'] else '❌ No'}
• Speech Percentage: {vad_result['speech_percentage']}%
• Total Duration: {vad_result['total_duration']} seconds
• Speech Duration: {vad_result['speech_duration']} seconds"""

        # Stage 2: classification. In VAD-only mode the decision comes from the
        # VAD result alone; otherwise Whisper-AT tagging refines it.
        if vad_only:
            if vad_result['vocal_detected']:
                final_classification = "🎤 Vocal (VAD detected)"
                reason = "VAD-only mode - vocals detected"
            else:
                final_classification = "🎵 Instrumental (No vocals detected by VAD)"
                reason = "VAD-only mode - no vocals detected"
            detected_genres = []
            detected_instruments = []
            all_detected_tags = []
        else:
            tag_classification, detected_genres, detected_instruments, all_detected_tags = classify_with_tagging(audio_path, model_size)

            if vad_result['vocal_detected']:
                final_classification = f"🎤 {tag_classification}"
                reason = "Vocals detected by VAD, classified using audio tagging"
            else:
                # The VAD result is treated as definitive: no speech means Instrumental.
                final_classification = "🎵 Instrumental"
                reason = "No vocals detected by VAD (definitive decision), with audio tagging analysis"

        classification_info = f"""🎯 **Final Classification:** {final_classification}
📝 **Reason:** {reason}"""

        # Format the genre list.
        if detected_genres:
            genre_info = "🎼 **Detected Genres/Styles:**\n"
            for genre in detected_genres:
                genre_info += f"• {genre}\n"
        else:
            genre_info = "🎼 **Detected Genres/Styles:** None detected"

        # Format the instrument list; for vocal tracks the instruments are
        # reported as accompaniment.
        if not is_vocal_classification(final_classification) or "Instrumental" in final_classification:
            if detected_instruments:
                instrument_info = "🎹 **Detected Instruments:**\n"
                for instrument in detected_instruments:
                    instrument_info += f"• {instrument}\n"
            else:
                instrument_info = "🎹 **Detected Instruments:** None detected"
        else:
            if detected_instruments:
                instrument_info = "🎹 **Detected Instruments (accompanying):**\n"
                for instrument in detected_instruments:
                    instrument_info += f"• {instrument}\n"
            else:
                instrument_info = ""

        # Stage 3: optional caption generation via the OpenAI API.
        caption_info = ""
        if generate_caption and openai_key:
            try:
                caption_generator = AudioCaptionGenerator(openai_key)
                caption_result = caption_generator.generate_caption(
                    final_classification.replace('🎤 ', '').replace('🎵 ', ''),
                    detected_genres,
                    model=caption_model
                )

                if caption_result["success"]:
                    caption_info = f"""✨ **Generated Caption:**
"{caption_result["caption"]}"
"""
                else:
                    caption_info = f"❌ **Caption generation failed:** {caption_result['error']}"

            except Exception as e:
                caption_info = f"❌ **Error generating caption:** {str(e)}"
        elif generate_caption and not openai_key:
            caption_info = "⚠️ **Caption generation skipped:** No OpenAI API key provided"

        return vad_info, classification_info, genre_info, instrument_info, caption_info

    except Exception as e:
        return f"❌ **Error processing audio:** {str(e)}", "", "", "", ""
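
# A minimal sketch of running the full pipeline without the Gradio UI; the file
# path and API key handling here are illustrative assumptions.
#
#   vad_md, cls_md, genre_md, inst_md, caption_md = process_audio_file(
#       "example.mp3", model_size="small", vad_only=False,
#       openai_key=os.environ.get("OPENAI_API_KEY", ""),
#       generate_caption=False, caption_model="gpt-3.5-turbo")
#   print(cls_md)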

def create_gradio_interface():
    """Create and configure the Gradio interface."""
    css = """
    .gradio-container {
        max-width: 1200px !important;
        margin: auto !important;
    }
    .result-box {
        border: 1px solid #333333;
        border-radius: 8px;
        padding: 15px;
        margin: 10px 0;
        background-color: #000000;
        color: #ffffff;
    }
    """

    with gr.Blocks(css=css, title="🎵 Audio Classification & Caption Generator") as demo:
        gr.Markdown("""
# 🎵 Audio Classification & Caption Generator

Upload an audio file to analyze its content and generate captions. This tool uses:
- **Silero VAD** for voice activity detection
- **Whisper-AT** for detailed audio tagging and classification
- **OpenAI GPT** for intelligent caption generation
        """)

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("## 📁 Upload & Settings")

                audio_input = gr.Audio(
                    label="Upload Audio File (WAV, MP3, FLAC, M4A, etc.)",
                    type="filepath"
                )

                with gr.Accordion("⚙️ Advanced Settings", open=False):
                    model_size = gr.Dropdown(
                        choices=['tiny', 'base', 'small', 'medium', 'large-v1'],
                        value='small',
                        label="Whisper Model Size (larger = more accurate but slower)"
                    )

                    vad_only = gr.Checkbox(
                        label="VAD Only Mode (skip detailed tagging)",
                        value=False
                    )

                gr.Markdown("## 🤖 Caption Generation")

                generate_caption = gr.Checkbox(
                    label="Generate AI Caption (requires OpenAI API key)",
                    value=False
                )

                openai_key = gr.Textbox(
                    label="OpenAI API Key (sk-...)",
                    type="password",
                    placeholder="Enter your OpenAI API key for caption generation"
                )

                caption_model = gr.Dropdown(
                    choices=['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo'],
                    value='gpt-3.5-turbo',
                    label="Caption Model",
                    info="OpenAI model for caption generation"
                )

                process_btn = gr.Button("🚀 Analyze Audio", variant="primary", size="lg")

            with gr.Column(scale=2):
                gr.Markdown("## 📊 Analysis Results")

                vad_output = gr.Markdown(
                    label="VAD Analysis",
                    elem_classes=["result-box"]
                )

                classification_output = gr.Markdown(
                    label="Classification",
                    elem_classes=["result-box"]
                )

                genre_output = gr.Markdown(
                    label="Genres",
                    elem_classes=["result-box"]
                )

                instrument_output = gr.Markdown(
                    label="Detected Instruments",
                    elem_classes=["result-box"]
                )

                caption_output = gr.Markdown(
                    label="Generated Caption",
                    elem_classes=["result-box"]
                )

        # Wire the analyze button to the processing function.
        process_btn.click(
            fn=process_audio_file,
            inputs=[
                audio_input,
                model_size,
                vad_only,
                openai_key,
                generate_caption,
                caption_model
            ],
            outputs=[
                vad_output,
                classification_output,
                genre_output,
                instrument_output,
                caption_output
            ]
        )

        gr.Markdown("""
## 💡 Tips
- **Supported formats:** WAV, MP3, FLAC, M4A, OGG, and more
- **Best results:** Use clear, high-quality audio files
- **Processing time:** Depends on file length and model size
- **OpenAI API:** Required for caption generation (get yours at [OpenAI](https://platform.openai.com/))
        """)

    return demo

def install_system_dependencies():
    """Install system dependencies (ffmpeg) if they are missing."""
    try:
        # Check whether ffmpeg is already available on PATH.
        subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Installing ffmpeg...")
        try:
            # Assumes a Debian/Ubuntu environment with sufficient privileges.
            subprocess.run(['apt-get', 'update'], check=True)
            subprocess.run(['apt-get', 'install', '-y', 'ffmpeg'], check=True)
        except Exception:
            print("Warning: Could not install ffmpeg. Audio conversion may fail.")

def main():
    """Install dependencies, build the UI, and launch the Gradio server."""
    install_system_dependencies()

    host = "0.0.0.0"
    port = 7860

    demo = create_gradio_interface()

    print("🚀 Starting Audio Classification Server...")
    print(f"📡 Server will run on http://{host}:{port}")

    demo.launch(
        server_name=host,
        server_port=port,
        share=False,
        show_error=True
    )


if __name__ == '__main__':
    main()