#!/usr/bin/env python3 import os import sys import subprocess import tempfile import argparse import time import re import gradio as gr from collections import defaultdict from typing import Dict, List, Tuple, Optional # Fix NumPy compatibility before importing torch/torchaudio import numpy as np if hasattr(np, '__version__') and np.__version__.startswith('2.'): # Downgrade numpy if version 2.x is detected subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy<2.0.0", "--force-reinstall"]) # Restart the process os.execv(sys.executable, [sys.executable] + sys.argv) import torch import torchaudio import openai # Try to load .env file if python-dotenv is available try: from dotenv import load_dotenv load_dotenv() except ImportError: pass # python-dotenv not installed, continue without it # Import whisper_at only when needed (for tagging) whisper = None # Define known vocal and instrumental tags for classification VOCAL_TAGS = { "Singing", "Speech", "Choir", "Female singing", "Male singing", "Chant", "Yodeling", "Shout", "Bellow", "Rapping", "Narration", "Child singing", "Vocal music", "Opera", "A capella", "Voice", "Male speech, man speaking", "Female speech, woman speaking", "Child speech, kid speaking", "Conversation", "Narration, monologue", "Babbling", "Speech synthesizer", "Whoop", "Yell", "Battle cry", "Children shouting", "Screaming", "Whispering", "Mantra", "Synthetic singing", "Humming", "Whistling", "Beatboxing", "Gospel music", "Lullaby", "Groan", "Grunt" } # Definitive speech tags that guarantee vocal classification DEFINITIVE_SPEECH_TAGS = { "Male speech, man speaking", "Female speech, woman speaking", "Child speech, kid speaking", "Conversation", "Narration, monologue" } INSTRUMENTAL_TAGS = { "Piano", "Electric piano", "Keyboard (musical)", "Synthesizer", "Organ", "Electronic organ", "Harpsichord", "Guitar", "Bass guitar", "Drums", "Violin", "Trumpet", "Flute", "Saxophone", "Plucked string instrument", "Electric guitar", "Acoustic guitar", "Steel guitar, slide guitar", "Banjo", "Sitar", "Mandolin", "Ukulele", "Hammond organ", "Percussion", "Drum kit", "Drum machine", "Drum", "Snare drum", "Bass drum", "Timpani", "Tabla", "Cymbal", "Hi-hat", "Tambourine", "Marimba, xylophone", "Vibraphone", "Brass instrument", "French horn", "Trombone", "Bowed string instrument", "String section", "Violin, fiddle", "Cello", "Double bass", "Wind instrument, woodwind instrument", "Clarinet", "Harp", "Harmonica", "Accordion" } # Genre tags for fancy music classification GENRE_TAGS = { # Main genres "Pop music", "Rock music", "Jazz", "Classical music", "Electronic music", "Blues", "Country", "Folk music", "Reggae", "Funk", "Soul music", "Rhythm and blues", "Gospel music", "Opera", "Hip hop music", # Electronic subgenres "House music", "Techno", "Dubstep", "Drum and bass", "Electronica", "Electronic dance music", "Ambient music", "Trance music", # Rock subgenres "Heavy metal", "Punk rock", "Grunge", "Progressive rock", "Rock and roll", "Psychedelic rock", # World music "Music of Latin America", "Salsa music", "Flamenco", "Music of Africa", "Afrobeat", "Music of Asia", "Carnatic music", "Music of Bollywood", "Middle Eastern music", "Traditional music", # Other genres "Swing music", "Bluegrass", "Ska", "Disco", "New-age music", "Independent music", "Christian music", "Soundtrack music", "Theme music", "Video game music", "Dance music", "Wedding music", "Christmas music", "Music for children", # Mood/style tags "Happy music", "Funny music", "Sad music", "Tender music", "Exciting music", "Angry music", "Scary music", # Vocal styles "A capella", "Vocal music", "Choir", "Chant", "Mantra", "Lullaby", "Beatboxing", "Rapping", "Yodeling" } class AudioCaptionGenerator: def __init__(self, api_key: str): """ Initialize the caption generator with OpenAI API key Args: api_key (str): Your OpenAI API key """ self.client = openai.OpenAI(api_key=api_key) def create_caption_prompt(self, classification: str, genres: List[str]) -> str: """ Create a prompt for OpenAI to generate an audio caption Args: classification (str): Type of audio (Speech/Vocal, Song, Instrumental) genres (List[str]): List of genre tags Returns: str: Formatted prompt for OpenAI """ genre_list = ", ".join(genres) if genres else "Various" prompt = f"""Create a descriptive and engaging caption for an audio track with the following characteristics: Classification: {classification} Genres: {genre_list} Please write a caption that: 1. Describes the audio in an engaging way 2. Incorporates the key genres naturally 3. Matches the classification type (vocal/instrumental/speech) 4. Is suitable for social media or music platforms 5. Is concise but descriptive (1-2 sentences) Caption:""" return prompt def generate_caption(self, classification: str, genres: List[str], model: str = "gpt-3.5-turbo", temperature: float = 0.7) -> Dict: """ Generate a caption for the audio based on classification and genres Args: classification (str): Audio classification genres (List[str]): List of detected genres model (str): OpenAI model to use temperature (float): Creativity level (0.0 to 1.0) Returns: Dict: Contains original data, prompt, and generated caption """ try: # Create prompt prompt = self.create_caption_prompt(classification, genres) # Generate caption using OpenAI response = self.client.chat.completions.create( model=model, messages=[ {"role": "system", "content": "You are a creative music and audio content writer who creates engaging captions for audio tracks."}, {"role": "user", "content": prompt} ], temperature=temperature, max_tokens=150 ) caption = response.choices[0].message.content.strip() return { "success": True, "classification": classification, "genres": genres, "prompt": prompt, "caption": caption, "model_used": model } except Exception as e: return { "success": False, "error": str(e), "classification": classification, "genres": genres, "caption": None } def load_vad_model(): """Load Silero VAD model""" try: model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False, trust_repo=True) get_speech_timestamps = utils[0] return model, get_speech_timestamps except Exception as e: print(f"Error loading VAD model: {e}") raise def convert_audio_with_ffmpeg(input_path, output_path): """Convert audio file to WAV format using ffmpeg""" try: cmd = [ 'ffmpeg', '-i', input_path, '-ar', '16000', # Set sample rate to 16kHz '-ac', '1', # Convert to mono '-y', # Overwrite output file output_path ] result = subprocess.run(cmd, check=True, capture_output=True, text=True) return True except (subprocess.CalledProcessError, FileNotFoundError) as e: print(f"FFmpeg conversion failed: {e}") return False def detect_vocals_vad(file_path, model, get_speech_timestamps): """Detect vocals using VAD and return detection results""" if not os.path.exists(file_path): return None, f"Error: Audio file '{file_path}' not found." waveform = None sample_rate = None temp_file = None try: # Try to load the audio file directly with torchaudio try: waveform, sample_rate = torchaudio.load(file_path) except Exception as e1: print(f"Direct loading failed: {e1}") # Try converting with ffmpeg temp_file = tempfile.NamedTemporaryFile(suffix='.wav', delete=False) temp_file.close() if convert_audio_with_ffmpeg(file_path, temp_file.name): try: waveform, sample_rate = torchaudio.load(temp_file.name) except Exception as e2: return None, f"Error loading converted audio: {str(e2)}" else: return None, f"Failed to convert audio file: {str(e1)}" # Convert to mono if stereo if len(waveform.shape) > 1 and waveform.shape[0] > 1: waveform = torch.mean(waveform, dim=0, keepdim=True) # Ensure sample rate is 16000 Hz as required by Silero VAD if sample_rate != 16000: resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000) waveform = resampler(waveform) sample_rate = 16000 # Flatten to 1D array if needed if len(waveform.shape) > 1: waveform = waveform.squeeze() # Get speech timestamps using Silero VAD speech_timestamps = get_speech_timestamps(waveform, model, threshold=0.5, sampling_rate=16000) # Calculate speech statistics total_duration_seconds = len(waveform) / sample_rate speech_duration = sum([t['end'] - t['start'] for t in speech_timestamps]) / sample_rate speech_percentage = (speech_duration / total_duration_seconds) * 100 if total_duration_seconds > 0 else 0 # Determine if vocal is detected vocal_detected = len(speech_timestamps) > 0 and speech_percentage > 1.0 return { 'vocal_detected': vocal_detected, 'speech_percentage': round(speech_percentage, 2), 'total_duration': round(total_duration_seconds, 2), 'speech_duration': round(speech_duration, 2) }, None except Exception as e: return None, f"Error processing audio: {str(e)}" finally: # Clean up temporary file if created if temp_file and os.path.exists(temp_file.name): try: os.unlink(temp_file.name) except: pass def extract_genre_tags(top_tags): """Extract genre tags from detected top tags""" detected_genres = [] for tag in top_tags: if tag in GENRE_TAGS: detected_genres.append(tag) return detected_genres def extract_instrumental_tags(top_tags): """Extract instrumental tags from detected top tags""" detected_instruments = [] for tag in top_tags: if tag in INSTRUMENTAL_TAGS: detected_instruments.append(tag) return detected_instruments def classify_audio_tags(top_tags): """Classify audio based on detected tags""" # Check for definitive speech tags first - if any are present, it's definitely vocal has_definitive_speech = any(tag in DEFINITIVE_SPEECH_TAGS for tag in top_tags) if has_definitive_speech: return "Vocal" # Regular classification logic as fallback has_vocal = any(tag in VOCAL_TAGS for tag in top_tags) has_instrumental = any(tag in INSTRUMENTAL_TAGS for tag in top_tags) if has_vocal and not has_instrumental: return "Vocal" elif has_instrumental and not has_vocal: return "Instrumental" elif has_vocal and has_instrumental: return "Song" else: return "Unknown" def classify_with_tagging(audio_path, model_size="small"): """Classify audio using Whisper-AT tagging""" global whisper # Import whisper_at when needed if whisper is None: try: import whisper_at as whisper except ImportError: return "Error: whisper-at not installed. Please install it first.", [], [], [] audio_tagging_time_resolution = 4.8 try: model = whisper.load_model(model_size) result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution) audio_tag_result = whisper.parse_at_label( result, language='en', top_k=15, p_threshold=-5 ) all_tags_set = set() tag_freq = defaultdict(int) for segment in audio_tag_result: # Update tag set and frequency for tag, score in segment['audio tags']: all_tags_set.add(tag) tag_freq[tag] += 1 # Find top tags (those that appear more than once) top_tags = [tag for tag, freq in tag_freq.items() if freq > 1] # Extract genre and instrumental tags genre_tags = extract_genre_tags(top_tags) instrumental_tags = extract_instrumental_tags(top_tags) classification = classify_audio_tags(top_tags) # Return all detected tags along with classification return classification, genre_tags, instrumental_tags, top_tags except Exception as e: return f"Error in tagging: {str(e)}", [], [], [] def is_vocal_classification(classification): """Check if the classification indicates vocal/speech content""" vocal_keywords = ["vocal", "speech", "song"] return any(keyword in classification.lower() for keyword in vocal_keywords) # Global variables for models vad_model = None get_speech_timestamps = None def initialize_models(): """Initialize VAD model once""" global vad_model, get_speech_timestamps if vad_model is None: try: print("Loading VAD model...") vad_model, get_speech_timestamps = load_vad_model() print("VAD model loaded successfully!") except Exception as e: raise Exception(f"Error loading VAD model: {e}") def process_audio_file(audio_file, model_size, vad_only, openai_key, generate_caption, caption_model): """Process audio file and return results""" try: # Initialize models if needed initialize_models() if audio_file is None: return "❌ Please upload an audio file.", "", "", "", "" audio_path = audio_file # Step 1: VAD Detection vad_result, vad_error = detect_vocals_vad(audio_path, vad_model, get_speech_timestamps) if vad_error: return f"❌ {vad_error}", "", "", "", "" vad_info = f"""📊 **VAD Analysis Results:** • Vocal Detected: {'✅ Yes' if vad_result['vocal_detected'] else '❌ No'} • Speech Percentage: {vad_result['speech_percentage']}% • Total Duration: {vad_result['total_duration']} seconds • Speech Duration: {vad_result['speech_duration']} seconds""" # Step 2: Classification Logic if vad_only: # VAD-only mode if vad_result['vocal_detected']: final_classification = "🎤 Vocal (VAD detected)" reason = "VAD-only mode - vocals detected" else: final_classification = "🎵 Instrumental (No vocals detected by VAD)" reason = "VAD-only mode - no vocals detected" detected_genres = [] detected_instruments = [] all_detected_tags = [] else: # ALWAYS run tagging for detailed classification (whether vocals detected or not) tag_classification, detected_genres, detected_instruments, all_detected_tags = classify_with_tagging(audio_path, model_size) # Use VAD as the definitive decision maker for vocal vs instrumental if vad_result['vocal_detected']: # VAD detected vocals - use tagging for detailed classification final_classification = f"🎤 {tag_classification}" reason = "Vocals detected by VAD, classified using audio tagging" else: # VAD detected no vocals - it's instrumental, but we still have tagging data final_classification = "🎵 Instrumental" reason = "No vocals detected by VAD (definitive decision), with audio tagging analysis" # Format classification info classification_info = f"""🎯 **Final Classification:** {final_classification} 📝 **Reason:** {reason}""" # Format genre information if detected_genres: genre_info = "🎭 **Detected Genres/Styles:**\n" for i, genre in enumerate(detected_genres, 1): genre_info += f"• {genre}\n" else: genre_info = "🎭 **Detected Genres/Styles:** None detected" # Format instrumental information - SHOW FOR ALL NON-VOCAL CONTENT if not is_vocal_classification(final_classification) or "Instrumental" in final_classification: # Show instrumental tags for instrumental content if detected_instruments: instrument_info = "🎹 **Detected Instruments:**\n" for i, instrument in enumerate(detected_instruments, 1): instrument_info += f"• {instrument}\n" else: instrument_info = "🎹 **Detected Instruments:** None detected" else: # For vocal content, still show if there are instruments detected if detected_instruments: instrument_info = "🎹 **Detected Instruments (accompanying):**\n" for i, instrument in enumerate(detected_instruments, 1): instrument_info += f"• {instrument}\n" else: instrument_info = "" # Generate caption if requested caption_info = "" if generate_caption and openai_key: try: caption_generator = AudioCaptionGenerator(openai_key) caption_result = caption_generator.generate_caption( final_classification.replace('🎤 ', '').replace('🎵 ', ''), detected_genres, model=caption_model ) if caption_result["success"]: caption_info = f"""✨ **Generated Caption:** "{caption_result["caption"]}" """ else: caption_info = f"❌ **Caption generation failed:** {caption_result['error']}" except Exception as e: caption_info = f"❌ **Error generating caption:** {str(e)}" elif generate_caption and not openai_key: caption_info = "⚠️ **Caption generation skipped:** No OpenAI API key provided" return vad_info, classification_info, genre_info, instrument_info, caption_info except Exception as e: return f"❌ **Error processing audio:** {str(e)}", "", "", "", "" def create_gradio_interface(): """Create and configure the Gradio interface""" # Custom CSS for better styling css = """ .gradio-container { max-width: 1200px !important; margin: auto !important; } .result-box { border: 1px solid #333333; border-radius: 8px; padding: 15px; margin: 10px 0; background-color: #000000; color: #ffffff; } """ with gr.Blocks(css=css, title="🎵 Audio Classification & Caption Generator") as demo: gr.Markdown(""" # 🎵 Audio Classification & Caption Generator Upload an audio file to analyze its content and generate captions. This tool uses: - **Silero VAD** for voice activity detection - **Whisper-AT** for detailed audio tagging and classification - **OpenAI GPT** for intelligent caption generation """) with gr.Row(): with gr.Column(scale=1): gr.Markdown("## 📁 Upload & Settings") audio_input = gr.Audio( label="Upload Audio File (WAV, MP3, FLAC, M4A, etc.)", type="filepath" ) with gr.Accordion("⚙️ Advanced Settings", open=False): model_size = gr.Dropdown( choices=['tiny', 'base', 'small', 'medium', 'large-v1'], value='small', label="Whisper Model Size (larger = more accurate but slower)" ) vad_only = gr.Checkbox( label="VAD Only Mode (skip detailed tagging)", value=False ) gr.Markdown("## 🤖 Caption Generation") generate_caption = gr.Checkbox( label="Generate AI Caption (requires OpenAI API key)", value=False # Default to False for public deployment ) openai_key = gr.Textbox( label="OpenAI API Key (sk-...)", type="password", placeholder="Enter your OpenAI API key for caption generation" ) caption_model = gr.Dropdown( choices=['gpt-3.5-turbo', 'gpt-4', 'gpt-4-turbo'], value='gpt-3.5-turbo', label="Caption Model", info="OpenAI model for caption generation" ) process_btn = gr.Button("🚀 Analyze Audio", variant="primary", size="lg") with gr.Column(scale=2): gr.Markdown("## 📊 Analysis Results") vad_output = gr.Markdown( label="VAD Analysis", elem_classes=["result-box"] ) classification_output = gr.Markdown( label="Classification", elem_classes=["result-box"] ) genre_output = gr.Markdown( label="Genres", elem_classes=["result-box"] ) instrument_output = gr.Markdown( label="Detected Instruments", elem_classes=["result-box"] ) caption_output = gr.Markdown( label="Generated Caption", elem_classes=["result-box"] ) # Event handlers process_btn.click( fn=process_audio_file, inputs=[ audio_input, model_size, vad_only, openai_key, generate_caption, caption_model ], outputs=[ vad_output, classification_output, genre_output, instrument_output, caption_output ] ) # Example files section gr.Markdown(""" ## 💡 Tips - **Supported formats:** WAV, MP3, FLAC, M4A, OGG, and more - **Best results:** Use clear, high-quality audio files - **Processing time:** Depends on file length and model size - **OpenAI API:** Required for caption generation (get yours at [OpenAI](https://platform.openai.com/)) """) return demo def install_system_dependencies(): """Install system dependencies if needed""" try: # Check if ffmpeg is available subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True) except (subprocess.CalledProcessError, FileNotFoundError): print("Installing ffmpeg...") try: subprocess.run(['apt-get', 'update'], check=True) subprocess.run(['apt-get', 'install', '-y', 'ffmpeg'], check=True) except: print("Warning: Could not install ffmpeg. Audio conversion may fail.") def main(): # Install system dependencies install_system_dependencies() # For Hugging Face deployment, we'll run on 0.0.0.0:7860 host = "0.0.0.0" port = 7860 # Create and launch the interface demo = create_gradio_interface() print("🚀 Starting Audio Classification Server...") print(f"📡 Server will run on http://{host}:{port}") demo.launch( server_name=host, server_port=port, share=False, # Don't create external share link on HF show_error=True ) if __name__ == '__main__': main()