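"""Gradio demo app: transcribe uploaded audio with Mistral AI's Voxtral-Small-24B-2507 model."""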
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import tempfile
from datetime import datetime
# Global variables for model and processor
processor = None
model = None
def load_model():
"""Load the Voxtral model and processor"""
global processor, model
if processor is not None and model is not None:
return processor, model
try:
model_name = "mistralai/Voxtral-Small-24B-2507"
print("Loading Voxtral model... This may take a few minutes.")
processor = AutoProcessor.from_pretrained(model_name)
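        # NOTE: this assumes AutoModelForSpeechSeq2Seq can dispatch Voxtral; depending on
        # your transformers version, the dedicated VoxtralForConditionalGeneration class
        # may be required instead.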
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )

        print("Model loaded successfully!")
        return processor, model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None, None
def transcribe_audio(audio_file):
"""Process audio file and return transcription"""
if audio_file is None:
return "Please upload an audio file.", "", ""
try:
# Load model if not already loaded
global processor, model
if processor is None or model is None:
processor, model = load_model()
if processor is None or model is None:
return "Error: Model failed to load. Please try again.", "", ""
# Load audio file
if isinstance(audio_file, str):
# If it's a file path
audio, sample_rate = librosa.load(audio_file, sr=16000)
else:
# If it's uploaded file data
audio, sample_rate = librosa.load(audio_file.name, sr=16000)
# Calculate duration
duration = len(audio) / sample_rate
# Process with the model
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
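        # NOTE: passing a raw waveform assumes a Whisper-style processor interface;
        # Mistral's published Voxtral examples build inputs through a transcription-
        # request helper instead, so adjust this call if your processor rejects it.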
        # Move inputs to the same device as the model
        if torch.cuda.is_available():
            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

        with torch.no_grad():
            predicted_ids = model.generate(**inputs, max_length=512)

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Generate file info
        word_count = len(transcription.split())
        file_info = f"Duration: {duration:.2f} seconds | Words: {word_count} | Processed: {datetime.now().strftime('%H:%M:%S')}"

        return transcription, file_info, transcription  # Return transcription twice for download
    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, "", ""
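
# Example (illustrative): the function can also be called directly with a local file,
# assuming something like "sample.wav" exists on disk:
#   text, info, _ = transcribe_audio("sample.wav")
#   print(text)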
def clear_inputs():
"""Clear all inputs and outputs"""
return None, "", "", ""
# Custom CSS for better styling
css = """
.gradio-container {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.main-header {
text-align: center;
color: #2d5aa0;
margin-bottom: 20px;
}
.info-box {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 20px;
border-radius: 10px;
margin: 10px 0;
}
.result-box {
background-color: #f8f9fa;
border: 1px solid #e9ecef;
border-radius: 8px;
padding: 15px;
margin: 10px 0;
}
"""
# Create the Gradio interface
def create_interface():
    with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:
        # Header
        gr.Markdown(
            """
            # 🎤 Voxtral-Small-24B Speech Recognition
            Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
            """,
            elem_classes=["main-header"]
        )

        # Model info
        with gr.Accordion("ℹ️ About this model", open=False):
            gr.Markdown(
                """
                **Voxtral-Small-24B-2507** is a speech-to-text model developed by Mistral AI.

                - **Model**: mistralai/Voxtral-Small-24B-2507
                - **Type**: Speech-to-text transcription
                - **Developer**: Mistral AI
                - **Use Case**: Audio transcription and speech recognition
                - **Supported Formats**: WAV, MP3, FLAC, M4A, OGG

                💡 **Tip**: For best results, use clear audio files with minimal background noise.
                """
            )

        with gr.Row():
            with gr.Column(scale=1):
                # Audio input
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )

                # Control buttons
                with gr.Row():
                    transcribe_btn = gr.Button(
                        "🚀 Transcribe Audio",
                        variant="primary",
                        size="lg"
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear",
                        variant="secondary"
                    )

            with gr.Column(scale=1):
                # Results
                transcription_output = gr.Textbox(
                    label="📝 Transcription Result",
                    lines=8,
                    max_lines=15,
                    placeholder="Transcribed text will appear here...",
                    show_copy_button=True
                )

                # File info
                info_output = gr.Textbox(
                    label="📊 Audio Information",
                    lines=1,
                    placeholder="Audio details will appear here..."
                )

                # Download option
                download_file = gr.File(
                    label="💾 Download Transcription",
                    visible=False
                )

        # Hidden textbox for file content (for download)
        hidden_text = gr.Textbox(visible=False)

        # Event handlers
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[transcription_output, info_output, hidden_text],
            show_progress=True
        )

        # Update the download file when transcription is complete
        def update_download(text_content):
            if text_content and text_content.strip():
                # Write the transcription to a temporary .txt file
                temp_file = tempfile.NamedTemporaryFile(
                    mode='w',
                    delete=False,
                    suffix='.txt',
                    prefix='transcription_'
                )
                temp_file.write(text_content)
                temp_file.close()
                return gr.File(value=temp_file.name, visible=True)
            else:
                return gr.File(visible=False)
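
        # The transcribe click fills hidden_text, and its change event below turns the
        # text into a downloadable file. The temp files written above are never deleted;
        # a long-running deployment may want to clean them up periodically.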
        hidden_text.change(
            fn=update_download,
            inputs=[hidden_text],
            outputs=[download_file]
        )

        clear_btn.click(
            fn=clear_inputs,
            outputs=[audio_input, transcription_output, info_output, hidden_text]
        )

        # Footer
        gr.Markdown(
            """
            ---
            ### 🛠️ Usage Instructions:
            1. **Upload**: Click on the audio input area to upload a file or use your microphone
            2. **Transcribe**: Click the "Transcribe Audio" button to process your audio
            3. **Results**: View your transcription in the text area on the right
            4. **Download**: Use the download button to save your transcription as a text file

            **Supported formats**: WAV, MP3, FLAC, M4A, OGG
            """
        )

    return demo
# Initialize and launch the app
if __name__ == "__main__":
    # Pre-load the model when the app starts
    print("Initializing Voxtral model...")
    load_model()

    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860
    )