import os
import tempfile
from datetime import datetime

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq


# Module-level cache for the loaded processor/model pair.
# Populated by load_model(); None means "not loaded yet".
processor = None
model = None


def load_model():
    """Load the Voxtral processor and model, caching them in module globals.

    Returns:
        A ``(processor, model)`` tuple. When both globals are already set they
        are returned as-is without reloading. If loading raises, the error is
        printed and ``(None, None)`` is returned.
    """
    global processor, model

    # Reuse the cached pair so the checkpoint is loaded at most once.
    if processor is not None and model is not None:
        return processor, model

    model_name = "mistralai/Voxtral-Small-24B-2507"

    try:
        print("Loading Voxtral model... This may take a few minutes.")
        # Load into locals first: the globals are only assigned once BOTH
        # objects exist, so a failure never leaves a half-initialised cache.
        loaded_processor = AutoProcessor.from_pretrained(model_name)
        # NOTE(review): confirm this Auto class resolves the Voxtral
        # architecture in the installed transformers version (the model card
        # appears to use VoxtralForConditionalGeneration instead).
        # NOTE(review): trust_remote_code=True permits running code shipped
        # with the model repository -- confirm this is intended.
        loaded_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
    except Exception as e:
        # Application boundary: report the problem and signal failure to the
        # caller through the (None, None) return value instead of raising.
        print(f"Error loading model: {e}")
        return None, None

    processor, model = loaded_processor, loaded_model
    print("Model loaded successfully!")
    return processor, model


def transcribe_audio(audio_file):
    """Transcribe an audio file and return the text plus summary info.

    Args:
        audio_file: Path to the audio file as a string, or an object exposing
            the path through a ``name`` attribute. ``None`` means no file was
            provided.

    Returns:
        A 3-tuple ``(transcription, file_info, transcription)``. The text is
        returned twice because the UI wires it to both the visible result box
        and the hidden field that drives the download file. On any failure the
        tuple is ``(message, "", "")``.
    """
    global processor, model

    if audio_file is None:
        return "Please upload an audio file.", "", ""

    try:
        # Lazy load in case the start-up load was skipped or failed.
        if processor is None or model is None:
            processor, model = load_model()

        if processor is None or model is None:
            return "Error: Model failed to load. Please try again.", "", ""

        audio_path = audio_file if isinstance(audio_file, str) else audio_file.name

        # librosa resamples to the requested rate, so sample_rate is 16000.
        audio, sample_rate = librosa.load(audio_path, sr=16000)
        if len(audio) == 0:
            return "Error processing audio: the file contains no audio samples.", "", ""

        duration = len(audio) / sample_rate

        inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")

        # Put every tensor on the device the model actually lives on (rather
        # than assuming CUDA) and cast floating-point features to the model's
        # dtype; the model is loaded in float16 while the processor emits
        # float32, which would otherwise raise a dtype mismatch.
        target_device, target_dtype = model.device, model.dtype
        model_inputs = {}
        for key, value in inputs.items():
            if isinstance(value, torch.Tensor):
                value = value.to(target_device)
                if value.is_floating_point():
                    value = value.to(target_dtype)
            model_inputs[key] = value

        with torch.no_grad():
            predicted_ids = model.generate(**model_inputs, max_length=512)

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        word_count = len(transcription.split())
        processed_at = datetime.now().strftime("%H:%M:%S")
        file_info = (
            f"Duration: {duration:.2f} seconds | Words: {word_count} | "
            f"Processed: {processed_at}"
        )

        return transcription, file_info, transcription

    except Exception as e:
        # UI boundary: surface the failure as text in the result box.
        error_msg = f"Error processing audio: {e}"
        print(error_msg)
        return error_msg, "", ""


def clear_inputs():
    """Return reset values for the four components wired to the Clear button.

    Returns:
        ``(None, "", "", "")``: ``None`` for the audio input, then empty
        strings for the transcription box, the info box and the hidden text.
    """
    return None, "", "", ""


# Custom stylesheet passed to gr.Blocks(css=...).
css = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.main-header {
    text-align: center;
    color: #2d5aa0;
    margin-bottom: 20px;
}

.info-box {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin: 10px 0;
}

.result-box {
    background-color: #f8f9fa;
    border: 1px solid #e9ecef;
    border-radius: 8px;
    padding: 15px;
    margin: 10px 0;
}
"""


def create_interface():
    """Build the Gradio Blocks UI for uploading and transcribing audio.

    The layout has a header, a collapsible "about" section, an input column
    (audio + Transcribe/Clear buttons), an output column (transcription, audio
    info, download file) and a usage footer.

    NOTE(review): the emoji in the labels were corrupted in the source this
    was restored from; a few were not recoverable and were chosen by context.
    Confirm them against the intended design.

    Returns:
        The constructed ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:
        gr.Markdown(
            """
            # 🎤 Voxtral-Small-24B Speech Recognition

            Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
            """,
            elem_classes=["main-header"],
        )

        with gr.Accordion("ℹ️ About this model", open=False):
            gr.Markdown(
                """
                **Voxtral-Small-24B-2507** is a speech-to-text model developed by Mistral AI.

                - **Model**: mistralai/Voxtral-Small-24B-2507
                - **Type**: Speech-to-Text Transformation
                - **Developer**: Mistral AI
                - **Use Case**: Audio transcription and speech recognition
                - **Supported Formats**: WAV, MP3, FLAC, M4A, OGG

                💡 **Tip**: For best results, use clear audio files with minimal background noise.
                """
            )

        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"],
                )

                with gr.Row():
                    transcribe_btn = gr.Button(
                        "🔄 Transcribe Audio",
                        variant="primary",
                        size="lg",
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear",
                        variant="secondary",
                    )

            with gr.Column(scale=1):
                transcription_output = gr.Textbox(
                    label="📝 Transcription Result",
                    lines=8,
                    max_lines=15,
                    placeholder="Transcribed text will appear here...",
                    show_copy_button=True,
                )

                info_output = gr.Textbox(
                    label="📊 Audio Information",
                    lines=1,
                    placeholder="Audio details will appear here...",
                )

                # Hidden until there is a transcription to offer.
                download_file = gr.File(
                    label="💾 Download Transcription",
                    visible=False,
                )

                # Invisible carrier for the transcription text; its change
                # event drives the download file below.
                hidden_text = gr.Textbox(visible=False)

        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[transcription_output, info_output, hidden_text],
            show_progress="full",
        )

        def update_download(text_content):
            """Write the transcription to a .txt file and reveal the download.

            Returns a hidden ``gr.File`` when the text is empty or whitespace.
            """
            if not (text_content and text_content.strip()):
                return gr.File(visible=False)

            # delete=False: the file has to outlive this function so it can
            # still be served for download. UTF-8 is set explicitly so
            # non-ASCII transcriptions do not depend on the platform default.
            with tempfile.NamedTemporaryFile(
                mode="w",
                encoding="utf-8",
                delete=False,
                suffix=".txt",
                prefix="transcription_",
            ) as temp_file:
                temp_file.write(text_content)

            return gr.File(value=temp_file.name, visible=True)

        hidden_text.change(
            fn=update_download,
            inputs=[hidden_text],
            outputs=[download_file],
        )

        clear_btn.click(
            fn=clear_inputs,
            outputs=[audio_input, transcription_output, info_output, hidden_text],
        )

        gr.Markdown(
            """
            ---

            ### 🛠️ Usage Instructions:
            1. **Upload**: Click on the audio input area to upload a file or use your microphone
            2. **Transcribe**: Click the "Transcribe Audio" button to process your audio
            3. **Results**: View your transcription in the text area on the right
            4. **Download**: Use the download button to save your transcription as a text file

            **Supported formats**: WAV, MP3, FLAC, M4A, OGG
            """
        )

    return demo


if __name__ == "__main__":
    # Warm the model cache before serving. The return value is not checked:
    # if loading fails here, transcribe_audio() retries on the first request.
    print("Initializing Voxtral model...")
    load_model()

    demo = create_interface()
    demo.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )