Spaces:

ravinder024
/

testspace-for-voice-to-text

Running

App Files Files Community

testspace-for-voice-to-text / app.py

ravinder024

v1 app.py added

db214be verified about 2 months ago

raw

history blame contribute delete

8.57 kB

	import gradio as gr
	import torch
	from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
	import librosa
	import numpy as np
	import os
	import tempfile
	from datetime import datetime

	# Module-level cache for the processor and model; both start out as None.
	processor, model = None, None

	def load_model():
	    """Load the Voxtral processor and model once and cache them in globals.

	    Returns:
	        tuple: ``(processor, model)``. The cached pair is returned when both
	        are already loaded. ``(None, None)`` is returned when loading raises.
	    """
	    global processor, model

	    # Reuse the cached pair instead of loading the checkpoint again.
	    if processor is not None and model is not None:
	        return processor, model

	    model_name = "mistralai/Voxtral-Small-24B-2507"

	    try:
	        print("Loading Voxtral model... This may take a few minutes.")
	        # Load into locals first so a failure part-way through cannot leave
	        # the globals in a mixed state (processor set, model still None).
	        loaded_processor = AutoProcessor.from_pretrained(model_name)
	        loaded_model = AutoModelForSpeechSeq2Seq.from_pretrained(
	            model_name,
	            torch_dtype=torch.float16,
	            device_map="auto",
	            low_cpu_mem_usage=True,
	            trust_remote_code=True,
	        )
	    except Exception as e:
	        # Best-effort boundary: report the failure and signal it to the
	        # caller with (None, None) rather than crashing the app.
	        print(f"Error loading model: {str(e)}")
	        return None, None

	    # Commit both objects together only after both loads succeeded.
	    processor, model = loaded_processor, loaded_model
	    print("Model loaded successfully!")
	    return processor, model

	def transcribe_audio(audio_file):
	    """Transcribe an audio file and return the text plus a summary line.

	    Args:
	        audio_file: Path to the audio file as a string, or an object whose
	            ``name`` attribute holds that path. ``None`` means no file was
	            provided.

	    Returns:
	        tuple: ``(transcription, file_info, download_text)``. On success the
	        first and third items are the same transcription. On failure the
	        first item is a message for the user and the other two are empty
	        strings.
	    """
	    global processor, model

	    if audio_file is None:
	        return "Please upload an audio file.", "", ""

	    try:
	        # Load the model lazily if it has not been loaded yet.
	        if processor is None or model is None:
	            processor, model = load_model()

	        if processor is None or model is None:
	            return "Error: Model failed to load. Please try again.", "", ""

	        # Accept either a plain path or an object exposing the path as .name.
	        audio_path = audio_file if isinstance(audio_file, str) else audio_file.name

	        # Request 16 kHz audio and hand the same rate to the processor.
	        target_rate = 16000
	        audio, sample_rate = librosa.load(audio_path, sr=target_rate)
	        duration = len(audio) / sample_rate

	        inputs = processor(audio, sampling_rate=target_rate, return_tensors="pt")

	        def _to_model(value):
	            """Place a tensor on the model's device, matching its float dtype."""
	            if not isinstance(value, torch.Tensor):
	                return value
	            if value.is_floating_point():
	                # The model is loaded in float16; float32 features would
	                # otherwise cause a dtype mismatch inside the model.
	                return value.to(device=model.device, dtype=model.dtype)
	            # Integer tensors (e.g. token ids) keep their dtype.
	            return value.to(model.device)

	        # Follow the model's actual device instead of assuming CUDA.
	        inputs = {key: _to_model(value) for key, value in inputs.items()}

	        with torch.no_grad():
	            predicted_ids = model.generate(**inputs, max_length=512)
	        transcription = processor.batch_decode(
	            predicted_ids, skip_special_tokens=True
	        )[0]

	        word_count = len(transcription.split())
	        processed_at = datetime.now().strftime('%H:%M:%S')
	        file_info = (
	            f"Duration: {duration:.2f} seconds | Words: {word_count} "
	            f"| Processed: {processed_at}"
	        )

	        # The transcription is returned twice: once for display and once for
	        # the value that feeds the download.
	        return transcription, file_info, transcription

	    except Exception as e:
	        # UI boundary: surface any failure as a message instead of raising.
	        error_msg = f"Error processing audio: {str(e)}"
	        print(error_msg)
	        return error_msg, "", ""

	def clear_inputs():
	    """Return the reset values: ``None`` followed by three empty strings."""
	    blank = ""
	    return (None, blank, blank, blank)

	# Stylesheet handed to gr.Blocks(css=...) in create_interface().
	# It sets the container font and defines three classes: .main-header
	# (centred blue heading), .info-box (gradient panel) and .result-box
	# (light bordered panel).
	# NOTE(review): only "main-header" is attached to a component in the code
	# visible here; .info-box and .result-box appear unused — confirm before
	# removing them.
	css = """
	.gradio-container {
	font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
	}

	.main-header {
	text-align: center;
	color: #2d5aa0;
	margin-bottom: 20px;
	}

	.info-box {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white;
	padding: 20px;
	border-radius: 10px;
	margin: 10px 0;
	}

	.result-box {
	background-color: #f8f9fa;
	border: 1px solid #e9ecef;
	border-radius: 8px;
	padding: 15px;
	margin: 10px 0;
	}
	"""

	# Create the Gradio interface
	def create_interface():
	    """Build the Gradio Blocks UI and wire its event handlers.

	    The layout has a header, a collapsible model description, an input
	    column (audio input plus Transcribe/Clear buttons), an output column
	    (transcription, audio info, download file) and a usage footer.

	    Returns:
	        The ``gr.Blocks`` instance; the caller is expected to launch it.
	    """
	    with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:

	        # Header
	        gr.Markdown(
	            """
	            # 🎤 Voxtral-Small-24B Speech Recognition

	            Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
	            """,
	            elem_classes=["main-header"]
	        )

	        # Model info
	        with gr.Accordion("ℹ️ About this model", open=False):
	            gr.Markdown(
	                """
	                Voxtral-Small-24B-2507 is a speech-to-text model developed by Mistral AI.

	                - Model: mistralai/Voxtral-Small-24B-2507
	                - Type: Speech-to-Text Transformation
	                - Developer: Mistral AI
	                - Use Case: Audio transcription and speech recognition
	                - Supported Formats: WAV, MP3, FLAC, M4A, OGG

	                💡 Tip: For best results, use clear audio files with minimal background noise.
	                """
	            )

	        with gr.Row():
	            with gr.Column(scale=1):
	                # Audio input
	                audio_input = gr.Audio(
	                    label="📁 Upload Audio File",
	                    type="filepath",
	                    sources=["upload", "microphone"]
	                )

	                # Control buttons
	                with gr.Row():
	                    transcribe_btn = gr.Button(
	                        "🚀 Transcribe Audio",
	                        variant="primary",
	                        size="lg"
	                    )
	                    clear_btn = gr.Button(
	                        "🗑️ Clear",
	                        variant="secondary"
	                    )

	            with gr.Column(scale=1):
	                # Results
	                transcription_output = gr.Textbox(
	                    label="📝 Transcription Result",
	                    lines=8,
	                    max_lines=15,
	                    placeholder="Transcribed text will appear here...",
	                    show_copy_button=True
	                )

	                # File info
	                info_output = gr.Textbox(
	                    label="📊 Audio Information",
	                    lines=1,
	                    placeholder="Audio details will appear here..."
	                )

	                # Download option; hidden until there is a transcription
	                download_file = gr.File(
	                    label="💾 Download Transcription",
	                    visible=False
	                )

	                # Hidden textbox holding the text to download; its change
	                # event drives the download file below.
	                hidden_text = gr.Textbox(visible=False)

	        # Event handlers
	        transcribe_btn.click(
	            fn=transcribe_audio,
	            inputs=[audio_input],
	            outputs=[transcription_output, info_output, hidden_text],
	            show_progress=True
	        )

	        def update_download(text_content):
	            """Write the transcription to a temp .txt file and expose it.

	            Returns a visible ``gr.File`` pointing at the new file, or a
	            hidden one when ``text_content`` is empty or whitespace only.
	            """
	            if not (text_content and text_content.strip()):
	                return gr.File(visible=False)

	            # delete=False keeps the file on disk after the handle closes so
	            # the path handed to gr.File still exists. UTF-8 is explicit so
	            # non-ASCII transcriptions do not depend on the platform default
	            # encoding.
	            with tempfile.NamedTemporaryFile(
	                mode='w',
	                delete=False,
	                suffix='.txt',
	                prefix='transcription_',
	                encoding='utf-8'
	            ) as temp_file:
	                temp_file.write(text_content)
	            return gr.File(value=temp_file.name, visible=True)

	        # Update download file when transcription is complete
	        hidden_text.change(
	            fn=update_download,
	            inputs=[hidden_text],
	            outputs=[download_file]
	        )

	        clear_btn.click(
	            fn=clear_inputs,
	            outputs=[audio_input, transcription_output, info_output, hidden_text]
	        )

	        # Footer
	        gr.Markdown(
	            """
	            ---

	            ### 🛠️ Usage Instructions:
	            1. Upload: Click on the audio input area to upload a file or use your microphone
	            2. Transcribe: Click the "Transcribe Audio" button to process your audio
	            3. Results: View your transcription in the text area on the right
	            4. Download: Use the download button to save your transcription as a text file

	            Supported formats: WAV, MP3, FLAC, M4A, OGG
	            """
	        )

	    return demo

	# Script entry point: load the model, then build and serve the UI.
	if __name__ == "__main__":
	    # Load the model before the interface is created and launched.
	    print("Initializing Voxtral model...")
	    load_model()

	    # Options passed to launch(): share link on, errors shown, all interfaces, port 7860.
	    launch_options = {
	        "share": True,
	        "show_error": True,
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	    }

	    demo = create_interface()
	    demo.launch(**launch_options)