Spaces:

mmdbes
/

Farsi_Voice_To_Text

Running

App Files Community

mmdbes committed on Aug 21

Commit

0b0f809

verified ·

1 Parent(s): 436c9ab

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -67

app.py CHANGED Viewed

@@ -1,79 +1,85 @@
 # app.py
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-import gradio as gr
 import os
-# --- 1. Model Configuration and Loading ---
-# This part runs only once when the app starts.
-print("--- Setting up for CPU ---")
-device = "cpu"
-torch_dtype = torch.float32  # Use float32 for CPU
-model_id = "vhdm/whisper-large-fa-v1"
-print("--- Loading model and processor ---")
-# Load the model and processor
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id,
-    torch_dtype=torch_dtype,
-    low_cpu_mem_usage=True,
-    use_safetensors=True  # Safetensors is generally preferred
-)
-processor = AutoProcessor.from_pretrained(model_id)
-# Create the pipeline
-print("--- Creating transcription pipeline ---")
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    max_new_tokens=128,
-    torch_dtype=torch_dtype,
-    device=device,
-)
-print("--- Setup complete. Gradio app is ready. ---")
-# --- 2. The Transcription Function ---
-# This function is called every time a user uploads a file.
-def transcribe_audio(audio_filepath):
-    """
-    Takes an audio file path, transcribes it, and returns the text.
-    """
-    if audio_filepath is None:
-        return "Please upload an audio file first."
-    print(f"Processing file: {audio_filepath}")
-    result = pipe(audio_filepath, return_timestamps=True)
-    transcription = result["text"]
-    print(f"Transcription result: {transcription}")
-    return transcription
-# --- 3. Gradio Web Interface ---
-# Define the title and description for the web app
-title = "Whisper Persian ASR 🇮🇷"
-description = """
-This is a demo for the `vhdm/whisper-large-fa-v1` model for automatic speech recognition (ASR) in Persian.
-<br>
-Upload your audio file (MP3, WAV, etc.) or record directly from your microphone and click 'Submit' to see the transcription.
-"""
-# Create the Gradio interface
-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="filepath", label="Upload or Record Persian Audio"),
-    outputs=gr.Textbox(label="Transcription Result"),
-    title=title,
-    description=description,
-    examples=[["example.wav"]] # Optional: add an example file
 )
-# Launch the app
-iface.launch()

 # app.py
+import streamlit as st
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import os
+# --- 1. Initial page setup and title ---
+# Page configuration is issued before any other Streamlit call in the script.
+st.set_page_config(layout="centered", page_icon="🇮🇷🎙️", page_title="Persian Whisper ASR")
+st.title("🇮🇷 اپلیکیشن تبدیل گفتار به نوشتار فارسی (Whisper)")
+# Short Markdown introduction naming the model and asking for an upload.
+st.markdown(
+    """
+این یک نسخه نمایشی برای مدل **`vhdm/whisper-large-fa-v1`** است.
+فایل صوتی خود را آپلود کنید تا متن آن را مشاهده نمایید.
+"""
+)
+# --- 2. Model loading (cached for speed) ---
+# The cache_resource decorator tells Streamlit to load the model only once.
+@st.cache_resource
+def load_model():
+    """Create the Persian Whisper speech-recognition pipeline on CPU.
+
+    Loads the ``vhdm/whisper-large-fa-v1`` checkpoint and its processor in
+    float32, then wraps them in a ``transformers`` pipeline.
+
+    Returns:
+        The ``automatic-speech-recognition`` pipeline object.
+    """
+    print("--- Loading model and processor for the first time ---")
+    checkpoint = "vhdm/whisper-large-fa-v1"
+    dtype = torch.float32
+    whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
+        checkpoint,
+        torch_dtype=dtype,
+        low_cpu_mem_usage=True,
+        use_safetensors=True,
+    )
+    whisper_processor = AutoProcessor.from_pretrained(checkpoint)
+    # The processor bundles both the tokenizer and the feature extractor
+    # that the pipeline needs.
+    asr = pipeline(
+        "automatic-speech-recognition",
+        model=whisper,
+        tokenizer=whisper_processor.tokenizer,
+        feature_extractor=whisper_processor.feature_extractor,
+        max_new_tokens=128,
+        torch_dtype=dtype,
+        device="cpu",
+    )
+    print("--- Model loaded successfully ---")
+    return asr
+# Build the shared transcription pipeline (load_model is cached by Streamlit).
+transcription_pipe = load_model()
+# --- 3. File upload and transcription ---
+st.header("فایل صوتی خود را آپلود کنید")
+# The prompt lists every extension accepted by the `type` filter below.
+uploaded_file = st.file_uploader(
+    "یک فایل صوتی انتخاب کنید (WAV, MP3, M4A, FLAC)...",
+    type=["wav", "mp3", "m4a", "flac"]
 )
+if uploaded_file is not None:
+    # Preview the upload with the MIME type reported for it instead of a
+    # hard-coded WAV type, since MP3/M4A/FLAC uploads are accepted as well.
+    st.audio(uploaded_file, format=uploaded_file.type or "audio/wav")
+    # Transcription starts only when the user presses the button.
+    if st.button("شروع رونویسی"):
+        # Pass the raw bytes straight to the pipeline rather than writing a
+        # temp file named after the upload: no file is left behind if
+        # transcription fails, and the user-supplied name never reaches the
+        # filesystem.
+        audio_bytes = uploaded_file.getvalue()
+        with st.spinner("در حال پردازش فایل صوتی... لطفاً صبر کنید."):
+            # return_timestamps=True matches the previous version of this
+            # app; Whisper needs it for recordings longer than 30 seconds.
+            result = transcription_pipe(audio_bytes, return_timestamps=True)
+            transcription = result["text"]
+        # Show the result.
+        st.success("پردازش با موفقیت انجام شد!")
+        st.subheader("متن رونویسی شده:")
+        st.write(transcription)