Files changed (1) hide show
  1. app.py +88 -51
app.py CHANGED
@@ -1,51 +1,86 @@
1
  import torch
2
-
3
  import gradio as gr
4
  import pytube as pt
5
  from transformers import pipeline
6
  from huggingface_hub import model_info
7
- #from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 
 
 
 
 
 
 
8
 
9
  MODEL_NAME = "ihanif/wav2vec2-xls-r-300m-pashto"
10
  lang = "ps"
11
 
12
- #load pre-trained model and tokenizer
13
- #processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
14
- #model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
15
-
16
  device = 0 if torch.cuda.is_available() else "cpu"
17
- pipe = pipeline(
18
- task="automatic-speech-recognition",
19
- model=MODEL_NAME,
20
- #chunk_length_s=30,
21
- device=device,
22
- )
23
 
24
- #pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def transcribe(microphone, file_upload):
 
 
 
27
  warn_output = ""
28
- # if (microphone is not None) and (file_upload is not None):
29
- # warn_output = (
30
- # "WARNING: You've uploaded an audio file and used the microphone. "
31
- # "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
32
- # )
33
-
34
- # elif (microphone is None) and (file_upload is None):
35
- # return "ERROR: You have to either use the microphone or upload an audio file"
36
-
37
  if (microphone is None) and (file_upload is None):
38
  return "ERROR: You have to either use the microphone or upload an audio file"
39
-
40
  file = microphone if microphone is not None else file_upload
41
-
42
- text = pipe(file)["text"]
43
- #transcription = wav2vec_model(audio)["text"]
44
-
45
- return warn_output + text
46
-
47
-
48
- def _return_yt_html_embed(yt_url):
49
  video_id = yt_url.split("?v=")[-1]
50
  HTML_str = (
51
  f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
@@ -53,28 +88,29 @@ def _return_yt_html_embed(yt_url):
53
  )
54
  return HTML_str
55
 
56
-
57
  def yt_transcribe(yt_url):
58
- yt = pt.YouTube(yt_url)
59
- html_embed_str = _return_yt_html_embed(yt_url)
60
- stream = yt.streams.filter(only_audio=True)[0]
61
- stream.download(filename="audio.mp3")
62
-
63
- text = pipe("audio.mp3")["text"]
64
-
65
- return html_embed_str, text
66
-
67
-
 
 
 
 
68
  demo = gr.Blocks()
69
-
70
- examples=[["example-1.wav","example-2.wav"]]
71
- # examples=["example-1.wav"]
72
 
73
  mf_transcribe = gr.Interface(
74
  fn=transcribe,
75
  inputs=[
76
- gr.inputs.Audio(source="microphone", type="filepath", optional=True),
77
- gr.inputs.Audio(source="upload", type="filepath", optional=True),
78
  ],
79
  outputs="text",
80
  layout="horizontal",
@@ -87,9 +123,9 @@ mf_transcribe = gr.Interface(
87
  examples=examples,
88
  )
89
 
90
- yt_transcribe = gr.Interface(
91
  fn=yt_transcribe,
92
- inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
93
  outputs=["html", "text"],
94
  layout="horizontal",
95
  theme="huggingface",
@@ -101,6 +137,7 @@ yt_transcribe = gr.Interface(
101
  )
102
 
103
  with demo:
104
- gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])
105
 
106
- demo.launch(enable_queue=False)
 
 
1
import torch
import gradio as gr
import pytube as pt
from transformers import pipeline
from huggingface_hub import model_info
# NOTE(review): model_info is imported but not used anywhere visible in this
# file — confirm before removing (we may only be seeing part of the file).
import os
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Set longer timeout for huggingface_hub
# huggingface_hub reads this env var when downloading model files; it must be
# set before the first download is triggered.
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '60'

# Hugging Face model id of the Pashto wav2vec2 ASR checkpoint to serve.
MODEL_NAME = "ihanif/wav2vec2-xls-r-300m-pashto"
# ISO 639-1 code for Pashto (currently informational; not passed to the pipeline).
lang = "ps"

# transformers pipelines take a CUDA device index (0) or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
19
 
20
def create_pipeline_with_retry(model_name, max_retries=3, timeout=60):
    """Create an automatic-speech-recognition pipeline, retrying on failure.

    Model downloads from the Hugging Face Hub can fail transiently; this
    retries the load up to ``max_retries`` times with a linearly increasing
    wait between attempts.

    Args:
        model_name: Hugging Face model id to load.
        max_retries: Number of load attempts before giving up.
        timeout: Download timeout in seconds, exported via the
            HF_HUB_DOWNLOAD_TIMEOUT env var so huggingface_hub honors it.

    Returns:
        A transformers automatic-speech-recognition pipeline.

    Raises:
        Exception: re-raises the last load error if every attempt fails.
    """
    # Make the timeout parameter actually take effect. The previous version
    # built a requests.Session with a Retry adapter but never passed it to
    # pipeline(), so it was dead code; huggingface_hub reads this env var.
    os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = str(timeout)

    last_error = None
    for attempt in range(max_retries):
        try:
            print(f"Attempting to load model (attempt {attempt + 1}/{max_retries})...")
            pipe = pipeline(
                task="automatic-speech-recognition",
                model=model_name,
                device=device,
            )
            print("Model loaded successfully!")
            return pipe
        except Exception as e:
            last_error = e
            print(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt < max_retries - 1:
                # Linear backoff: 10s, 20s, 30s... (the old comment claimed
                # "exponential", which this never was).
                wait_time = (attempt + 1) * 10
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
    print("All attempts failed. Please check your internet connection.")
    raise last_error
57
+
58
# Initialize pipeline with retry mechanism
# Module-level load: the Gradio callbacks below read the global `pipe`.
try:
    pipe = create_pipeline_with_retry(MODEL_NAME)
except Exception as e:
    print(f"Failed to load model: {e}")
    # Fallback to a different model or handle gracefully
    # `pipe is None` is the sentinel each callback checks before transcribing,
    # so a failed load degrades to an error message instead of crashing the app.
    pipe = None
65
 
66
def transcribe(microphone, file_upload):
    """Transcribe audio from the microphone or an uploaded file.

    Args:
        microphone: Filepath of the recorded clip, or None if unused.
        file_upload: Filepath of the uploaded clip, or None if unused.

    Returns:
        The transcription text, or an "ERROR: ..." string when the model is
        unavailable, no input was given, or transcription fails.
    """
    if pipe is None:
        return "ERROR: Model not loaded. Please check your internet connection and restart the application."

    if (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    # Microphone input takes precedence when both are supplied.
    file = microphone if microphone is not None else file_upload

    try:
        # Dropped the always-empty `warn_output` prefix the old code
        # prepended — it was dead leftover from a removed warning branch.
        return pipe(file)["text"]
    except Exception as e:
        return f"ERROR: Transcription failed - {str(e)}"
82
+
83
def return_yt_html_embed(yt_url):
    """Return an HTML snippet that embeds the YouTube video at *yt_url*.

    Args:
        yt_url: A YouTube watch URL, e.g. "https://www.youtube.com/watch?v=ID".

    Returns:
        A centered <iframe> embed for the video as an HTML string.
    """
    # Take everything after "?v=", then drop any trailing query parameters
    # (&t=..., &list=...) — the old split("?v=")[-1] leaked them into the
    # embed URL and produced a broken iframe for such links.
    video_id = yt_url.split("?v=")[-1].split("&")[0]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
90
 
 
91
def yt_transcribe(yt_url):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        yt_url: URL of the YouTube video to transcribe.

    Returns:
        A (html_embed, transcription) pair; on any failure the first element
        is an empty string and the second carries an error message.
    """
    if pipe is None:
        return "", "ERROR: Model not loaded. Please check your internet connection and restart the application."

    try:
        embed_html = return_yt_html_embed(yt_url)
        # Grab the first audio-only stream and save it to a local file that
        # the ASR pipeline can read back.
        video = pt.YouTube(yt_url)
        audio_stream = video.streams.filter(only_audio=True)[0]
        audio_stream.download(filename="audio.mp3")
        transcription = pipe("audio.mp3")["text"]
    except Exception as e:
        return "", f"ERROR: YouTube transcription failed - {str(e)}"
    return embed_html, transcription
104
+
105
# Create Gradio interface
demo = gr.Blocks()
# NOTE(review): this is ONE example row that fills both audio inputs
# (microphone=example-1.wav, upload=example-2.wav). If two independent
# examples were intended, use [["example-1.wav"], ["example-2.wav"]] — confirm.
examples = [["example-1.wav", "example-2.wav"]]
 
 
108
 
109
  mf_transcribe = gr.Interface(
110
  fn=transcribe,
111
  inputs=[
112
+ gr.Audio(source="microphone", type="filepath", label="Microphone"),
113
+ gr.Audio(source="upload", type="filepath", label="Upload Audio"),
114
  ],
115
  outputs="text",
116
  layout="horizontal",
 
123
  examples=examples,
124
  )
125
 
126
+ yt_transcribe_interface = gr.Interface(
127
  fn=yt_transcribe,
128
+ inputs=[gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
129
  outputs=["html", "text"],
130
  layout="horizontal",
131
  theme="huggingface",
 
137
  )
138
 
139
with demo:
    # Two tabs: direct audio transcription and YouTube-URL transcription.
    gr.TabbedInterface([mf_transcribe, yt_transcribe_interface], ["Transcribe Audio", "Transcribe YouTube"])

if __name__ == "__main__":
    # Queueing disabled: requests are handled synchronously.
    demo.launch(enable_queue=False)