Use librosa instead of torchaudio for audio loading
Browse files
app.py
CHANGED
|
@@ -5,7 +5,8 @@ Supports: Vietnamese Wav2Vec2 and PhoWhisper encoders
|
|
| 5 |
|
| 6 |
import os
|
| 7 |
import torch
|
| 8 |
-
import torchaudio
|
|
|
|
| 9 |
import gradio as gr
|
| 10 |
from pathlib import Path
|
| 11 |
from safetensors.torch import load_file as load_safetensors
|
|
@@ -128,29 +129,15 @@ class MultiModelProfiler:
|
|
| 128 |
processor = self.processors[model_name]
|
| 129 |
is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
|
| 130 |
|
| 131 |
-
# Load audio
|
| 132 |
-
waveform, sr = torchaudio.load(audio_path)
|
| 133 |
-
|
| 134 |
-
# Convert to mono
|
| 135 |
-
if waveform.shape[0] > 1:
|
| 136 |
-
waveform = waveform.mean(dim=0, keepdim=True)
|
| 137 |
-
|
| 138 |
-
# Resample if needed
|
| 139 |
-
if sr != self.sampling_rate:
|
| 140 |
-
resampler = torchaudio.transforms.Resample(sr, self.sampling_rate)
|
| 141 |
-
waveform = resampler(waveform)
|
| 142 |
-
|
| 143 |
-
waveform = waveform.squeeze(0).numpy()
|
| 144 |
|
| 145 |
# Process based on model type
|
| 146 |
if is_whisper:
|
| 147 |
# Whisper requires exactly 30 seconds of audio
|
| 148 |
whisper_length = self.sampling_rate * 30 # 480000 samples
|
| 149 |
if len(waveform) < whisper_length:
|
| 150 |
-
waveform_padded = torch.nn.functional.pad(
|
| 151 |
-
torch.tensor(waveform),
|
| 152 |
-
(0, whisper_length - len(waveform))
|
| 153 |
-
).numpy()
|
| 154 |
else:
|
| 155 |
waveform_padded = waveform[:whisper_length]
|
| 156 |
|
|
|
|
| 5 |
|
| 6 |
import os
|
| 7 |
import torch
|
| 8 |
+
import librosa
|
| 9 |
+
import numpy as np
|
| 10 |
import gradio as gr
|
| 11 |
from pathlib import Path
|
| 12 |
from safetensors.torch import load_file as load_safetensors
|
|
|
|
| 129 |
processor = self.processors[model_name]
|
| 130 |
is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
|
| 131 |
|
| 132 |
+
# Load audio using librosa (more compatible)
|
| 133 |
+
waveform, sr = librosa.load(audio_path, sr=self.sampling_rate, mono=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
# Process based on model type
|
| 136 |
if is_whisper:
|
| 137 |
# Whisper requires exactly 30 seconds of audio
|
| 138 |
whisper_length = self.sampling_rate * 30 # 480000 samples
|
| 139 |
if len(waveform) < whisper_length:
|
| 140 |
+
waveform_padded = np.pad(waveform, (0, whisper_length - len(waveform)))
|
|
|
|
|
|
|
|
|
|
| 141 |
else:
|
| 142 |
waveform_padded = waveform[:whisper_length]
|
| 143 |
|