Use librosa instead of torchaudio for audio loading
Browse files
app.py
CHANGED
|
@@ -5,7 +5,8 @@ Supports: Vietnamese Wav2Vec2 and PhoWhisper encoders
|
|
| 5 |
|
| 6 |
import os
|
| 7 |
import torch
|
| 8 |
-
import torchaudio
|
|
|
|
| 9 |
import gradio as gr
|
| 10 |
from pathlib import Path
|
| 11 |
from safetensors.torch import load_file as load_safetensors
|
|
@@ -128,29 +129,15 @@ class MultiModelProfiler:
|
|
| 128 |
processor = self.processors[model_name]
|
| 129 |
is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
|
| 130 |
|
| 131 |
-
# Load audio
|
| 132 |
-
waveform, sr = torchaudio.load(audio_path)
|
| 133 |
-
|
| 134 |
-
# Convert to mono
|
| 135 |
-
if waveform.shape[0] > 1:
|
| 136 |
-
waveform = waveform.mean(dim=0, keepdim=True)
|
| 137 |
-
|
| 138 |
-
# Resample if needed
|
| 139 |
-
if sr != self.sampling_rate:
|
| 140 |
-
resampler = torchaudio.transforms.Resample(sr, self.sampling_rate)
|
| 141 |
-
waveform = resampler(waveform)
|
| 142 |
-
|
| 143 |
-
waveform = waveform.squeeze(0).numpy()
|
| 144 |
|
| 145 |
# Process based on model type
|
| 146 |
if is_whisper:
|
| 147 |
# Whisper requires exactly 30 seconds of audio
|
| 148 |
whisper_length = self.sampling_rate * 30 # 480000 samples
|
| 149 |
if len(waveform) < whisper_length:
|
| 150 |
-
waveform_padded = torch.nn.functional.pad(
|
| 151 |
-
torch.tensor(waveform),
|
| 152 |
-
(0, whisper_length - len(waveform))
|
| 153 |
-
).numpy()
|
| 154 |
else:
|
| 155 |
waveform_padded = waveform[:whisper_length]
|
| 156 |
|
|
|
|
| 5 |
|
| 6 |
import os
|
| 7 |
import torch
|
| 8 |
+
import librosa
|
| 9 |
+
import numpy as np
|
| 10 |
import gradio as gr
|
| 11 |
from pathlib import Path
|
| 12 |
from safetensors.torch import load_file as load_safetensors
|
|
|
|
| 129 |
processor = self.processors[model_name]
|
| 130 |
is_whisper = MODELS_CONFIG[model_name]["is_whisper"]
|
| 131 |
|
| 132 |
+
# Load audio using librosa (more compatible)
|
| 133 |
+
waveform, sr = librosa.load(audio_path, sr=self.sampling_rate, mono=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
# Process based on model type
|
| 136 |
if is_whisper:
|
| 137 |
# Whisper requires exactly 30 seconds of audio
|
| 138 |
whisper_length = self.sampling_rate * 30 # 480000 samples
|
| 139 |
if len(waveform) < whisper_length:
|
| 140 |
+
waveform_padded = np.pad(waveform, (0, whisper_length - len(waveform)))
|
|
|
|
|
|
|
|
|
|
| 141 |
else:
|
| 142 |
waveform_padded = waveform[:whisper_length]
|
| 143 |
|