Spaces:
Running
on
Zero
Running
on
Zero
Gamahea
committed on
Commit
·
7a66654
1
Parent(s):
6dfbc9e
Fix audio decoding and duplicate method issue
Browse files
- Cast audio column to None to prevent automatic torchcodec decoding
- Renamed duplicate get_downloaded_datasets to get_downloaded_dataset_keys
- Remove duplicate audio column detection code
- Import Audio from datasets for proper casting
- Fixes 'list has no attribute items' error
backend/services/dataset_service.py
CHANGED
|
@@ -371,10 +371,10 @@ class DatasetService:
|
|
| 371 |
if progress_callback:
|
| 372 |
progress_callback(f"π Loading dataset from cache...")
|
| 373 |
|
| 374 |
-
from datasets import load_dataset
|
| 375 |
import librosa
|
| 376 |
hf_id = config['hf_id']
|
| 377 |
-
#
|
| 378 |
load_params = {
|
| 379 |
'path': hf_id,
|
| 380 |
'cache_dir': str(cache_dir),
|
|
@@ -394,14 +394,7 @@ class DatasetService:
|
|
| 394 |
else:
|
| 395 |
data = dataset
|
| 396 |
|
| 397 |
-
|
| 398 |
-
if max_samples:
|
| 399 |
-
total_samples = min(total_samples, max_samples)
|
| 400 |
-
|
| 401 |
-
if progress_callback:
|
| 402 |
-
progress_callback(f"π Processing {total_samples} samples...")
|
| 403 |
-
|
| 404 |
-
# Determine audio column name (varies by dataset)
|
| 405 |
audio_column = None
|
| 406 |
for col in ['audio', 'file', 'path', 'wav']:
|
| 407 |
if col in data.column_names:
|
|
@@ -411,6 +404,21 @@ class DatasetService:
|
|
| 411 |
if not audio_column:
|
| 412 |
raise ValueError(f"Could not find audio column in dataset. Available columns: {data.column_names}")
|
| 413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
# Process samples
|
| 415 |
train_files = []
|
| 416 |
val_files = []
|
|
@@ -556,8 +564,8 @@ class DatasetService:
|
|
| 556 |
"""List all available datasets and their configurations"""
|
| 557 |
return self.DATASETS
|
| 558 |
|
| 559 |
-
def
|
| 560 |
-
"""Get list of already downloaded
|
| 561 |
downloaded = []
|
| 562 |
for dataset_key in self.DATASETS.keys():
|
| 563 |
dataset_dir = self.base_dir / dataset_key
|
|
|
|
| 371 |
if progress_callback:
|
| 372 |
progress_callback(f"π Loading dataset from cache...")
|
| 373 |
|
| 374 |
+
from datasets import load_dataset, Audio
|
| 375 |
import librosa
|
| 376 |
hf_id = config['hf_id']
|
| 377 |
+
# Load dataset WITHOUT automatic audio decoding to avoid torchcodec dependency
|
| 378 |
load_params = {
|
| 379 |
'path': hf_id,
|
| 380 |
'cache_dir': str(cache_dir),
|
|
|
|
| 394 |
else:
|
| 395 |
data = dataset
|
| 396 |
|
| 397 |
+
# Determine audio column and cast to disable automatic decoding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
audio_column = None
|
| 399 |
for col in ['audio', 'file', 'path', 'wav']:
|
| 400 |
if col in data.column_names:
|
|
|
|
| 404 |
if not audio_column:
|
| 405 |
raise ValueError(f"Could not find audio column in dataset. Available columns: {data.column_names}")
|
| 406 |
|
| 407 |
+
# Cast audio column to disable automatic decoding - we'll decode manually with librosa
|
| 408 |
+
if audio_column in data.column_names:
|
| 409 |
+
try:
|
| 410 |
+
# Remove the Audio feature to prevent automatic decoding
|
| 411 |
+
data = data.cast_column(audio_column, feature=None)
|
| 412 |
+
except:
|
| 413 |
+
pass # If casting fails, proceed anyway
|
| 414 |
+
|
| 415 |
+
total_samples = len(data)
|
| 416 |
+
if max_samples:
|
| 417 |
+
total_samples = min(total_samples, max_samples)
|
| 418 |
+
|
| 419 |
+
if progress_callback:
|
| 420 |
+
progress_callback(f"π Processing {total_samples} samples...")
|
| 421 |
+
|
| 422 |
# Process samples
|
| 423 |
train_files = []
|
| 424 |
val_files = []
|
|
|
|
| 564 |
"""List all available datasets and their configurations"""
|
| 565 |
return self.DATASETS
|
| 566 |
|
| 567 |
+
def get_downloaded_dataset_keys(self) -> List[str]:
|
| 568 |
+
"""Get list of already downloaded dataset keys (simple list)"""
|
| 569 |
downloaded = []
|
| 570 |
for dataset_key in self.DATASETS.keys():
|
| 571 |
dataset_dir = self.base_dir / dataset_key
|