Gamahea committed
Commit · 275de5b
Parent(s): 7a66654
Access audio file paths directly to bypass torchcodec
- Access underlying Arrow table data directly to avoid automatic decoding (see the sketch after this list)
- Get file paths from 'path' key in audio dict
- Load audio with librosa only - no torchcodec dependency
- Handle bytes data with temporary files if needed
- More robust metadata extraction from raw table data
- Should completely eliminate torchcodec errors
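
As a rough illustration of the Arrow-access idea above: slicing one row of the underlying pyarrow Table and calling to_pydict() returns plain Python values, so the Audio feature never gets a chance to decode anything. The schema in this sketch (an 'audio' struct with 'path'/'bytes' fields plus a 'text' column) is an assumption for illustration, not this dataset's actual layout, and in the real change the table comes from the internal data._data.table attribute.

import pyarrow as pa

# Hypothetical stand-in for the dataset's underlying Arrow table.
table = pa.table({
    "audio": [{"path": "/cache/sample_000000.wav", "bytes": None}],
    "text": ["hello world"],
})

# One-row slice converted to a plain dict; no Audio feature, no torchcodec.
row = table.slice(0, 1).to_pydict()

# to_pydict() maps each column name to a list of values, hence the [0].
audio_data = row["audio"][0]   # {'path': '/cache/sample_000000.wav', 'bytes': None}
text = row["text"][0]          # 'hello world'

if isinstance(audio_data, dict) and audio_data.get("path"):
    audio_path_to_load = audio_data["path"]  # later handed to librosa.load(..., sr=None)
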
backend/services/dataset_service.py
CHANGED
@@ -394,7 +394,7 @@ class DatasetService:
         else:
             data = dataset

-        # Determine audio column and
+        # Determine audio column and disable automatic decoding
         audio_column = None
         for col in ['audio', 'file', 'path', 'wav']:
             if col in data.column_names:
@@ -404,13 +404,8 @@ class DatasetService:
         if not audio_column:
             raise ValueError(f"Could not find audio column in dataset. Available columns: {data.column_names}")

-
-
-        try:
-            # Remove the Audio feature to prevent automatic decoding
-            data = data.cast_column(audio_column, feature=None)
-        except:
-            pass  # If casting fails, proceed anyway
+        if progress_callback:
+            progress_callback(f"📂 Found audio column: '{audio_column}'")

         total_samples = len(data)
         if max_samples:
@@ -429,41 +424,41 @@ class DatasetService:

         for idx in range(total_samples):
             try:
-                sample
+                # Get raw sample data WITHOUT accessing audio column (avoids torchcodec)
+                # Access the underlying Arrow data directly
+                sample_data = data._data.table.slice(idx, 1).to_pydict()
+
+                # Get the audio column data
+                audio_data = sample_data[audio_column][0] if audio_column in sample_data else None
+
+                if audio_data is None:
+                    logger.warning(f"No audio data for sample {idx}")
+                    continue

-                #
-
+                # The audio column in Parquet datasets contains file paths or bytes
+                audio_path_to_load = None

-                # Handle different audio formats
                 if isinstance(audio_data, dict):
-                    # Check
+                    # Check for 'path' key which contains the cached file path
                     if 'path' in audio_data and audio_data['path']:
-
-                        audio_array, sample_rate = librosa.load(audio_data['path'], sr=None)
+                        audio_path_to_load = audio_data['path']
                     elif 'bytes' in audio_data and audio_data['bytes']:
-                        #
-                        import
-
-
-
-                        # Already decoded format: {'array': ndarray, 'sampling_rate': int}
-                        audio_array = audio_data['array']
-                        sample_rate = audio_data.get('sampling_rate', 22050)
-                    else:
-                        logger.warning(f"Unknown dict audio format for sample {idx}: {audio_data.keys()}")
-                        continue
+                        # Write bytes to temp file and load
+                        import tempfile
+                        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                            tmp.write(audio_data['bytes'])
+                            audio_path_to_load = tmp.name
                 elif isinstance(audio_data, str):
-                    #
-
-
-
-
-                    audio_bytes = io.BytesIO(audio_data)
-                    audio_array, sample_rate = sf.read(audio_bytes)
-                else:
-                    logger.warning(f"Unknown audio format for sample {idx}: {type(audio_data)}")
+                    # Direct file path
+                    audio_path_to_load = audio_data
+
+                if not audio_path_to_load:
+                    logger.warning(f"Could not find audio path for sample {idx}: {type(audio_data)}")
                     continue

+                # Load audio with librosa (no torchcodec needed)
+                audio_array, sample_rate = librosa.load(audio_path_to_load, sr=None)
+
                 # Save audio file
                 audio_filename = f"sample_{idx:06d}.wav"
                 audio_path = audio_dir / audio_filename
@@ -479,9 +474,11 @@ class DatasetService:
                 }

                 # Extract additional metadata from dataset
-                for key in
-                    if key != audio_column and
-
+                for key in sample_data.keys():
+                    if key != audio_column and sample_data[key]:
+                        value = sample_data[key][0]
+                        if not isinstance(value, (dict, list)):
+                            metadata[key] = value

                 # Add to train or val set
                 if idx < num_train:
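
For the new 'bytes' branch, a standalone sketch of the same temp-file round-trip; the in-memory WAV here is fabricated with numpy/soundfile purely for illustration and is not part of the commit. Since NamedTemporaryFile is created with delete=False, the temp file survives the with-block, so a caller may want to remove it once librosa has loaded the audio.

import io
import os
import tempfile

import librosa
import numpy as np
import soundfile as sf

# Fabricate an in-memory WAV (1 second of a 440 Hz tone at 16 kHz).
sr = 16000
tone = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)
buf = io.BytesIO()
sf.write(buf, tone, sr, format="WAV")
audio_bytes = buf.getvalue()

# Same pattern as the diff: dump the bytes to a named temp file, then load it.
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
    tmp.write(audio_bytes)
    audio_path_to_load = tmp.name

audio_array, sample_rate = librosa.load(audio_path_to_load, sr=None)  # sr=None keeps the native rate
os.remove(audio_path_to_load)  # delete=False leaves cleanup to the caller

print(audio_array.shape, sample_rate)  # expect (16000,) 16000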