Gamahea committed
Commit 275de5b · 1 Parent(s): 7a66654

Access audio file paths directly to bypass torchcodec


- Access underlying Arrow table data directly to avoid automatic decoding (see the sketch below)
- Get file paths from 'path' key in audio dict
- Load audio with librosa only - no torchcodec dependency
- Handle bytes data with temporary files if needed
- More robust metadata extraction from raw table data
- Should completely eliminate torchcodec errors
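
Roughly, the new decode path can be read as the standalone sketch below. It assumes a Hugging Face `datasets.Dataset` whose audio column holds raw `{'path': ..., 'bytes': ...}` entries; the helper name `iter_raw_audio` is illustrative, and `._data.table` is a private attribute of `datasets`, so treat this as a sketch of the idea rather than the exact service code.

```python
# Minimal sketch: walk the underlying Arrow table so the Audio feature never
# auto-decodes, then decode each entry with librosa (no torchcodec involved).
import tempfile

import librosa


def iter_raw_audio(dataset, audio_column="audio"):
    """Yield (audio_array, sample_rate) pairs from a datasets.Dataset."""
    arrow_table = dataset._data.table  # private attr wrapping a pyarrow.Table
    for idx in range(len(dataset)):
        row = arrow_table.slice(idx, 1).to_pydict()
        entry = row.get(audio_column, [None])[0]
        if entry is None:
            continue

        if isinstance(entry, dict) and entry.get("path"):
            source = entry["path"]   # cached audio file on disk
        elif isinstance(entry, dict) and entry.get("bytes"):
            # Parquet-embedded bytes: spill to a temporary file first.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(entry["bytes"])
                source = tmp.name
        elif isinstance(entry, str):
            source = entry           # column already holds a plain path
        else:
            continue

        yield librosa.load(source, sr=None)  # sr=None keeps the native rate
```

As in the diff below, `delete=False` keeps each temporary file on disk so it can be reopened by name; nothing removes it afterwards, so cleanup is left to the environment.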

Files changed (1)
  1. backend/services/dataset_service.py +35 -38
backend/services/dataset_service.py CHANGED
@@ -394,7 +394,7 @@ class DatasetService:
         else:
             data = dataset
 
-        # Determine audio column and cast to disable automatic decoding
+        # Determine audio column and disable automatic decoding
         audio_column = None
         for col in ['audio', 'file', 'path', 'wav']:
             if col in data.column_names:
@@ -404,13 +404,8 @@
         if not audio_column:
             raise ValueError(f"Could not find audio column in dataset. Available columns: {data.column_names}")
 
-        # Cast audio column to disable automatic decoding - we'll decode manually with librosa
-        if audio_column in data.column_names:
-            try:
-                # Remove the Audio feature to prevent automatic decoding
-                data = data.cast_column(audio_column, feature=None)
-            except:
-                pass # If casting fails, proceed anyway
+        if progress_callback:
+            progress_callback(f"📂 Found audio column: '{audio_column}'")
 
         total_samples = len(data)
         if max_samples:
@@ -429,41 +424,41 @@
 
         for idx in range(total_samples):
             try:
-                sample = data[idx]
+                # Get raw sample data WITHOUT accessing audio column (avoids torchcodec)
+                # Access the underlying Arrow data directly
+                sample_data = data._data.table.slice(idx, 1).to_pydict()
+
+                # Get the audio column data
+                audio_data = sample_data[audio_column][0] if audio_column in sample_data else None
+
+                if audio_data is None:
+                    logger.warning(f"No audio data for sample {idx}")
+                    continue
 
-                # Extract audio
-                audio_data = sample[audio_column]
+                # The audio column in Parquet datasets contains file paths or bytes
+                audio_path_to_load = None
 
-                # Handle different audio formats
                 if isinstance(audio_data, dict):
-                    # Check if it has 'path' or 'bytes' keys (raw data from datasets)
+                    # Check for 'path' key which contains the cached file path
                     if 'path' in audio_data and audio_data['path']:
-                        # Load from file path
-                        audio_array, sample_rate = librosa.load(audio_data['path'], sr=None)
+                        audio_path_to_load = audio_data['path']
                     elif 'bytes' in audio_data and audio_data['bytes']:
-                        # Decode from bytes
-                        import io
-                        audio_bytes = io.BytesIO(audio_data['bytes'])
-                        audio_array, sample_rate = sf.read(audio_bytes)
-                    elif 'array' in audio_data:
-                        # Already decoded format: {'array': ndarray, 'sampling_rate': int}
-                        audio_array = audio_data['array']
-                        sample_rate = audio_data.get('sampling_rate', 22050)
-                    else:
-                        logger.warning(f"Unknown dict audio format for sample {idx}: {audio_data.keys()}")
-                        continue
+                        # Write bytes to temp file and load
+                        import tempfile
+                        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                            tmp.write(audio_data['bytes'])
+                            audio_path_to_load = tmp.name
                 elif isinstance(audio_data, str):
-                    # File path - load it
-                    audio_array, sample_rate = librosa.load(audio_data, sr=None)
-                elif isinstance(audio_data, bytes):
-                    # Raw bytes - decode it
-                    import io
-                    audio_bytes = io.BytesIO(audio_data)
-                    audio_array, sample_rate = sf.read(audio_bytes)
-                else:
-                    logger.warning(f"Unknown audio format for sample {idx}: {type(audio_data)}")
+                    # Direct file path
+                    audio_path_to_load = audio_data
+
+                if not audio_path_to_load:
+                    logger.warning(f"Could not find audio path for sample {idx}: {type(audio_data)}")
                     continue
 
+                # Load audio with librosa (no torchcodec needed)
+                audio_array, sample_rate = librosa.load(audio_path_to_load, sr=None)
+
                 # Save audio file
                 audio_filename = f"sample_{idx:06d}.wav"
                 audio_path = audio_dir / audio_filename
@@ -479,9 +474,11 @@
                 }
 
                 # Extract additional metadata from dataset
-                for key in sample.keys():
-                    if key != audio_column and not isinstance(sample[key], (dict, list)):
-                        metadata[key] = sample[key]
+                for key in sample_data.keys():
+                    if key != audio_column and sample_data[key]:
+                        value = sample_data[key][0]
+                        if not isinstance(value, (dict, list)):
+                            metadata[key] = value
 
                 # Add to train or val set
                 if idx < num_train: