Gamahea committed
Commit 7a66654 · Parent: 6dfbc9e

Fix audio decoding and duplicate method issue


- Cast the audio column's feature to None to prevent automatic torchcodec decoding
- Rename the duplicate get_downloaded_datasets to get_downloaded_dataset_keys
- Remove duplicate audio-column detection code
- Import Audio from datasets for proper casting
- Fix the 'list has no attribute items' error
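
The decoding workaround in this commit casts the audio feature away so `datasets` never invokes torchcodec; the library's documented route to the same behavior is `Audio(decode=False)`. A minimal sketch of that pattern, assuming a hypothetical dataset id and a 16 kHz target rate (neither is from this repo):

# Minimal sketch (not the committed code): skip automatic audio decoding
# in `datasets`, then decode manually with librosa. The dataset id and
# sample rate below are hypothetical placeholders.
from datasets import load_dataset, Audio
import librosa

ds = load_dataset("some/speech-dataset", split="train")

# Audio(decode=False) makes the column return {"path", "bytes"} dicts
# instead of decoded arrays, so no torchcodec backend is ever invoked.
ds = ds.cast_column("audio", Audio(decode=False))

sample = ds[0]["audio"]                           # raw file reference, still undecoded
wav, sr = librosa.load(sample["path"], sr=16000)  # manual decode with librosa

Note that with decode=False the "path" entry can be None for parquet-backed datasets; in that case the raw "bytes" need wrapping in io.BytesIO before passing to librosa.load.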

Files changed (1)
  1. backend/services/dataset_service.py +20 -12
backend/services/dataset_service.py CHANGED
@@ -371,10 +371,10 @@ class DatasetService:
         if progress_callback:
             progress_callback(f"📂 Loading dataset from cache...")
 
-        from datasets import load_dataset
+        from datasets import load_dataset, Audio
         import librosa
         hf_id = config['hf_id']
-        # Disable automatic audio decoding to avoid torchcodec dependency
+        # Load dataset WITHOUT automatic audio decoding to avoid torchcodec dependency
         load_params = {
             'path': hf_id,
             'cache_dir': str(cache_dir),
@@ -394,14 +394,7 @@ class DatasetService:
         else:
             data = dataset
 
-        total_samples = len(data)
-        if max_samples:
-            total_samples = min(total_samples, max_samples)
-
-        if progress_callback:
-            progress_callback(f"📊 Processing {total_samples} samples...")
-
-        # Determine audio column name (varies by dataset)
+        # Determine audio column and cast to disable automatic decoding
         audio_column = None
         for col in ['audio', 'file', 'path', 'wav']:
             if col in data.column_names:
@@ -411,6 +404,21 @@ class DatasetService:
         if not audio_column:
             raise ValueError(f"Could not find audio column in dataset. Available columns: {data.column_names}")
 
+        # Cast audio column to disable automatic decoding - we'll decode manually with librosa
+        if audio_column in data.column_names:
+            try:
+                # Remove the Audio feature to prevent automatic decoding
+                data = data.cast_column(audio_column, feature=None)
+            except:
+                pass  # If casting fails, proceed anyway
+
+        total_samples = len(data)
+        if max_samples:
+            total_samples = min(total_samples, max_samples)
+
+        if progress_callback:
+            progress_callback(f"📊 Processing {total_samples} samples...")
+
         # Process samples
         train_files = []
         val_files = []
@@ -556,8 +564,8 @@ class DatasetService:
         """List all available datasets and their configurations"""
         return self.DATASETS
 
-    def get_downloaded_datasets(self) -> List[str]:
-        """Get list of already downloaded datasets"""
+    def get_downloaded_dataset_keys(self) -> List[str]:
+        """Get list of already downloaded dataset keys (simple list)"""
         downloaded = []
         for dataset_key in self.DATASETS.keys():
             dataset_dir = self.base_dir / dataset_key
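
The 'list has no attribute items' error follows from how Python treats the duplicate definition: when a class defines the same method name twice, the later definition silently replaces the earlier one, so callers expecting the dict-returning variant received the list. A minimal sketch of that failure mode, with illustrative return values rather than the repo's real ones:

# Minimal sketch of the shadowing bug behind the rename; the return
# values are assumptions for illustration, not this repo's data.
class DatasetService:
    def get_downloaded_datasets(self) -> dict:
        """Intended: map dataset key -> metadata (callers use .items())."""
        return {"ljspeech": {"hf_id": "lj_speech"}}

    def get_downloaded_datasets(self) -> list:  # same name: this one silently wins
        """Intended: plain list of downloaded dataset keys."""
        return ["ljspeech"]

svc = DatasetService()
result = svc.get_downloaded_datasets()
print(type(result))    # <class 'list'> - the dict variant was shadowed
# result.items()       # AttributeError: 'list' object has no attribute 'items'

Renaming the list-returning variant to get_downloaded_dataset_keys removes the collision, so both return shapes stay available to their respective callers.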