Gamahea committed on
Commit
7c2cd8e
·
1 Parent(s): b8b7922

Fix training error - validate HF datasets are prepared

Browse files
backend/services/dataset_service.py CHANGED
@@ -189,7 +189,7 @@ class DatasetService:
189
  # Download dataset
190
  dataset = load_dataset(**load_params)
191
 
192
- # Save dataset info
193
  dataset_info = {
194
  'name': config['name'],
195
  'type': config['type'],
@@ -199,7 +199,14 @@ class DatasetService:
199
  'splits': list(dataset.keys()) if hasattr(dataset, 'keys') else ['default'],
200
  'num_examples': {split: len(dataset[split]) for split in dataset.keys()} if hasattr(dataset, 'keys') else len(dataset),
201
  'features': str(dataset[list(dataset.keys())[0]].features) if hasattr(dataset, 'keys') else str(dataset.features),
202
- 'path': str(dataset_dir)
 
 
 
 
 
 
 
203
  }
204
 
205
  # Save metadata
 
189
  # Download dataset
190
  dataset = load_dataset(**load_params)
191
 
192
+ # Save dataset info for LoRA training compatibility
193
  dataset_info = {
194
  'name': config['name'],
195
  'type': config['type'],
 
199
  'splits': list(dataset.keys()) if hasattr(dataset, 'keys') else ['default'],
200
  'num_examples': {split: len(dataset[split]) for split in dataset.keys()} if hasattr(dataset, 'keys') else len(dataset),
201
  'features': str(dataset[list(dataset.keys())[0]].features) if hasattr(dataset, 'keys') else str(dataset.features),
202
+ 'path': str(dataset_dir),
203
+ # Add placeholders for LoRA training service compatibility
204
+ 'train_files': [],
205
+ 'val_files': [],
206
+ 'train_metadata': [],
207
+ 'val_metadata': [],
208
+ 'prepared': False, # Indicates dataset needs preparation before training
209
+ 'hf_dataset': True # Flag that this is a HuggingFace dataset
210
  }
211
 
212
  # Save metadata
backend/services/lora_training_service.py CHANGED
@@ -262,6 +262,21 @@ class LoRATrainingService:
262
  if not dataset_info:
263
  raise ValueError(f"Dataset not found: {dataset_name}")
264
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
  # Default config
266
  default_config = {
267
  'batch_size': 4,
 
262
  if not dataset_info:
263
  raise ValueError(f"Dataset not found: {dataset_name}")
264
 
265
+ # Check if dataset is from HuggingFace and needs preparation
266
+ if dataset_info.get('hf_dataset') and not dataset_info.get('prepared'):
267
+ raise ValueError(
268
+ f"Dataset '{dataset_name}' is a HuggingFace dataset that hasn't been prepared for training yet. "
269
+ f"Please use the 'User Audio Training' tab to upload and prepare your own audio files, "
270
+ f"or wait for dataset preparation features to be implemented."
271
+ )
272
+
273
+ # Validate dataset has required fields
274
+ if 'train_files' not in dataset_info or 'val_files' not in dataset_info:
275
+ raise ValueError(
276
+ f"Dataset '{dataset_name}' is missing required training files. "
277
+ f"Please use prepared datasets or upload your own audio in the 'User Audio Training' tab."
278
+ )
279
+
280
  # Default config
281
  default_config = {
282
  'batch_size': 4,