Commit 7c2cd8e
Gamahea committed
1 Parent(s): b8b7922

Fix training error - validate HF datasets are prepared
backend/services/dataset_service.py
CHANGED

@@ -189,7 +189,7 @@ class DatasetService:
             # Download dataset
             dataset = load_dataset(**load_params)
 
-            # Save dataset info
+            # Save dataset info for LoRA training compatibility
             dataset_info = {
                 'name': config['name'],
                 'type': config['type'],
@@ -199,7 +199,14 @@ class DatasetService:
                 'splits': list(dataset.keys()) if hasattr(dataset, 'keys') else ['default'],
                 'num_examples': {split: len(dataset[split]) for split in dataset.keys()} if hasattr(dataset, 'keys') else len(dataset),
                 'features': str(dataset[list(dataset.keys())[0]].features) if hasattr(dataset, 'keys') else str(dataset.features),
-                'path': str(dataset_dir)
+                'path': str(dataset_dir),
+                # Add placeholders for LoRA training service compatibility
+                'train_files': [],
+                'val_files': [],
+                'train_metadata': [],
+                'val_metadata': [],
+                'prepared': False,  # Indicates dataset needs preparation before training
+                'hf_dataset': True  # Flag that this is a HuggingFace dataset
             }
 
             # Save metadata
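With these placeholder fields in place, a later preparation step only needs to fill in the file lists and flip `prepared` to `True` for the training service to accept the dataset. A minimal sketch of what that could look like is below, assuming the metadata is persisted as `dataset_info.json` inside the dataset directory; the file name and helper function are illustrative assumptions, not part of this commit:

# Hypothetical sketch: update the metadata written above once audio files
# have actually been extracted, so LoRATrainingService will accept the dataset.
import json
from pathlib import Path

def mark_dataset_prepared(dataset_dir: str, train_files: list[str], val_files: list[str]) -> None:
    """Fill in the placeholder fields and mark the dataset as ready for training."""
    info_path = Path(dataset_dir) / "dataset_info.json"  # assumed metadata file name
    dataset_info = json.loads(info_path.read_text())

    dataset_info.update({
        "train_files": train_files,
        "val_files": val_files,
        "prepared": True,  # LoRATrainingService checks this flag before training
    })

    info_path.write_text(json.dumps(dataset_info, indent=2))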
backend/services/lora_training_service.py
CHANGED

@@ -262,6 +262,21 @@ class LoRATrainingService:
         if not dataset_info:
             raise ValueError(f"Dataset not found: {dataset_name}")
 
+        # Check if dataset is from HuggingFace and needs preparation
+        if dataset_info.get('hf_dataset') and not dataset_info.get('prepared'):
+            raise ValueError(
+                f"Dataset '{dataset_name}' is a HuggingFace dataset that hasn't been prepared for training yet. "
+                f"Please use the 'User Audio Training' tab to upload and prepare your own audio files, "
+                f"or wait for dataset preparation features to be implemented."
+            )
+
+        # Validate dataset has required fields
+        if 'train_files' not in dataset_info or 'val_files' not in dataset_info:
+            raise ValueError(
+                f"Dataset '{dataset_name}' is missing required training files. "
+                f"Please use prepared datasets or upload your own audio in the 'User Audio Training' tab."
+            )
+
         # Default config
         default_config = {
             'batch_size': 4,
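The net effect for callers is a fast, descriptive failure instead of an opaque error later in the training loop. A rough usage sketch is below; the training entry point name (`start_training`) and the dataset name are assumptions for illustration, while the import path follows the file layout shown in this diff:

# Illustrative usage only: method and dataset names here are assumptions.
from backend.services.lora_training_service import LoRATrainingService

service = LoRATrainingService()

try:
    # A freshly downloaded HF dataset now carries hf_dataset=True / prepared=False,
    # so the request is rejected up front with an actionable message.
    service.start_training(dataset_name="my_hf_dataset", config={"batch_size": 4})
except ValueError as err:
    print(f"Training refused: {err}")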