Gamahea: Add dataset import & LoRA collection sync (commit 17f5813)

- Adds import_prepared_dataset() to support ZIP dataset imports with name-conflict resolution
- Adds sync_on_startup() to download LoRAs from an HF collection on app startup
- Enhances upload_lora() with training_config for proper metadata
- Implements numeric-suffix naming for conflicts (_1, _2, etc.)
| """ | |
| Dataset download and preparation service | |
| Downloads curated datasets from HuggingFace for LoRA training | |
| """ | |
| import os | |
| import logging | |
| from pathlib import Path | |
| from typing import List, Dict, Optional, Callable | |
| import json | |
| from datetime import datetime | |
| logger = logging.getLogger(__name__) | |
| class DatasetService: | |
| """Service for downloading and preparing training datasets""" | |
| # Dataset configurations (Parquet format only - no loading scripts) | |
| DATASETS = { | |
| 'gtzan': { | |
| 'name': 'GTZAN Music Genre Dataset', | |
| 'type': 'music', | |
| 'hf_id': 'lewtun/music_genres_small', | |
| 'description': 'Music genre classification dataset (GTZAN-based)', | |
| 'size_gb': 1.2 | |
| }, | |
| 'fsd50k': { | |
| 'name': 'FSD50K Sound Events', | |
| 'type': 'sound_effects', | |
| 'hf_id': 'nguyenvulebinh/fsd50k', | |
| 'description': 'Freesound Dataset with 51K audio clips and 200 sound classes', | |
| 'size_gb': 30.0 | |
| }, | |
| 'librispeech': { | |
| 'name': 'LibriSpeech ASR', | |
| 'type': 'vocal', | |
| 'hf_id': 'openslr/librispeech_asr', | |
| 'description': 'LibriSpeech corpus for speech recognition', | |
| 'size_gb': 60.0 | |
| }, | |
| 'libritts': { | |
| 'name': 'LibriTTS', | |
| 'type': 'vocal', | |
| 'hf_id': 'cdminix/libritts-aligned', | |
| 'description': 'Multi-speaker English audiobook corpus for TTS', | |
| 'size_gb': 35.0 | |
| }, | |
| 'audioset_strong': { | |
| 'name': 'AudioSet Strong', | |
| 'type': 'music', | |
| 'hf_id': 'agkphysics/AudioSet', | |
| 'description': 'High-quality labeled audio events', | |
| 'size_gb': 12.0 | |
| }, | |
| 'esc50': { | |
| 'name': 'ESC-50 Environmental Sounds', | |
| 'type': 'sound_effects', | |
| 'hf_id': 'ashraq/esc50', | |
| 'description': 'Environmental sound classification with 2,000 recordings', | |
| 'size_gb': 0.6 | |
| }, | |
| 'urbansound8k': { | |
| 'name': 'UrbanSound8K', | |
| 'type': 'sound_effects', | |
| 'hf_id': 'danavery/urbansound8K', | |
| 'description': 'Urban sound classification - 8,732 labeled sound excerpts', | |
| 'size_gb': 5.6 | |
| } | |
| } | |
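    # Entries may also carry optional 'config' and 'split' keys, which
    # download_dataset() passes through to load_dataset() as 'name' and 'split'.
    # A hypothetical entry showing those keys (illustration only, not a real
    # dataset id):
    #
    #   'my_subset': {
    #       'name': 'My Subset',
    #       'type': 'music',
    #       'hf_id': 'someuser/somedataset',
    #       'config': 'clean',
    #       'split': 'train',
    #       'description': 'Example entry using the optional config/split keys',
    #       'size_gb': 1.0
    #   },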
    def __init__(self, base_dir: str = "training_data"):
        """
        Initialize dataset service

        Args:
            base_dir: Base directory for storing datasets
        """
        self.base_dir = Path(base_dir)
        self.base_dir.mkdir(parents=True, exist_ok=True)

    def import_prepared_dataset(self, zip_path: str) -> Optional[str]:
        """
        Import a prepared dataset from a ZIP file

        Args:
            zip_path: Path to the ZIP file containing the dataset

        Returns:
            Dataset key if successful, None otherwise
        """
        try:
            import zipfile
            import tempfile

            # Extract to a temporary directory
            with tempfile.TemporaryDirectory() as temp_dir:
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(temp_dir)

                temp_path = Path(temp_dir)

                # Look for dataset_info.json (at root or in a subfolder)
                dataset_info_file = None
                if (temp_path / "dataset_info.json").exists():
                    dataset_info_file = temp_path / "dataset_info.json"
                else:
                    # Check subfolders
                    for subfolder in temp_path.iterdir():
                        if subfolder.is_dir() and (subfolder / "dataset_info.json").exists():
                            dataset_info_file = subfolder / "dataset_info.json"
                            temp_path = subfolder
                            break

                if not dataset_info_file:
                    logger.error("No dataset_info.json found in ZIP file")
                    return None

                # Read dataset info
                with open(dataset_info_file, 'r') as f:
                    dataset_info = json.load(f)

                dataset_key = dataset_info.get('dataset_key', 'imported_dataset')

                # Check if dataset already exists, add number suffix if needed
                dest_path = self.base_dir / dataset_key
                counter = 1
                original_key = dataset_key
                while dest_path.exists():
                    dataset_key = f"{original_key}_{counter}"
                    dest_path = self.base_dir / dataset_key
                    counter += 1

                if dataset_key != original_key:
                    logger.info(f"Dataset '{original_key}' exists, importing as '{dataset_key}'")
                    dataset_info['dataset_key'] = dataset_key

                # Copy entire dataset directory
                import shutil
                shutil.copytree(temp_path, dest_path)

                # Update dataset_info.json with new key if renamed
                if dataset_key != original_key:
                    with open(dest_path / "dataset_info.json", 'w') as f:
                        json.dump(dataset_info, f, indent=2)

                logger.info(f"✅ Imported dataset: {dataset_key}")
                return dataset_key

        except Exception as e:
            logger.error(f"Failed to import dataset: {str(e)}", exc_info=True)
            return None
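    # Usage sketch (hypothetical paths): importing the same archive twice picks
    # up the numeric-suffix naming above, e.g.
    #
    #   svc = DatasetService(base_dir="training_data")
    #   svc.import_prepared_dataset("/tmp/my_set.zip")  # -> "my_set"
    #   svc.import_prepared_dataset("/tmp/my_set.zip")  # -> "my_set_1"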
    def is_dataset_downloaded(self, dataset_key: str) -> bool:
        """
        Check if a dataset has already been downloaded

        Args:
            dataset_key: Key identifying the dataset

        Returns:
            True if dataset exists and has metadata file, False otherwise
        """
        dataset_dir = self.base_dir / dataset_key
        metadata_path = dataset_dir / 'dataset_info.json'
        return metadata_path.exists()

    def get_downloaded_datasets(self) -> Dict[str, Dict]:
        """
        Get information about all downloaded datasets

        Returns:
            Dictionary mapping dataset keys to their metadata
        """
        downloaded = {}
        for dataset_key in self.DATASETS.keys():
            if self.is_dataset_downloaded(dataset_key):
                dataset_dir = self.base_dir / dataset_key
                metadata_path = dataset_dir / 'dataset_info.json'
                try:
                    with open(metadata_path, 'r') as f:
                        info = json.load(f)
                    downloaded[dataset_key] = info
                except Exception as e:
                    logger.warning(f"Failed to load metadata for {dataset_key}: {e}")
        return downloaded

    def get_user_datasets(self) -> Dict[str, Dict]:
        """Get information about user-uploaded/prepared datasets

        Returns:
            Dictionary mapping user dataset names to their metadata
        """
        user_datasets = {}

        # Scan training_data directory for user datasets
        if not self.base_dir.exists():
            return user_datasets

        for dataset_dir in self.base_dir.iterdir():
            if not dataset_dir.is_dir():
                continue

            dataset_key = dataset_dir.name

            # Skip HuggingFace datasets (they're in the DATASETS dict)
            if dataset_key in self.DATASETS:
                continue

            # Check for dataset_info.json indicating it's a user dataset
            metadata_path = dataset_dir / 'dataset_info.json'
            if metadata_path.exists():
                try:
                    with open(metadata_path, 'r') as f:
                        info = json.load(f)
                    # Mark as user dataset
                    info['is_user_dataset'] = True
                    info['dataset_key'] = dataset_key
                    user_datasets[dataset_key] = info
                except Exception as e:
                    logger.warning(f"Failed to load metadata for user dataset {dataset_key}: {e}")

        return user_datasets

    def get_all_available_datasets(self) -> Dict[str, Dict]:
        """Get all available datasets (both HuggingFace and user-uploaded)

        Returns:
            Dictionary mapping all dataset keys to their metadata
        """
        all_datasets = {}

        # HuggingFace datasets
        all_datasets.update(self.get_downloaded_datasets())

        # User datasets
        all_datasets.update(self.get_user_datasets())

        return all_datasets

    def download_dataset(self, dataset_key: str, progress_callback=None) -> Dict:
        """
        Download a dataset from HuggingFace

        Args:
            dataset_key: Key identifying the dataset (e.g., 'gtzan')
            progress_callback: Optional callback for progress updates

        Returns:
            Dictionary with dataset info and status
        """
        try:
            if dataset_key not in self.DATASETS:
                raise ValueError(f"Unknown dataset: {dataset_key}")

            dataset_config = self.DATASETS[dataset_key]
            dataset_name = dataset_config['name']

            # Check if already downloaded
            if self.is_dataset_downloaded(dataset_key):
                if progress_callback:
                    progress_callback(f"✅ Dataset already downloaded: {dataset_name}")
                    progress_callback(f"   Use the 'Prepare Datasets' section to prepare it for training")

                # Load and return existing info
                dataset_dir = self.base_dir / dataset_key
                metadata_path = dataset_dir / 'dataset_info.json'
                with open(metadata_path, 'r') as f:
                    info = json.load(f)

                return {
                    'success': True,
                    'dataset': dataset_key,
                    'info': info,
                    'already_downloaded': True
                }

            if progress_callback:
                progress_callback(f"📦 Starting download: {dataset_name}")

                # Show dataset size info
                size_gb = dataset_config.get('size_gb', 0)
                if size_gb > 100.0:
                    progress_callback(f"⚠️ Large dataset: {size_gb:.1f} GB")
                    progress_callback(f"   This may take significant time to download.")
                elif size_gb > 10.0:
                    progress_callback(f"ℹ️ Dataset size: ~{size_gb:.1f} GB (may take a few minutes)")
                else:
                    progress_callback(f"ℹ️ Dataset size: ~{size_gb:.1f} GB")

            # Check if dataset is available on HuggingFace
            if dataset_config['hf_id'] is None:
                # Custom download needed
                return self._handle_custom_dataset(dataset_key, dataset_config, progress_callback)

            # Download from HuggingFace
            return self._download_from_huggingface(dataset_key, dataset_config, progress_callback)

        except Exception as e:
            logger.error(f"Dataset download failed: {e}", exc_info=True)
            return {
                'success': False,
                'error': str(e),
                'dataset': dataset_key
            }
    def _download_from_huggingface(self, dataset_key: str, config: Dict, progress_callback=None) -> Dict:
        """Download dataset from HuggingFace Hub"""
        try:
            from datasets import load_dataset

            hf_id = config['hf_id']
            dataset_dir = self.base_dir / dataset_key
            dataset_dir.mkdir(parents=True, exist_ok=True)

            if progress_callback:
                progress_callback(f"🔍 Loading dataset from HuggingFace Hub: {hf_id}")

            logger.info(f"Loading dataset: {hf_id}")

            # Prepare load_dataset parameters
            load_params = {
                'path': hf_id,
                'cache_dir': str(dataset_dir / "cache")
            }

            # Add optional config/split parameters
            if 'config' in config:
                load_params['name'] = config['config']
            if 'split' in config:
                load_params['split'] = config['split']

            # Download dataset
            dataset = load_dataset(**load_params)

            # Save dataset info for LoRA training compatibility
            dataset_info = {
                'name': config['name'],
                'type': config['type'],
                'hf_id': hf_id,
                'description': config['description'],
                'size_gb': config.get('size_gb', 0),
                'splits': list(dataset.keys()) if hasattr(dataset, 'keys') else ['default'],
                'num_examples': {split: len(dataset[split]) for split in dataset.keys()} if hasattr(dataset, 'keys') else len(dataset),
                'features': str(dataset[list(dataset.keys())[0]].features) if hasattr(dataset, 'keys') else str(dataset.features),
                'path': str(dataset_dir),
                # Placeholders for LoRA training service compatibility
                'train_files': [],
                'val_files': [],
                'train_metadata': [],
                'val_metadata': [],
                'prepared': False,  # Indicates dataset needs preparation before training
                'hf_dataset': True  # Flag that this is a HuggingFace dataset
            }

            # Save metadata
            metadata_path = dataset_dir / 'dataset_info.json'
            with open(metadata_path, 'w') as f:
                json.dump(dataset_info, f, indent=2)

            if progress_callback:
                progress_callback(f"✅ Downloaded {config['name']}")
                if hasattr(dataset, 'keys'):
                    for split in dataset.keys():
                        progress_callback(f"   {split}: {len(dataset[split]):,} samples")
                else:
                    progress_callback(f"   Total: {len(dataset):,} samples")

            logger.info(f"Dataset downloaded successfully: {dataset_key}")

            return {
                'success': True,
                'dataset': dataset_key,
                'info': dataset_info
            }

        except ImportError:
            error_msg = "HuggingFace datasets library not installed. Install with: pip install datasets"
            logger.error(error_msg)
            if progress_callback:
                progress_callback(f"❌ {error_msg}")
            return {
                'success': False,
                'error': error_msg,
                'dataset': dataset_key
            }

        except Exception as e:
            error_msg = f"Failed to download {config['name']}: {str(e)}"
            logger.error(error_msg, exc_info=True)

            # Provide helpful error messages (report the error once, then add hints)
            if progress_callback:
                progress_callback(f"❌ {error_msg}")
                if "doesn't exist" in str(e).lower() or "not found" in str(e).lower():
                    progress_callback(f"   💡 Dataset '{hf_id}' not found on HuggingFace Hub")
                    progress_callback(f"   Check: https://huggingface.co/datasets/{hf_id}")
                elif "connection" in str(e).lower() or "timeout" in str(e).lower():
                    progress_callback(f"   💡 Network issue - check your internet connection")
                elif "permission" in str(e).lower() or "access" in str(e).lower():
                    progress_callback(f"   💡 Dataset may require authentication or have access restrictions")

            return {
                'success': False,
                'error': error_msg,
                'dataset': dataset_key
            }
    def prepare_dataset_for_training(
        self,
        dataset_key: str,
        train_val_split: float = 0.8,
        max_samples: Optional[int] = None,
        progress_callback: Optional[Callable] = None
    ) -> Dict:
        """
        Prepare a downloaded HuggingFace dataset for LoRA training.
        Extracts audio files, creates metadata, and splits into train/val sets.

        Args:
            dataset_key: Key identifying the dataset (e.g., 'gtzan')
            train_val_split: Fraction of data to use for training (default: 0.8)
            max_samples: Maximum number of samples to prepare (None = all)
            progress_callback: Optional callback for progress updates

        Returns:
            Dictionary with preparation results
        """
        try:
            import soundfile as sf

            if progress_callback:
                progress_callback(f"🔧 Preparing dataset: {dataset_key}")

            # Check if dataset exists
            if dataset_key not in self.DATASETS:
                raise ValueError(f"Unknown dataset: {dataset_key}")

            config = self.DATASETS[dataset_key]
            dataset_dir = self.base_dir / dataset_key
            cache_dir = dataset_dir / "cache"
            audio_dir = dataset_dir / "audio"
            audio_dir.mkdir(parents=True, exist_ok=True)

            # Load dataset info
            metadata_path = dataset_dir / 'dataset_info.json'
            if not metadata_path.exists():
                raise ValueError(f"Dataset not downloaded yet. Please download {dataset_key} first.")

            with open(metadata_path, 'r') as f:
                dataset_info = json.load(f)

            if dataset_info.get('prepared'):
                if progress_callback:
                    progress_callback(f"✅ Dataset already prepared!")
                return {'success': True, 'dataset': dataset_key, 'already_prepared': True}

            # Load HuggingFace dataset from cache
            if progress_callback:
                progress_callback(f"📂 Loading dataset from cache...")

            from datasets import load_dataset
            import librosa

            hf_id = config['hf_id']

            # Load dataset WITHOUT automatic audio decoding to avoid the torchcodec dependency
            load_params = {
                'path': hf_id,
                'cache_dir': str(cache_dir),
            }
            if 'config' in config:
                load_params['name'] = config['config']
            if 'split' in config:
                load_params['split'] = config['split']

            dataset = load_dataset(**load_params)

            # Get the appropriate split
            if hasattr(dataset, 'keys'):
                # Use 'train' split if available, otherwise first available split
                split_name = 'train' if 'train' in dataset.keys() else list(dataset.keys())[0]
                data = dataset[split_name]
            else:
                data = dataset

            # Determine the audio column
            audio_column = None
            for col in ['audio', 'file', 'path', 'wav']:
                if col in data.column_names:
                    audio_column = col
                    break

            if not audio_column:
                raise ValueError(f"Could not find audio column in dataset. Available columns: {data.column_names}")

            if progress_callback:
                progress_callback(f"📂 Found audio column: '{audio_column}'")

            total_samples = len(data)
            if max_samples:
                total_samples = min(total_samples, max_samples)

            if progress_callback:
                progress_callback(f"📊 Processing {total_samples} samples...")

            # Process samples
            train_files = []
            val_files = []
            train_metadata = []
            val_metadata = []

            num_train = int(total_samples * train_val_split)

            for idx in range(total_samples):
                try:
                    # Get raw sample data WITHOUT accessing the audio column through the
                    # datasets API (avoids torchcodec): read the underlying Arrow data directly
                    sample_data = data._data.table.slice(idx, 1).to_pydict()

                    # Get the audio column data
                    audio_data = sample_data[audio_column][0] if audio_column in sample_data else None

                    if audio_data is None:
                        logger.warning(f"No audio data for sample {idx}")
                        continue

                    # The audio column in Parquet datasets contains file paths or bytes
                    audio_path_to_load = None
                    is_temp_file = False

                    if isinstance(audio_data, dict):
                        # Check for 'path' key which contains the cached file path
                        if 'path' in audio_data and audio_data['path']:
                            audio_path_to_load = audio_data['path']
                        elif 'bytes' in audio_data and audio_data['bytes']:
                            # Write bytes to temp file and load
                            import tempfile
                            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                                tmp.write(audio_data['bytes'])
                                audio_path_to_load = tmp.name
                            is_temp_file = True
                    elif isinstance(audio_data, str):
                        # Direct file path
                        audio_path_to_load = audio_data

                    if not audio_path_to_load:
                        logger.warning(f"Could not find audio path for sample {idx}: {type(audio_data)}")
                        continue

                    # Load audio with librosa (no torchcodec needed)
                    audio_array, sample_rate = librosa.load(audio_path_to_load, sr=None)

                    # Clean up the temporary file once the audio is in memory
                    if is_temp_file:
                        os.unlink(audio_path_to_load)

                    # Save audio file
                    audio_filename = f"sample_{idx:06d}.wav"
                    audio_path = audio_dir / audio_filename
                    sf.write(audio_path, audio_array, sample_rate)

                    # Create metadata
                    metadata = {
                        'audio_file': str(audio_path),
                        'sample_rate': sample_rate,
                        'duration': len(audio_array) / sample_rate,
                        'dataset': dataset_key,
                        'index': idx
                    }

                    # Extract additional metadata from dataset
                    for key in sample_data.keys():
                        if key != audio_column and sample_data[key]:
                            value = sample_data[key][0]
                            if not isinstance(value, (dict, list)):
                                metadata[key] = value

                    # Add to train or val set
                    if idx < num_train:
                        train_files.append(str(audio_path))
                        train_metadata.append(metadata)
                    else:
                        val_files.append(str(audio_path))
                        val_metadata.append(metadata)

                    # Progress update
                    if progress_callback and (idx + 1) % 50 == 0:
                        progress_callback(f"   Processed {idx + 1}/{total_samples} samples...")

                except Exception as e:
                    logger.warning(f"Error processing sample {idx}: {str(e)}")
                    continue

            # Update dataset_info.json with training-ready format
            dataset_info.update({
                'train_files': train_files,
                'val_files': val_files,
                'train_metadata': train_metadata,
                'val_metadata': val_metadata,
                'prepared': True,
                'preparation_date': datetime.now().isoformat(),
                'num_train_samples': len(train_files),
                'num_val_samples': len(val_files),
                'train_val_split': train_val_split
            })

            # Save updated metadata
            with open(metadata_path, 'w') as f:
                json.dump(dataset_info, f, indent=2)

            if progress_callback:
                progress_callback(f"✅ Dataset prepared successfully!")
                progress_callback(f"   Training samples: {len(train_files)}")
                progress_callback(f"   Validation samples: {len(val_files)}")
                progress_callback(f"   Audio files saved to: {audio_dir}")

            logger.info(f"Dataset {dataset_key} prepared: {len(train_files)} train, {len(val_files)} val")

            return {
                'success': True,
                'dataset': dataset_key,
                'num_train': len(train_files),
                'num_val': len(val_files),
                'audio_dir': str(audio_dir)
            }

        except Exception as e:
            error_msg = f"Failed to prepare dataset {dataset_key}: {str(e)}"
            logger.error(error_msg, exc_info=True)
            if progress_callback:
                progress_callback(f"❌ {error_msg}")
            return {
                'success': False,
                'error': error_msg,
                'dataset': dataset_key
            }
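    # After preparation, dataset_info.json carries the training-ready fields
    # written above; a trimmed, illustrative example (values are made up):
    #
    #   {
    #     "prepared": true,
    #     "train_files": ["training_data/gtzan/audio/sample_000000.wav", "..."],
    #     "val_files": ["training_data/gtzan/audio/sample_000800.wav", "..."],
    #     "num_train_samples": 800,
    #     "num_val_samples": 200,
    #     "train_val_split": 0.8
    #   }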
    def _handle_custom_dataset(self, dataset_key: str, config: Dict, progress_callback=None) -> Dict:
        """Handle datasets that require custom download"""
        if progress_callback:
            progress_callback(
                f"⚠️ {config['name']} requires manual download\n"
                f"   Visit: {config.get('custom_url', 'N/A')}\n"
                f"   Place files in: training_data/{dataset_key}/"
            )
        return {
            'success': False,
            'manual_download_required': True,
            'dataset': dataset_key,
            'url': config.get('custom_url'),
            'info': config
        }

    def list_available_datasets(self) -> Dict[str, Dict]:
        """List all available datasets and their configurations"""
        return self.DATASETS

    def get_downloaded_dataset_keys(self) -> List[str]:
        """Get list of already downloaded dataset keys (simple list)"""
        downloaded = []
        for dataset_key in self.DATASETS.keys():
            dataset_dir = self.base_dir / dataset_key
            metadata_path = dataset_dir / 'dataset_info.json'
            if metadata_path.exists():
                downloaded.append(dataset_key)
        return downloaded

    def prepare_for_training(self, dataset_key: str) -> Dict:
        """
        Prepare downloaded dataset for LoRA training

        Args:
            dataset_key: Dataset to prepare

        Returns:
            Dictionary with prepared dataset info
        """
        try:
            dataset_dir = self.base_dir / dataset_key
            metadata_path = dataset_dir / 'dataset_info.json'

            if not metadata_path.exists():
                raise ValueError(f"Dataset not downloaded: {dataset_key}")

            with open(metadata_path) as f:
                dataset_info = json.load(f)

            # Create prepared dataset directory
            prepared_dir = dataset_dir / "prepared"
            prepared_dir.mkdir(parents=True, exist_ok=True)

            logger.info(f"Dataset {dataset_key} ready for training")

            return {
                'success': True,
                'dataset': dataset_key,
                'path': str(prepared_dir),
                'info': dataset_info
            }

        except Exception as e:
            logger.error(f"Dataset preparation failed: {e}")
            return {
                'success': False,
                'error': str(e),
                'dataset': dataset_key
            }
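

# Minimal usage sketch (assumes the `datasets`, `librosa`, and `soundfile`
# packages are installed and that HuggingFace Hub is reachable); dataset keys
# come from DatasetService.DATASETS above.
if __name__ == "__main__":
    service = DatasetService(base_dir="training_data")

    # Download and prepare a small dataset end to end
    result = service.download_dataset('gtzan', progress_callback=print)
    if result['success']:
        service.prepare_dataset_for_training('gtzan', max_samples=100, progress_callback=print)

    # List everything that is available for training
    print(list(service.get_all_available_datasets().keys()))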