Spaces:

Gamahea
/

lemm-test-100

Running on Zero

App Files Files Community

Gamahea committed 8 days ago

Commit

6d5bfcd

1 Parent(s): 1329490

Add persistent dataset detection for HF and user datasets across sessions

Browse files

Files changed (2) hide show

app.py +59 -37
backend/services/dataset_service.py +55 -0

app.py CHANGED Viewed

@@ -914,15 +914,12 @@ def get_dataset_choices_with_status():
         dataset_service = DatasetService()
         downloaded = dataset_service.get_downloaded_datasets()
         # Dataset display mappings
         dataset_display_map = {
             "gtzan": "GTZAN Music Genre (1000 tracks, 10 genres)",
             "fsd50k": "FSD50K Sound Events (51K clips, 200 classes)",
-            "common_voice": "Common Voice English (crowdsourced speech)",
-            "jamendo": "MTG-Jamendo (55k tracks, music tagging)",
-            "musiccaps": "MusicCaps (5.5k clips with descriptions)",
-            "fleurs": "FLEURS English Speech (multi-speaker)",
             "librispeech": "LibriSpeech ASR (speech recognition)",
             "libritts": "LibriTTS (audiobooks for TTS)",
             "audioset_strong": "AudioSet Strong (labeled audio events)",
@@ -937,6 +934,7 @@ def get_dataset_choices_with_status():
         music_keys = ["gtzan"]
         vocal_keys = ["librispeech", "libritts", "audioset_strong", "esc50", "urbansound8k", "fsd50k"]
         for key in music_keys:
             display_name = dataset_display_map.get(key, key)
             if key in downloaded:
@@ -961,6 +959,17 @@ def get_dataset_choices_with_status():
             else:
                 vocal_choices.append(display_name)
         return music_choices, vocal_choices, prepare_choices
     except Exception as e:
@@ -1191,10 +1200,20 @@ def prepare_user_training_dataset(audio_files, metadata_table, split_clips, sepa
             return "❌ No audio files uploaded"
         from backend.services.audio_analysis_service import AudioAnalysisService
-        from backend.services.lora_training_service import LoRATrainingService
         analyzer = AudioAnalysisService()
-        lora_service = LoRATrainingService()
         # Process audio files
         processed_files = []
@@ -1215,39 +1234,42 @@ def prepare_user_training_dataset(audio_files, metadata_table, split_clips, sepa
                 # Analyze if no metadata
                 file_metadata = analyzer.analyze_audio(audio_file.name)
-            # Split into clips if requested
-            if split_clips:
-                clip_paths = analyzer.split_audio_to_clips(
-                    audio_file.name,
-                    "training_data/user_uploads/clips",
-                    metadata=file_metadata
-                )
-                processed_files.extend(clip_paths)
-                processed_metadata.extend([file_metadata] * len(clip_paths))
-            else:
-                processed_files.append(audio_file.name)
-                processed_metadata.append(file_metadata)
-            # Separate stems if requested
-            if separate_stems:
-                stem_paths = analyzer.separate_vocal_stems(
-                    audio_file.name,
-                    "training_data/user_uploads/stems"
-                )
-                # Use vocals only for vocal training
-                if 'vocals' in stem_paths:
-                    processed_files.append(stem_paths['vocals'])
-                    processed_metadata.append({**file_metadata, 'type': 'vocal'})
-        # Prepare dataset
-        dataset_name = f"user_dataset_{int(time.time())}"
-        dataset_info = lora_service.prepare_dataset(
-            dataset_name,
-            processed_files,
-            processed_metadata
-        )
-        return f"✅ Prepared dataset '{dataset_name}' with {dataset_info['num_samples']} samples ({dataset_info['num_train']} train, {dataset_info['num_val']} val)"
     except Exception as e:
         logger.error(f"Dataset preparation failed: {e}")

         dataset_service = DatasetService()
         downloaded = dataset_service.get_downloaded_datasets()
+        user_datasets = dataset_service.get_user_datasets()
         # Dataset display mappings
         dataset_display_map = {
             "gtzan": "GTZAN Music Genre (1000 tracks, 10 genres)",
             "fsd50k": "FSD50K Sound Events (51K clips, 200 classes)",
             "librispeech": "LibriSpeech ASR (speech recognition)",
             "libritts": "LibriTTS (audiobooks for TTS)",
             "audioset_strong": "AudioSet Strong (labeled audio events)",
         music_keys = ["gtzan"]
         vocal_keys = ["librispeech", "libritts", "audioset_strong", "esc50", "urbansound8k", "fsd50k"]
+        # Add HuggingFace datasets
         for key in music_keys:
             display_name = dataset_display_map.get(key, key)
             if key in downloaded:
             else:
                 vocal_choices.append(display_name)
+        # Add user-uploaded datasets
+        for key, info in user_datasets.items():
+            dataset_name = info.get('dataset_name', key)
+            num_samples = info.get('num_train_samples', 0) + info.get('num_val_samples', 0)
+            display_name = f"👤 {dataset_name} ({num_samples} samples)"
+            if info.get('prepared'):
+                vocal_choices.append(f"✅ {display_name} [User Dataset - Prepared]")
+            else:
+                vocal_choices.append(f"📥 {display_name} [User Dataset]")
         return music_choices, vocal_choices, prepare_choices
     except Exception as e:
             return "❌ No audio files uploaded"
         from backend.services.audio_analysis_service import AudioAnalysisService
+        from backend.services.dataset_service import DatasetService
+        from pathlib import Path
+        import shutil
+        import json
         analyzer = AudioAnalysisService()
+        dataset_service = DatasetService()
+        # Create persistent user dataset directory
+        timestamp = int(time.time())
+        dataset_name = f"user_dataset_{timestamp}"
+        dataset_dir = Path("training_data") / dataset_name
+        audio_dir = dataset_dir / "audio"
+        audio_dir.mkdir(parents=True, exist_ok=True)
         # Process audio files
         processed_files = []
                 # Analyze if no metadata
                 file_metadata = analyzer.analyze_audio(audio_file.name)
+            # Copy file to persistent storage
+            dest_filename = f"sample_{i:06d}.wav"
+            dest_path = audio_dir / dest_filename
+            shutil.copy2(audio_file.name, dest_path)
+            processed_files.append(str(dest_path))
+            processed_metadata.append(file_metadata)
+        # Split into train/val
+        num_train = int(len(processed_files) * 0.9)
+        train_files = processed_files[:num_train]
+        val_files = processed_files[num_train:]
+        train_metadata = processed_metadata[:num_train]
+        val_metadata = processed_metadata[num_train:]
+        # Save dataset metadata
+        dataset_info = {
+            'dataset_name': dataset_name,
+            'dataset_key': dataset_name,
+            'is_user_dataset': True,
+            'created_date': datetime.now().isoformat(),
+            'prepared': True,
+            'num_train_samples': len(train_files),
+            'num_val_samples': len(val_files),
+            'train_files': train_files,
+            'val_files': val_files,
+            'train_metadata': train_metadata,
+            'val_metadata': val_metadata,
+            'train_val_split': 0.9
+        }
+        metadata_path = dataset_dir / 'dataset_info.json'
+        with open(metadata_path, 'w') as f:
+            json.dump(dataset_info, f, indent=2)
+        return f"✅ Prepared user dataset '{dataset_name}' with {len(processed_files)} samples ({len(train_files)} train, {len(val_files)} val)\n📁 Saved to: {dataset_dir}"
     except Exception as e:
         logger.error(f"Dataset preparation failed: {e}")

backend/services/dataset_service.py CHANGED Viewed

@@ -113,6 +113,61 @@ class DatasetService:
                     logger.warning(f"Failed to load metadata for {dataset_key}: {e}")
         return downloaded
     def download_dataset(self, dataset_key: str, progress_callback=None) -> Dict:
         """

                     logger.warning(f"Failed to load metadata for {dataset_key}: {e}")
         return downloaded
+    def get_user_datasets(self) -> Dict[str, Dict]:
+        """Get information about user-uploaded/prepared datasets
+        Returns:
+            Dictionary mapping user dataset names to their metadata
+        """
+        user_datasets = {}
+        # Scan base_dir for user datasets; no 'user_' name prefix is enforced here, only the checks below
+        if not self.base_dir.exists():
+            return user_datasets  # base_dir missing: return the empty dict
+        for dataset_dir in self.base_dir.iterdir():
+            if not dataset_dir.is_dir():
+                continue  # ignore plain files directly under base_dir
+            dataset_key = dataset_dir.name  # the directory name is used as the dataset key
+            # Skip HuggingFace datasets (they're in DATASETS dict)
+            if dataset_key in self.DATASETS:
+                continue
+            # A directory is treated as a user dataset only if it contains dataset_info.json
+            metadata_path = dataset_dir / 'dataset_info.json'
+            if metadata_path.exists():
+                try:
+                    with open(metadata_path, 'r') as f:
+                        info = json.load(f)
+                    # Mark as user dataset (replaces any values the file stored under these two keys)
+                    info['is_user_dataset'] = True
+                    info['dataset_key'] = dataset_key
+                    user_datasets[dataset_key] = info
+                except Exception as e:  # best-effort: a dataset whose metadata fails to load is logged and left out
+                    logger.warning(f"Failed to load metadata for user dataset {dataset_key}: {e}")
+        return user_datasets
+    def get_all_available_datasets(self) -> Dict[str, Dict]:
+        """Get all available datasets (both HuggingFace and user-uploaded)
+        Returns:
+            Dictionary mapping all dataset keys to their metadata
+        """
+        all_datasets = {}
+        # Start with whatever get_downloaded_datasets() reports (the HuggingFace datasets)
+        all_datasets.update(self.get_downloaded_datasets())
+        # Merge user datasets second, so a user entry would replace a downloaded entry with the same key
+        all_datasets.update(self.get_user_datasets())
+        return all_datasets
     def download_dataset(self, dataset_key: str, progress_callback=None) -> Dict:
         """