Gamahea committed
Commit 661fe20 · Parent: 9a8320c

Upgrade to complete datasets with 11TB Pro storage


Added: Million Song Dataset, FMA Large, MusicCaps, AudioSet Music, complete NSynth/MAESTRO/Common Voice/LibriSpeech

Files changed (2)
  1. app.py +24 -14
  2. backend/services/dataset_service.py +6 -5
app.py CHANGED
@@ -889,13 +889,18 @@ def download_prepare_datasets(vocal_datasets, symbolic_datasets):
     dataset_map = {
         # Music datasets
         "GTZAN Music Genre (1000 tracks, 10 genres)": "gtzan",
-        "NSynth Musical Notes (Validation set)": "nsynth_valid",
-        "MAESTRO Piano Performances (subset)": "maestro",
+        "NSynth Complete (300k+ musical notes)": "nsynth",
+        "MAESTRO Piano Performances (complete)": "maestro",
+        "Million Song Dataset (10k subset)": "million_song",
+        "Free Music Archive Large (106k tracks)": "fma_large",
+        "MusicCaps (5.5k clips with descriptions)": "musiccaps",
+        "AudioSet Music (labeled audio events)": "audioset_music",
         # Vocal & Sound datasets
-        "LJSpeech (13k vocal clips, single speaker)": "ljspeech",
-        "Common Voice English (diverse speakers)": "common_voice_en",
-        "ESC-50 Environmental Sounds (2000 samples)": "esc50",
-        "Google Speech Commands (short words)": "speech_commands"
+        "LJSpeech (13k vocal clips)": "ljspeech",
+        "Common Voice English (complete)": "common_voice_en",
+        "LibriSpeech Complete (1000 hours)": "librispeech",
+        "ESC-50 Environmental Sounds": "esc50",
+        "Google Speech Commands": "speech_commands"
     }
 
     status_messages = []
@@ -1565,24 +1570,29 @@ with gr.Blocks(
             vocal_datasets = gr.CheckboxGroup(
                 choices=[
                     "GTZAN Music Genre (1000 tracks, 10 genres)",
-                    "NSynth Musical Notes (Validation set)",
-                    "MAESTRO Piano Performances (subset)"
+                    "NSynth Complete (300k+ musical notes)",
+                    "MAESTRO Piano Performances (complete)",
+                    "Million Song Dataset (10k subset)",
+                    "Free Music Archive Large (106k tracks)",
+                    "MusicCaps (5.5k clips with descriptions)",
+                    "AudioSet Music (labeled audio events)"
                 ],
                 label="Select Music Datasets",
-                info="Music and instrument datasets for style learning"
+                info="Comprehensive music datasets for training (11TB storage available)"
             )
 
             with gr.Column():
                 gr.Markdown("**Vocal & Sound Datasets**")
                 symbolic_datasets = gr.CheckboxGroup(
                     choices=[
-                        "LJSpeech (13k vocal clips, single speaker)",
-                        "Common Voice English (diverse speakers)",
-                        "ESC-50 Environmental Sounds (2000 samples)",
-                        "Google Speech Commands (short words)"
+                        "LJSpeech (13k vocal clips)",
+                        "Common Voice English (complete)",
+                        "LibriSpeech Complete (1000 hours)",
+                        "ESC-50 Environmental Sounds",
+                        "Google Speech Commands"
                     ],
                     label="Select Vocal/Sound Datasets",
-                    info="Vocal and sound effect datasets"
+                    info="Complete vocal and sound datasets"
                )
 
                 dataset_download_btn = gr.Button("📥 Download & Prepare Datasets", variant="secondary")
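
Editor's note: the keys of dataset_map must match the CheckboxGroup choice strings character for character, since the handler resolves each selected label to a backend dataset ID through the map; that is why both sides of this diff change in lockstep. A minimal sketch of that lookup, assuming the handler only resolves labels and reports status (the loop body, message format, and the truncated map excerpt are illustrative, not the actual implementation in app.py):

    def download_prepare_datasets(vocal_datasets, symbolic_datasets):
        # Excerpt of the label -> backend id map from this commit (truncated for the sketch)
        dataset_map = {
            "GTZAN Music Genre (1000 tracks, 10 genres)": "gtzan",
            "NSynth Complete (300k+ musical notes)": "nsynth",
            "LibriSpeech Complete (1000 hours)": "librispeech",
        }
        # Labels arrive exactly as shown in the two checkbox groups
        selected = list(vocal_datasets or []) + list(symbolic_datasets or [])
        status_messages = []
        for label in selected:
            dataset_id = dataset_map.get(label)
            if dataset_id is None:
                status_messages.append(f"❓ Unknown dataset label: {label}")
                continue
            # The real handler would call into the dataset service here; this sketch only reports
            status_messages.append(f"📥 Queued '{label}' as dataset id '{dataset_id}'")
        return "\n".join(status_messages)

If a choice string is edited in the UI without updating dataset_map (or vice versa), the lookup silently misses that dataset.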
backend/services/dataset_service.py CHANGED
@@ -130,12 +130,13 @@ class DatasetService:
         if progress_callback:
             progress_callback(f"📦 Starting download: {dataset_name}")
 
-            # Warn about dataset size
+            # Show dataset size info
             size_gb = dataset_config.get('size_gb', 0)
-            if size_gb > 1.0:
-                progress_callback(f"⚠️ WARNING: Dataset size is {size_gb:.1f} GB")
-                progress_callback(f" This exceeds the 1 GB HuggingFace Space limit!")
-                progress_callback(f" Download may fail or fill storage completely.")
+            if size_gb > 100.0:
+                progress_callback(f"⚠️ Large dataset: {size_gb:.1f} GB")
+                progress_callback(f" This may take significant time to download.")
+            elif size_gb > 10.0:
+                progress_callback(f"ℹ️ Dataset size: ~{size_gb:.1f} GB (may take a few minutes)")
             else:
                 progress_callback(f"ℹ️ Dataset size: ~{size_gb:.1f} GB")
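
Editor's note: this change replaces the old hard 1 GB warning (which assumed the default HuggingFace Space disk) with two thresholds suited to the 11TB Pro storage. A standalone sketch of the same tiering, with an illustrative helper name and example sizes that are not from the repo:

    def size_message(size_gb: float) -> str:
        # Same thresholds as the updated dataset_service.py: warn above 100 GB,
        # note a longer download above 10 GB, otherwise just report the size.
        if size_gb > 100.0:
            return f"⚠️ Large dataset: {size_gb:.1f} GB (may take significant time to download)"
        elif size_gb > 10.0:
            return f"ℹ️ Dataset size: ~{size_gb:.1f} GB (may take a few minutes)"
        else:
            return f"ℹ️ Dataset size: ~{size_gb:.1f} GB"

    for gb in (0.6, 25.0, 150.0):  # illustrative sizes, not tied to specific datasets
        print(size_message(gb))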