Gamahea committed on
Commit 8b1bcac · 1 Parent(s): a9970fa

Fix dataset preparation bug and HF repo authentication

Critical Fixes:
- Fixed NameError: 'datasets_to_process' undefined -> changed to 'dataset_keys'
- Added repo accessibility check to prevent 401 errors
- HF storage now checks if repo exists and is accessible during init
- All upload/download methods check repo_accessible flag before operations
- Changed warnings to debug logs to reduce console noise

HF Repo Handling:
- On init, checks if Gamahea/lemm-dataset exists and is accessible
- If repo not accessible (no token/401), sets repo_accessible=False
- All operations gracefully skip HF uploads/downloads (see the sketch after this list)
- LoRAs and datasets still saved locally
- No more 401 Client Error spam in logs
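
A minimal sketch of the probe-and-guard pattern described above, assuming the class and attribute names from the diff below; the path_in_repo layout and logger setup here are illustrative assumptions, not the repo's actual code:

import logging
from pathlib import Path
from huggingface_hub import HfApi

logger = logging.getLogger(__name__)

class StorageSketch:
    """Illustrative only: mirrors the repo_accessible guard added to HFStorageService."""

    def __init__(self, repo_id: str):
        self.repo_id = repo_id
        self.api = HfApi()  # picks up HF_TOKEN from the environment when present
        try:
            # Raises if the repo is private/missing and no valid token is configured (the 401 case)
            self.api.repo_info(repo_id=repo_id, repo_type="dataset")
            self.repo_accessible = True
        except Exception:
            self.repo_accessible = False  # fall back to local-only storage, no repeated error logs

    def upload_dataset(self, dataset_dir: Path) -> bool:
        if not self.repo_accessible:
            logger.debug("HF repo not accessible, dataset saved locally only: %s", dataset_dir.name)
            return False
        # path_in_repo is an assumed layout for illustration
        self.api.upload_folder(folder_path=str(dataset_dir), path_in_repo=f"datasets/{dataset_dir.name}",
                               repo_id=self.repo_id, repo_type="dataset")
        return True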

User Impact:
- Dataset preparation now works without crashes
- Training completes successfully
- HF Space works without HF_TOKEN (local storage only)
- If HF_TOKEN is configured, uploads work automatically (see the token sketch below)
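
For reference, a hedged sketch of the token path: huggingface_hub reads HF_TOKEN from the environment (on a Space this is typically set as a repository secret), so no code change is needed to enable uploads. Nothing below is taken from the repo's code:

import os
from huggingface_hub import HfApi

# With HF_TOKEN set as a Space secret, HfApi() authenticates automatically;
# passing it explicitly is equivalent and shown only for clarity.
api = HfApi(token=os.environ.get("HF_TOKEN"))

# Quick sanity check that the token grants access to the dataset repo:
info = api.repo_info(repo_id="Gamahea/lemm-dataset", repo_type="dataset")
print(info.id)  # "Gamahea/lemm-dataset" when accessible; raises an HTTP error otherwise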

Files changed (2)
  1. app.py +1 -1
  2. backend/services/hf_storage_service.py +20 -10
app.py CHANGED
@@ -1186,7 +1186,7 @@ def prepare_datasets_for_training(selected_datasets, max_samples_per_dataset):
     # Upload prepared datasets to HF repo
     status_messages.append(f"\n📤 Uploading prepared datasets to HuggingFace repo...")
     upload_count = 0
-    for dataset_key in datasets_to_process:
+    for dataset_key in dataset_keys:  # Fixed: was datasets_to_process
         dataset_dir = Path("training_data") / dataset_key
         if dataset_dir.exists():
             if hf_storage.upload_dataset(dataset_dir):
backend/services/hf_storage_service.py CHANGED
@@ -26,15 +26,25 @@ class HFStorageService:
 
         logger.info(f"HF Storage initialized for repo: {repo_id}")
 
-        # Try to import huggingface_hub
+        # Try to import huggingface_hub and check authentication
         try:
             from huggingface_hub import HfApi, hf_hub_download, upload_folder
             self.api = HfApi()
             self.has_hf = True
-            logger.info("✅ HuggingFace Hub available")
+
+            # Check if repo exists and is accessible
+            try:
+                self.api.repo_info(repo_id=repo_id, repo_type="dataset")
+                logger.info(f"✅ HuggingFace Hub available - repo accessible")
+                self.repo_accessible = True
+            except Exception as e:
+                logger.warning(f"⚠️ HF repo not accessible (may need authentication): {e}")
+                logger.warning("Uploads/downloads will be skipped. Local storage only.")
+                self.repo_accessible = False
         except ImportError:
             logger.warning("⚠️ huggingface_hub not available, using local storage only")
             self.has_hf = False
+            self.repo_accessible = False
 
     def download_all_loras(self, target_dir: Path) -> List[str]:
         """
@@ -46,8 +56,8 @@ class HFStorageService:
         Returns:
             List of downloaded LoRA names
         """
-        if not self.has_hf:
-            logger.warning("HuggingFace Hub not available")
+        if not self.has_hf or not self.repo_accessible:
+            logger.debug("HF repo not accessible, skipping LoRA download")
             return []
 
         try:
@@ -97,8 +107,8 @@ class HFStorageService:
         Returns:
             List of downloaded dataset keys
         """
-        if not self.has_hf:
-            logger.warning("HuggingFace Hub not available")
+        if not self.has_hf or not self.repo_accessible:
+            logger.debug("HF repo not accessible, skipping dataset download")
             return []
 
         try:
@@ -148,8 +158,8 @@ class HFStorageService:
         Returns:
             True if successful
         """
-        if not self.has_hf:
-            logger.warning("HuggingFace Hub not available")
+        if not self.has_hf or not self.repo_accessible:
+            logger.debug(f"HF repo not accessible, LoRA saved locally only: {lora_dir.name}")
             return False
 
         try:
@@ -182,8 +192,8 @@ class HFStorageService:
         Returns:
             True if successful
        """
-        if not self.has_hf:
-            logger.warning("HuggingFace Hub not available")
+        if not self.has_hf or not self.repo_accessible:
+            logger.debug(f"HF repo not accessible, dataset saved locally only: {dataset_dir.name}")
             return False
 
         try:
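
A short usage sketch of the resulting behavior; the constructor signature and the local paths are assumptions for illustration (only the class and method names come from the diff above):

from pathlib import Path
from backend.services.hf_storage_service import HFStorageService

storage = HFStorageService(repo_id="Gamahea/lemm-dataset")  # constructor signature assumed

# Without a token (or with an inaccessible repo) these calls now return quietly
# instead of raising 401 errors; data stays in the local directories.
loras = storage.download_all_loras(Path("loras"))           # [] when repo_accessible is False
ok = storage.upload_dataset(Path("training_data/example"))  # False -> dataset kept locally only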