Petite-LLM-3 / download_model_advanced.py
Tonic's picture
tries to download the model at build time
19b19f0
raw
history blame
7.41 kB
#!/usr/bin/env python3
"""
Advanced helper script to download the int4 model files using HfFileSystem
"""
import os
import sys
import logging
from pathlib import Path
from tqdm import tqdm
# Configure logging: the root logger is set to INFO, and every function in
# this script reports through the module-level `logger` created here.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Model configuration
# Identifier of the main model. Not referenced by any function visible in this
# script.
MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
# Path passed to HfFileSystem (`exists` / `ls`) to locate the int4 files.
# NOTE(review): presumably "<repo id>/<subfolder>" — confirm against the Hub repo layout.
INT4_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft/int4"
# Local directory the files are downloaded into and later checked in.
LOCAL_MODEL_PATH = "./int4"
def get_file_info(fs, repo_path):
    """Return the detailed listing entries under ``repo_path`` that are files.

    ``fs.ls(repo_path, detail=True)`` is queried and only the entries whose
    ``'type'`` equals ``'file'`` are kept, in listing order. If anything goes
    wrong while listing or filtering, the error is logged and an empty list
    is returned instead of raising.
    """
    try:
        file_entries = []
        for entry in fs.ls(repo_path, detail=True):
            if entry['type'] == 'file':
                file_entries.append(entry)
    except Exception as e:
        logger.error(f"Error listing files in {repo_path}: {e}")
        return []
    return file_entries
def download_with_progress(fs, remote_path, local_path, file_size):
    """Download one remote file to ``local_path`` while showing a progress bar.

    The data is streamed into ``<local_path>.part`` and only moved onto
    ``local_path`` once the whole stream has been read, so an interrupted
    transfer never leaves a truncated file under the final name (the rest of
    this script treats an existing file as a finished download).

    Args:
        fs: File-system object whose ``open(path, 'rb')`` returns a readable
            binary file (an ``HfFileSystem`` in this script).
        remote_path: Path of the file on ``fs``.
        local_path: Destination path on the local disk.
        file_size: Expected size in bytes, used as the progress-bar total.
            ``0`` or ``None`` means "unknown".

    Returns:
        True if the file was fully written to ``local_path``, False if any
        error occurred (the error is logged, not raised).
    """
    # 1 MiB per read keeps the Python-level loop short for multi-GB weights.
    chunk_size = 1024 * 1024
    partial_path = f"{local_path}.part"
    try:
        # dirname is '' for a bare file name, and os.makedirs('') raises.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # total=None lets tqdm show a plain counter when the size is unknown.
        with tqdm(total=file_size or None, unit='B', unit_scale=True, desc=os.path.basename(local_path)) as pbar:
            with fs.open(remote_path, 'rb') as remote_file, open(partial_path, 'wb') as local_file:
                while True:
                    chunk = remote_file.read(chunk_size)
                    if not chunk:
                        break
                    local_file.write(chunk)
                    pbar.update(len(chunk))
        # Publish the finished file under its final name in one step.
        os.replace(partial_path, local_path)
        return True
    except Exception as e:
        logger.error(f"Error downloading {remote_path}: {e}")
        # Best-effort cleanup of the incomplete temporary file.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False
def download_model_advanced():
    """Download the essential int4 model files into ``LOCAL_MODEL_PATH``.

    Lists ``INT4_MODEL_ID`` through ``HfFileSystem``, keeps the entries whose
    base name is one of the essential model files, and downloads each of them
    with ``download_with_progress``.

    Returns:
        True only if at least one essential file was found and every file
        that was found downloaded successfully. False if the remote path does
        not exist, the listing is empty, none of the essential files are
        present, any download fails, or an unexpected error occurs (errors
        are logged, not raised).
    """
    try:
        logger.info(f"Downloading int4 model from {INT4_MODEL_ID}")
        # Create local directory if it doesn't exist
        os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
        # Imported here so that a missing huggingface_hub package is reported
        # through the handler below rather than at module import time.
        from huggingface_hub import HfFileSystem
        fs = HfFileSystem()
        # Check if repository exists
        if not fs.exists(INT4_MODEL_ID):
            logger.error(f"Repository {INT4_MODEL_ID} does not exist")
            return False
        # Get file information
        files = get_file_info(fs, INT4_MODEL_ID)
        if not files:
            logger.error("No files found in repository")
            return False
        # Filter essential model files
        essential_files = {
            'config.json',
            'pytorch_model.bin',
            'tokenizer.json',
            'tokenizer_config.json',
            'special_tokens_map.json',
            'generation_config.json',
        }
        files_to_download = [
            file_info for file_info in files
            if os.path.basename(file_info['name']) in essential_files
        ]
        logger.info(f"Found {len(files_to_download)} essential files to download")
        # Without this guard, zero matches would compare 0 == 0 below and
        # report success although nothing was downloaded.
        if not files_to_download:
            logger.error("None of the essential model files were found in repository")
            return False
        found_names = {os.path.basename(file_info['name']) for file_info in files_to_download}
        absent_names = sorted(essential_files - found_names)
        if absent_names:
            logger.warning(f"Essential files not present in repository: {absent_names}")
        # Download each file
        successful_downloads = 0
        for file_info in files_to_download:
            file_path = file_info['name']
            file_name = os.path.basename(file_path)
            local_file_path = os.path.join(LOCAL_MODEL_PATH, file_name)
            file_size = file_info.get('size', 0)
            logger.info(f"Downloading {file_name} ({file_size} bytes)...")
            # Download the file with progress
            if download_with_progress(fs, file_path, local_file_path, file_size):
                successful_downloads += 1
                logger.info(f"Successfully downloaded {file_name}")
            else:
                logger.error(f"Failed to download {file_name}")
        logger.info(f"Downloaded {successful_downloads}/{len(files_to_download)} files")
        return successful_downloads == len(files_to_download)
    except Exception as e:
        logger.error(f"Error downloading model: {e}")
        return False
def verify_download_advanced():
    """Check that the downloaded files exist and have plausible sizes.

    Each expected file under ``LOCAL_MODEL_PATH`` must exist and its size in
    bytes must fall inside a per-file ``(min_size, max_size)`` range. The
    outcome for every file is logged.

    Returns:
        True if at least 80% of the expected files pass, False otherwise or
        if an unexpected error occurs (the error is logged, not raised).

    NOTE(review): with six expected files the 80% threshold lets exactly one
    file be missing or out of range, and that file may be the weights.
    Confirm this tolerance is intended. The size bounds are heuristics and
    are not derived from the actual repository contents.
    """
    try:
        logger.info("Verifying downloaded model files...")
        # Accepted size range per file: (min_size, max_size) in bytes.
        expected_files = {
            "config.json": (1000, 10000),  # 1 KB .. 10 KB
            "pytorch_model.bin": (1000000, 5000000000),  # 1 MB .. 5 GB
            "tokenizer.json": (10000, 1000000),  # 10 KB .. 1 MB
            "tokenizer_config.json": (100, 10000),  # 100 B .. 10 KB
            "special_tokens_map.json": (100, 10000),  # 100 B .. 10 KB
            "generation_config.json": (100, 10000),  # 100 B .. 10 KB
        }
        verification_results = []
        for file_name, (min_size, max_size) in expected_files.items():
            file_path = os.path.join(LOCAL_MODEL_PATH, file_name)
            if not os.path.exists(file_path):
                logger.error(f"❌ Missing {file_name}")
                verification_results.append(False)
                continue
            actual_size = os.path.getsize(file_path)
            if min_size <= actual_size <= max_size:
                # The success marker was mis-encoded ("βœ…") in the original string.
                logger.info(f"✅ {file_name} verified ({actual_size} bytes)")
                verification_results.append(True)
            else:
                logger.warning(f"⚠️ {file_name} size unexpected ({actual_size} bytes)")
                verification_results.append(False)
        valid_count = sum(verification_results)
        success_rate = valid_count / len(verification_results)
        logger.info(f"Verification complete: {valid_count}/{len(verification_results)} files valid")
        return success_rate >= 0.8  # Allow 20% tolerance
    except Exception as e:
        logger.error(f"Error verifying files: {e}")
        return False
def check_model_files():
    """Report whether every required model file exists in ``LOCAL_MODEL_PATH``.

    Only existence is tested; file contents and sizes are not inspected.

    Returns:
        True when all required files are present, otherwise False after
        logging the names that are missing.
    """
    required_files = (
        "config.json",
        "pytorch_model.bin",
        "tokenizer.json",
        "tokenizer_config.json",
    )
    missing_files = [
        name for name in required_files
        if not os.path.exists(os.path.join(LOCAL_MODEL_PATH, name))
    ]
    if not missing_files:
        logger.info("All required model files found")
        return True
    logger.error(f"Missing model files: {missing_files}")
    return False
def main():
    """Make sure the model files are available locally at build time.

    Skips all work when the required files are already present; otherwise
    downloads them and then verifies the result.

    Returns:
        True if the files were already there or were downloaded and verified,
        False if the download or the verification failed.
    """
    logger.info("Starting advanced model download for Hugging Face Space...")
    # Nothing to do when a previous run already fetched the files.
    if check_model_files():
        logger.info("Model files already exist, skipping download")
        return True
    # Fetch the files; stop here if that fails.
    if not download_model_advanced():
        logger.error("Model download failed")
        return False
    # A download only counts once the files pass verification.
    if not verify_download_advanced():
        logger.error("Model verification failed")
        return False
    logger.info("Model download and verification completed successfully")
    return True
if __name__ == "__main__":
    # Report the outcome to the caller through the process exit status:
    # 0 when main() succeeded, 1 otherwise.
    exit_code = 0 if main() else 1
    sys.exit(exit_code)