Spaces:

Tonic
/

Petite-LLM-3

Running on Zero

App Files Files Community

Petite-LLM-3 / download_model_advanced.py

Tonic

tries to download the model at build time

19b19f0 4 months ago

raw

history blame

7.41 kB

	#!/usr/bin/env python3
	"""
	Advanced helper script to download the int4 model files using HfFileSystem
	"""

	import os
	import sys
	import logging
	from pathlib import Path
	from tqdm import tqdm

	# Configure logging: INFO level via basicConfig, plus the module-level logger
	# that every function in this file writes to.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Model configuration
	# MAIN_MODEL_ID is not referenced by any function in the visible code.
	MAIN_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft"
	# Passed to fs.exists() and (via get_file_info) fs.ls() as a filesystem path.
	# NOTE(review): presumably the "int4" subfolder of the main repo rather than a
	# separate repository — confirm against the Hub layout.
	INT4_MODEL_ID = "Tonic/petite-elle-L-aime-3-sft/int4"
	# Local directory the files are downloaded into and later checked/verified in.
	LOCAL_MODEL_PATH = "./int4"

	def get_file_info(fs, repo_path):
	    """Return the detail entries for regular files directly under ``repo_path``.

	    Args:
	        fs: Filesystem object exposing ``ls(path, detail=True)``; the caller in
	            this file passes a ``huggingface_hub.HfFileSystem``.
	        repo_path: Path to list.

	    Returns:
	        The entries returned by ``fs.ls`` whose ``'type'`` is ``'file'``, in
	        listing order. An empty list is returned when listing fails; the error
	        is logged rather than raised.
	    """
	    try:
	        entries = fs.ls(repo_path, detail=True)
	        # .get() so a single entry without a 'type' key is skipped instead of
	        # raising KeyError and discarding the entire listing.
	        return [entry for entry in entries if entry.get('type') == 'file']
	    except Exception as e:
	        logger.error(f"Error listing files in {repo_path}: {e}")
	        return []

	def download_with_progress(fs, remote_path, local_path, file_size, chunk_size=8192):
	    """Stream one remote file to disk while showing a tqdm progress bar.

	    The data is written to ``<local_path>.part`` and renamed to ``local_path``
	    only after the whole stream has been read, so a failed download never
	    leaves a truncated file under the final name.

	    Args:
	        fs: Filesystem object exposing ``open(path, 'rb')``.
	        remote_path: Path of the file to read through ``fs``.
	        local_path: Destination path on the local disk.
	        file_size: Expected size in bytes; used as the progress bar total.
	        chunk_size: Number of bytes requested per read.

	    Returns:
	        True when the file was fully written and moved into place, False on
	        any error (which is logged).
	    """
	    partial_path = f"{local_path}.part"
	    try:
	        # dirname is '' for a bare file name, and os.makedirs('') raises.
	        target_dir = os.path.dirname(local_path)
	        if target_dir:
	            os.makedirs(target_dir, exist_ok=True)

	        # Download with progress bar
	        with tqdm(total=file_size, unit='B', unit_scale=True, desc=os.path.basename(local_path)) as pbar:
	            with fs.open(remote_path, 'rb') as remote_file:
	                with open(partial_path, 'wb') as local_file:
	                    while True:
	                        chunk = remote_file.read(chunk_size)
	                        if not chunk:
	                            break
	                        local_file.write(chunk)
	                        pbar.update(len(chunk))

	        # Publish the file under its final name only once it is complete.
	        os.replace(partial_path, local_path)
	        return True
	    except Exception as e:
	        logger.error(f"Error downloading {remote_path}: {e}")
	        # Best-effort cleanup of the incomplete temp file; it may not exist if
	        # the failure happened before it was opened.
	        try:
	            os.remove(partial_path)
	        except OSError:
	            pass
	        return False

	def download_model_advanced():
	    """Download the essential int4 model files into ``LOCAL_MODEL_PATH``.

	    Lists ``INT4_MODEL_ID`` through ``HfFileSystem``, keeps the files whose
	    base name is in the essential set, and fetches each one with
	    ``download_with_progress``.

	    Returns:
	        True only when at least one essential file was found and every file
	        found was downloaded. False otherwise, including when the remote path
	        does not exist, has no files, or an unexpected error occurs (logged).
	    """
	    # Filter essential model files (set: membership test per listed file)
	    essential_files = {
	        'config.json',
	        'pytorch_model.bin',
	        'tokenizer.json',
	        'tokenizer_config.json',
	        'special_tokens_map.json',
	        'generation_config.json',
	    }

	    try:
	        logger.info(f"Downloading int4 model from {INT4_MODEL_ID}")

	        # Create local directory if it doesn't exist
	        os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)

	        # Imported inside the function, so importing this module does not
	        # itself require huggingface_hub.
	        from huggingface_hub import HfFileSystem

	        fs = HfFileSystem()

	        # Check that the remote path exists before listing it
	        if not fs.exists(INT4_MODEL_ID):
	            logger.error(f"Repository {INT4_MODEL_ID} does not exist")
	            return False

	        files = get_file_info(fs, INT4_MODEL_ID)
	        if not files:
	            logger.error("No files found in repository")
	            return False

	        files_to_download = [
	            file_info for file_info in files
	            if os.path.basename(file_info['name']) in essential_files
	        ]
	        logger.info(f"Found {len(files_to_download)} essential files to download")

	        # Without this guard the final comparison would be 0 == 0 and the
	        # function would report success after downloading nothing.
	        if not files_to_download:
	            logger.error("No essential model files found in repository")
	            return False

	        found_names = {os.path.basename(file_info['name']) for file_info in files_to_download}
	        absent = sorted(essential_files - found_names)
	        if absent:
	            logger.warning(f"Essential files not present in repository: {absent}")

	        # Download each file
	        successful_downloads = 0
	        for file_info in files_to_download:
	            file_path = file_info['name']
	            file_name = os.path.basename(file_path)
	            local_file_path = os.path.join(LOCAL_MODEL_PATH, file_name)
	            file_size = file_info.get('size', 0)

	            logger.info(f"Downloading {file_name} ({file_size} bytes)...")

	            if download_with_progress(fs, file_path, local_file_path, file_size):
	                successful_downloads += 1
	                logger.info(f"Successfully downloaded {file_name}")
	            else:
	                logger.error(f"Failed to download {file_name}")

	        logger.info(f"Downloaded {successful_downloads}/{len(files_to_download)} files")
	        return successful_downloads == len(files_to_download)

	    except Exception as e:
	        logger.error(f"Error downloading model: {e}")
	        return False

	def verify_download_advanced():
	    """Check the downloaded files in ``LOCAL_MODEL_PATH`` for presence and size.

	    Each expected file is compared against an approximate (min, max) byte
	    range. Verification passes when at least 80% of the expected files fall in
	    range AND every critical file (the same four names ``check_model_files``
	    requires) exists and is non-empty. The tolerance therefore covers odd
	    sizes and optional files, but never a missing weights file or config.

	    Returns:
	        True when verification passes, False otherwise or on any unexpected
	        error (which is logged).
	    """
	    # Expected file sizes (approximate): name -> (min_size, max_size) in bytes
	    expected_files = {
	        "config.json": (1000, 10000),
	        "pytorch_model.bin": (1000000, 5000000000),
	        "tokenizer.json": (10000, 1000000),
	        "tokenizer_config.json": (100, 10000),
	        "special_tokens_map.json": (100, 10000),
	        "generation_config.json": (100, 10000),
	    }
	    # Files that must be usable regardless of the overall tolerance.
	    critical_files = {
	        "config.json",
	        "pytorch_model.bin",
	        "tokenizer.json",
	        "tokenizer_config.json",
	    }

	    try:
	        logger.info("Verifying downloaded model files...")

	        verification_results = []
	        critical_ok = True

	        for file_name, (min_size, max_size) in expected_files.items():
	            file_path = os.path.join(LOCAL_MODEL_PATH, file_name)

	            if not os.path.exists(file_path):
	                logger.error(f"❌ Missing {file_name}")
	                verification_results.append(False)
	                if file_name in critical_files:
	                    critical_ok = False
	                continue

	            actual_size = os.path.getsize(file_path)
	            if min_size <= actual_size <= max_size:
	                logger.info(f"✅ {file_name} verified ({actual_size} bytes)")
	                verification_results.append(True)
	            else:
	                logger.warning(f"⚠️ {file_name} size unexpected ({actual_size} bytes)")
	                verification_results.append(False)
	                # An out-of-range size is tolerated; an empty critical file is not.
	                if actual_size == 0 and file_name in critical_files:
	                    critical_ok = False

	        valid_count = sum(verification_results)
	        success_rate = valid_count / len(verification_results)
	        logger.info(f"Verification complete: {valid_count}/{len(verification_results)} files valid")

	        # Allow 20% tolerance on size checks, but never on critical files.
	        return critical_ok and success_rate >= 0.8

	    except Exception as e:
	        logger.error(f"Error verifying files: {e}")
	        return False

	def check_model_files():
	    """Report whether every required model file is already usable locally.

	    A file counts as present only when it is a non-empty regular file inside
	    ``LOCAL_MODEL_PATH``. A zero-byte file or a directory with the right name
	    is treated as missing, so it cannot cause the download to be skipped.

	    Returns:
	        True when all required files are present, False otherwise (the
	        offending names are logged).
	    """
	    required_files = [
	        "config.json",
	        "pytorch_model.bin",
	        "tokenizer.json",
	        "tokenizer_config.json",
	    ]

	    missing_files = []
	    for file_name in required_files:
	        file_path = os.path.join(LOCAL_MODEL_PATH, file_name)
	        # isfile() first: getsize() would raise on a path that does not exist.
	        if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
	            missing_files.append(file_name)

	    if missing_files:
	        logger.error(f"Missing model files: {missing_files}")
	        return False

	    logger.info("All required model files found")
	    return True

	def main():
	    """Make sure the int4 model files are available locally at build time.

	    Skips the download when ``check_model_files`` reports the files are
	    already there; otherwise downloads them and then verifies the result.

	    Returns:
	        True when the files already existed or were downloaded and verified,
	        False when either the download or the verification failed.
	    """
	    logger.info("Starting advanced model download for Hugging Face Space...")

	    # Nothing to do when a previous run already fetched the files.
	    if check_model_files():
	        logger.info("Model files already exist, skipping download")
	        return True

	    # Fetch the files; bail out before verification if that fails.
	    if not download_model_advanced():
	        logger.error("Model download failed")
	        return False

	    # Sanity-check what was written to disk.
	    if not verify_download_advanced():
	        logger.error("Model verification failed")
	        return False

	    logger.info("Model download and verification completed successfully")
	    return True

	if __name__ == "__main__":
	    # Report the outcome as the process exit status: 0 on success, 1 on failure.
	    exit_code = 0 if main() else 1
	    sys.exit(exit_code)