# File size: 11,189 Bytes

#!/usr/bin/env python3
"""
Create and upload SafeTensors DIRECTLY to Hugging Face
Uses secure login - NO TOKEN IN CODE!
"""

import torch
import json
from safetensors.torch import save_file
from huggingface_hub import HfApi
import tempfile
from pathlib import Path

# ============ CONFIGURE ONLY THIS ============
REPO_NAME = "Luke-Bergen/mineral-nano-1"  # CHANGE THIS to your HF username!
# =============================================

def initialize_weights(shape, init_type="normal", std=0.02):
    """Return a freshly allocated tensor of the given shape.

    Args:
        shape: Tensor shape, forwarded unchanged to ``torch.randn`` / ``torch.zeros``.
        init_type: ``"normal"`` draws standard-normal samples scaled by ``std``;
            any other value produces an all-zeros tensor.
        std: Scale factor applied to the normal samples.

    Returns:
        The initialized ``torch.Tensor``.
    """
    # Guard clause: everything that is not "normal" falls back to zeros.
    if init_type != "normal":
        return torch.zeros(shape)

    samples = torch.randn(shape)
    return samples * std

def create_mineral_nano_weights(
    *,
    vocab_size=64000,
    hidden_size=2048,
    intermediate_size=8192,
    num_layers=28,
    num_heads=32,
    num_kv_heads=8,
    vision_hidden=1536,
    vision_intermediate=6144,
    vision_layers=24,
    vision_patches=1024,
):
    """Create all model weights for Mineral Nano 1 Vision.

    Builds a randomly initialized state dict with four groups of tensors, in
    this insertion order: language-model embeddings and decoder layers, the
    final norm and LM head, the vision encoder, and the vision-to-language
    projector. Projection/embedding weights are drawn via
    ``initialize_weights(..., "normal", 0.02)``; norm weights are ones and all
    biases are zeros.

    With the default configuration the result holds about 2.65B parameters
    (roughly 10.6 GB as float32), so the caller needs that much free RAM.

    Args:
        vocab_size: Rows of the token embedding and LM head.
        hidden_size: Language-model hidden width.
        intermediate_size: Language-model MLP width.
        num_layers: Number of language-model layers.
        num_heads: Number of attention heads; must divide ``hidden_size``.
        num_kv_heads: Number of key/value heads (sets the k/v projection rows).
        vision_hidden: Vision-encoder hidden width.
        vision_intermediate: Vision-encoder MLP width.
        vision_layers: Number of vision-encoder layers.
        vision_patches: Number of patch positions (one extra position is
            added to the position embedding).

    Returns:
        dict mapping parameter name to ``torch.Tensor``.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_heads``.
    """
    if hidden_size % num_heads != 0:
        raise ValueError(
            f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
        )

    # The exact parameter count is reported once everything has been built;
    # no figure is claimed up front because it depends on the configuration.
    print("Creating Mineral Nano 1 Vision weights...")

    state_dict = {}

    def add_normal(name, *shape):
        # All random weights share the same N(0, 0.02^2) initialization.
        state_dict[name] = initialize_weights(shape, "normal", 0.02)

    # ============ LANGUAGE MODEL ============
    print("\n[1/4] Language model embeddings...")
    add_normal("model.embed_tokens.weight", vocab_size, hidden_size)

    print(f"[2/4] Language model layers ({num_layers} layers)...")
    # k/v projections are narrower than q/o: num_kv_heads heads of head_dim each.
    kv_dim = num_kv_heads * (hidden_size // num_heads)
    for layer_idx in range(num_layers):
        prefix = f"model.layers.{layer_idx}"

        # Attention
        add_normal(f"{prefix}.self_attn.q_proj.weight", hidden_size, hidden_size)
        add_normal(f"{prefix}.self_attn.k_proj.weight", kv_dim, hidden_size)
        add_normal(f"{prefix}.self_attn.v_proj.weight", kv_dim, hidden_size)
        add_normal(f"{prefix}.self_attn.o_proj.weight", hidden_size, hidden_size)

        # MLP
        add_normal(f"{prefix}.mlp.gate_proj.weight", intermediate_size, hidden_size)
        add_normal(f"{prefix}.mlp.up_proj.weight", intermediate_size, hidden_size)
        add_normal(f"{prefix}.mlp.down_proj.weight", hidden_size, intermediate_size)

        # Norms
        state_dict[f"{prefix}.input_layernorm.weight"] = torch.ones(hidden_size)
        state_dict[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(hidden_size)

        if (layer_idx + 1) % 5 == 0:
            print(f"  ✓ {layer_idx + 1}/{num_layers} layers done")

    state_dict["model.norm.weight"] = torch.ones(hidden_size)
    add_normal("lm_head.weight", vocab_size, hidden_size)

    print(f"[3/4] Vision encoder ({vision_layers} layers)...")

    # ============ VISION ENCODER ============
    # Patch embedding is a conv-shaped kernel: (out_channels, 3, 14, 14).
    add_normal("vision_model.embeddings.patch_embedding.weight", vision_hidden, 3, 14, 14)
    add_normal("vision_model.embeddings.class_embedding", vision_hidden)
    add_normal(
        "vision_model.embeddings.position_embedding.weight",
        vision_patches + 1,
        vision_hidden,
    )

    for layer_idx in range(vision_layers):
        prefix = f"vision_model.encoder.layers.{layer_idx}"

        # Attention: each projection is a square weight followed by its zero bias.
        for proj in ("q_proj", "k_proj", "v_proj", "out_proj"):
            add_normal(f"{prefix}.self_attn.{proj}.weight", vision_hidden, vision_hidden)
            state_dict[f"{prefix}.self_attn.{proj}.bias"] = torch.zeros(vision_hidden)

        # MLP
        add_normal(f"{prefix}.mlp.fc1.weight", vision_intermediate, vision_hidden)
        state_dict[f"{prefix}.mlp.fc1.bias"] = torch.zeros(vision_intermediate)
        add_normal(f"{prefix}.mlp.fc2.weight", vision_hidden, vision_intermediate)
        state_dict[f"{prefix}.mlp.fc2.bias"] = torch.zeros(vision_hidden)

        # Norms
        for norm in ("layer_norm1", "layer_norm2"):
            state_dict[f"{prefix}.{norm}.weight"] = torch.ones(vision_hidden)
            state_dict[f"{prefix}.{norm}.bias"] = torch.zeros(vision_hidden)

        if (layer_idx + 1) % 5 == 0:
            print(f"  ✓ {layer_idx + 1}/{vision_layers} vision layers done")

    state_dict["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    state_dict["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    print("[4/4] Vision-language projector...")

    # ============ PROJECTOR ============
    add_normal("multi_modal_projector.linear_1.weight", hidden_size, vision_hidden)
    state_dict["multi_modal_projector.linear_1.bias"] = torch.zeros(hidden_size)
    add_normal("multi_modal_projector.linear_2.weight", hidden_size, hidden_size)
    state_dict["multi_modal_projector.linear_2.bias"] = torch.zeros(hidden_size)

    total_params = sum(t.numel() for t in state_dict.values())
    print(f"\n✓ Created {total_params:,} parameters ({total_params/1e9:.2f}B)")

    return state_dict

def _plan_shards(state_dict, max_shard_size):
    """Greedily group tensor names into consecutive shards of at most ``max_shard_size`` bytes.

    Returns a list of ``(keys, size_in_bytes)`` tuples in ``state_dict`` order.
    A single tensor larger than the limit still gets a shard of its own.
    """
    plans = []
    current_keys = []
    current_size = 0

    for key, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()

        # Close the running shard before it would overflow (never emit an empty one).
        if current_keys and current_size + tensor_size > max_shard_size:
            plans.append((current_keys, current_size))
            current_keys = []
            current_size = 0

        current_keys.append(key)
        current_size += tensor_size

    if current_keys:
        plans.append((current_keys, current_size))

    return plans


def upload_to_huggingface(state_dict, repo_name):
    """Upload SafeTensors directly to Hugging Face using saved credentials.

    The state dict is split into shards of at most 4.5 GiB, written to a
    temporary directory together with a ``model.safetensors.index.json``
    weight map, and each file is then uploaded in its own commit. The
    temporary directory is removed when the function returns.

    Shard files are named ``model-XXXXX-of-YYYYY.safetensors`` where ``YYYYY``
    is the real number of shards, so the layout is planned before any file is
    written.

    This function does not create the repository; ``repo_name`` is expected
    to exist already and to be writable with the saved token.

    Args:
        state_dict: Mapping of parameter name to ``torch.Tensor``.
        repo_name: Target model repository id, e.g. ``"user/model"``.
    """
    print(f"\n{'='*60}")
    print(f"Uploading to Hugging Face: {repo_name}")
    print(f"{'='*60}")

    # Initialize HF API (uses saved token from huggingface-cli login)
    api = HfApi()

    # Create temporary directory for files
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Split into shards
        print("\nCreating sharded files...")
        max_shard_size = 4.5 * 1024 * 1024 * 1024  # 4.5 GB

        # Plan first so every filename can carry the true shard count.
        plans = _plan_shards(state_dict, max_shard_size)
        num_shards = len(plans)

        shards = []
        for shard_idx, (keys, shard_size) in enumerate(plans, start=1):
            shard_file = f"model-{shard_idx:05d}-of-{num_shards:05d}.safetensors"
            shard_path = temp_path / shard_file
            print(f"  Creating {shard_file} ({shard_size / 1e9:.2f} GB)...")
            # Tag the file as PyTorch-format; loaders such as transformers
            # look for this metadata entry when reading safetensors files.
            save_file(
                {key: state_dict[key] for key in keys},
                str(shard_path),
                metadata={"format": "pt"},
            )
            shards.append((shard_file, keys))

        # Create index
        index = {
            "metadata": {
                "total_size": sum(t.numel() * t.element_size() for t in state_dict.values())
            },
            "weight_map": {
                key: shard_file for shard_file, keys in shards for key in keys
            },
        }

        index_path = temp_path / "model.safetensors.index.json"
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2)

        print(f"\n{'='*60}")
        print("Uploading to Hugging Face...")
        print(f"{'='*60}")

        # Upload all files
        for shard_file, _ in shards:
            file_path = temp_path / shard_file
            print(f"\n📤 Uploading {shard_file}...")
            api.upload_file(
                path_or_fileobj=str(file_path),
                path_in_repo=shard_file,
                repo_id=repo_name,
                repo_type="model",
                commit_message=f"Add {shard_file}"
            )
            print("   ✓ Uploaded!")

        # Upload index last so it never references shards that are not there yet.
        print("\n📤 Uploading model.safetensors.index.json...")
        api.upload_file(
            path_or_fileobj=str(index_path),
            path_in_repo="model.safetensors.index.json",
            repo_id=repo_name,
            repo_type="model",
            commit_message="Add model index"
        )
        print("   ✓ Uploaded!")

    print(f"\n{'='*60}")
    print("✅ SUCCESS! Model uploaded to:")
    print(f"   https://huggingface.co/{repo_name}")
    print(f"{'='*60}")

if __name__ == "__main__":
    # Script entry point: validate configuration, confirm with the user,
    # build the weights in memory, then upload them to REPO_NAME.
    print("="*60)
    print("Mineral Nano 1 - Direct HuggingFace Upload")
    print("="*60)

    # Verify configuration
    if "your-username" in REPO_NAME:
        print("\n❌ ERROR: Please configure the script first!")
        print("\nEdit this line at the top of the script:")
        print('  REPO_NAME = "your-username/mineral-nano-1"')
        print("\nChange 'your-username' to YOUR HuggingFace username")
        print("\nThen run: huggingface-cli login")
        print("(Paste your token when prompted)")
        # SystemExit instead of exit(): the latter is injected by the `site`
        # module and is not guaranteed to exist (e.g. `python -S`).
        raise SystemExit(1)

    print(f"\nTarget repository: {REPO_NAME}")
    print("This will take 10-20 minutes...")
    print("\nMake sure you ran: huggingface-cli login")
    try:
        input("\nPress ENTER to continue or Ctrl+C to cancel...")
    except KeyboardInterrupt:
        # The prompt advertises Ctrl+C, so leave quietly instead of dumping
        # a traceback; 130 is the conventional exit status for SIGINT.
        print("\nCancelled.")
        raise SystemExit(130) from None

    # Create weights
    print("\n" + "="*60)
    print("STEP 1: Creating model weights")
    print("="*60)
    state_dict = create_mineral_nano_weights()

    # Upload to HF
    print("\n" + "="*60)
    print("STEP 2: Uploading to Hugging Face")
    print("="*60)
    upload_to_huggingface(state_dict, REPO_NAME)

    print("\n✅ All done! Your model is live on Hugging Face!")