Luke-Bergen committed on
Commit dbbb522 · verified · 1 Parent(s): 6c759b5

Create upload_directly_to_hf.py

Files changed (1)
  1. upload_directly_to_hf.py +294 -0
upload_directly_to_hf.py ADDED
@@ -0,0 +1,294 @@
#!/usr/bin/env python3
"""
Create and upload SafeTensors DIRECTLY to Hugging Face.
No local folder needed - the shards go straight to your repo.
"""

import torch
import json
from safetensors.torch import save_file
from huggingface_hub import HfApi, create_repo
import tempfile
from pathlib import Path

# ============ CONFIGURE THIS ============
REPO_NAME = "your-username/mineral-nano-1"  # CHANGE THIS to your HF username!
HF_TOKEN = "your_token_here"  # Get from huggingface.co/settings/tokens
# ========================================
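
# Optional hardening (a sketch, not part of the original script): instead of
# hardcoding the token above, it can be read from an environment variable so it
# never ends up in version control. This assumes you export HF_TOKEN in your shell:
#
#   import os
#   HF_TOKEN = os.environ.get("HF_TOKEN", HF_TOKEN)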

def initialize_weights(shape, init_type="normal", std=0.02):
    """Initialize a tensor with the requested initialization scheme."""
    if init_type == "normal":
        return torch.randn(shape) * std
    else:
        return torch.zeros(shape)
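
# Note: everything below is randomly initialized with torch.randn / torch.zeros,
# so the resulting checkpoint has the right shapes but contains no trained weights.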

def create_mineral_nano_weights():
    """Create all model weights for Mineral Nano 1 Vision"""

    print("Creating Mineral Nano 1 Vision weights...")

    state_dict = {}

    # Configuration
    vocab_size = 64000
    hidden_size = 2048
    intermediate_size = 8192
    num_layers = 28
    num_heads = 32
    num_kv_heads = 8

    vision_hidden = 1536
    vision_intermediate = 6144
    vision_layers = 24
    vision_patches = 1024
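
    # Derived shapes, for reference (these follow from the numbers above):
    #   head_dim = hidden_size // num_heads = 64, so the grouped-query K/V
    #   projections below are (num_kv_heads * head_dim, hidden_size) = (512, 2048).
    #   vision_patches = 1024 presumably corresponds to a 32 x 32 grid of
    #   14-pixel patches, i.e. a 448 x 448 input image.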

    # ============ LANGUAGE MODEL ============
    print("\n[1/4] Language model embeddings...")
    state_dict["model.embed_tokens.weight"] = initialize_weights(
        (vocab_size, hidden_size), "normal", 0.02
    )

    print("[2/4] Language model layers (28 layers)...")
    for layer_idx in range(num_layers):
        prefix = f"model.layers.{layer_idx}"

        # Attention
        state_dict[f"{prefix}.self_attn.q_proj.weight"] = initialize_weights(
            (hidden_size, hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.k_proj.weight"] = initialize_weights(
            (num_kv_heads * (hidden_size // num_heads), hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.v_proj.weight"] = initialize_weights(
            (num_kv_heads * (hidden_size // num_heads), hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.o_proj.weight"] = initialize_weights(
            (hidden_size, hidden_size), "normal", 0.02
        )

        # MLP
        state_dict[f"{prefix}.mlp.gate_proj.weight"] = initialize_weights(
            (intermediate_size, hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.mlp.up_proj.weight"] = initialize_weights(
            (intermediate_size, hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.mlp.down_proj.weight"] = initialize_weights(
            (hidden_size, intermediate_size), "normal", 0.02
        )

        # Norms
        state_dict[f"{prefix}.input_layernorm.weight"] = torch.ones(hidden_size)
        state_dict[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(hidden_size)

        if (layer_idx + 1) % 5 == 0:
            print(f"  ✓ {layer_idx + 1}/{num_layers} layers done")

    state_dict["model.norm.weight"] = torch.ones(hidden_size)
    state_dict["lm_head.weight"] = initialize_weights(
        (vocab_size, hidden_size), "normal", 0.02
    )
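
    # Note: lm_head.weight is created as its own tensor, i.e. it is not tied to
    # model.embed_tokens.weight; tying them would save ~131M parameters if desired.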

    print("[3/4] Vision encoder (24 layers)...")

    # ============ VISION ENCODER ============
    state_dict["vision_model.embeddings.patch_embedding.weight"] = initialize_weights(
        (vision_hidden, 3, 14, 14), "normal", 0.02
    )
    state_dict["vision_model.embeddings.class_embedding"] = initialize_weights(
        (vision_hidden,), "normal", 0.02
    )
    state_dict["vision_model.embeddings.position_embedding.weight"] = initialize_weights(
        (vision_patches + 1, vision_hidden), "normal", 0.02
    )

    for layer_idx in range(vision_layers):
        prefix = f"vision_model.encoder.layers.{layer_idx}"

        # Attention
        state_dict[f"{prefix}.self_attn.q_proj.weight"] = initialize_weights(
            (vision_hidden, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.q_proj.bias"] = torch.zeros(vision_hidden)
        state_dict[f"{prefix}.self_attn.k_proj.weight"] = initialize_weights(
            (vision_hidden, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.k_proj.bias"] = torch.zeros(vision_hidden)
        state_dict[f"{prefix}.self_attn.v_proj.weight"] = initialize_weights(
            (vision_hidden, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.v_proj.bias"] = torch.zeros(vision_hidden)
        state_dict[f"{prefix}.self_attn.out_proj.weight"] = initialize_weights(
            (vision_hidden, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.out_proj.bias"] = torch.zeros(vision_hidden)

        # MLP
        state_dict[f"{prefix}.mlp.fc1.weight"] = initialize_weights(
            (vision_intermediate, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.mlp.fc1.bias"] = torch.zeros(vision_intermediate)
        state_dict[f"{prefix}.mlp.fc2.weight"] = initialize_weights(
            (vision_hidden, vision_intermediate), "normal", 0.02
        )
        state_dict[f"{prefix}.mlp.fc2.bias"] = torch.zeros(vision_hidden)

        # Norms
        state_dict[f"{prefix}.layer_norm1.weight"] = torch.ones(vision_hidden)
        state_dict[f"{prefix}.layer_norm1.bias"] = torch.zeros(vision_hidden)
        state_dict[f"{prefix}.layer_norm2.weight"] = torch.ones(vision_hidden)
        state_dict[f"{prefix}.layer_norm2.bias"] = torch.zeros(vision_hidden)

        if (layer_idx + 1) % 5 == 0:
            print(f"  ✓ {layer_idx + 1}/{vision_layers} vision layers done")

    state_dict["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    state_dict["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    print("[4/4] Vision-language projector...")

    # ============ PROJECTOR ============
    state_dict["multi_modal_projector.linear_1.weight"] = initialize_weights(
        (hidden_size, vision_hidden), "normal", 0.02
    )
    state_dict["multi_modal_projector.linear_1.bias"] = torch.zeros(hidden_size)
    state_dict["multi_modal_projector.linear_2.weight"] = initialize_weights(
        (hidden_size, hidden_size), "normal", 0.02
    )
    state_dict["multi_modal_projector.linear_2.bias"] = torch.zeros(hidden_size)

    total_params = sum(t.numel() for t in state_dict.values())
    print(f"\n✓ Created {total_params:,} parameters ({total_params/1e9:.2f}B)")

    return state_dict
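
# Rough size check (an estimate, not from the original script): with the configuration
# above the state dict comes out to roughly 2.65B float32 parameters, i.e. about
# 10.6 GB on disk, so the 4.5 GB shard limit below should produce about three shards.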

def upload_to_huggingface(state_dict, repo_name, token):
    """Upload SafeTensors shards directly to Hugging Face"""

    print(f"\n{'='*60}")
    print(f"Uploading to Hugging Face: {repo_name}")
    print(f"{'='*60}")

    # Initialize the HF API and make sure the target repo exists
    api = HfApi(token=token)
    create_repo(repo_name, token=token, repo_type="model", exist_ok=True)

    # Create a temporary directory for the shard files
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Split the state dict into shards; the files are named only once the
        # total shard count is known, so the "-of-XXXXX" suffix is always correct
        print("\nCreating sharded files...")
        max_shard_size = 4.5 * 1024 * 1024 * 1024  # 4.5 GB per shard

        shard_dicts = [{}]
        shard_sizes = [0]

        for key, tensor in state_dict.items():
            tensor_size = tensor.numel() * tensor.element_size()

            if shard_sizes[-1] + tensor_size > max_shard_size and shard_dicts[-1]:
                # Start a new shard
                shard_dicts.append({})
                shard_sizes.append(0)

            shard_dicts[-1][key] = tensor
            shard_sizes[-1] += tensor_size

        num_shards = len(shard_dicts)
        shards = []
        for shard_idx, shard_dict in enumerate(shard_dicts, start=1):
            shard_file = f"model-{shard_idx:05d}-of-{num_shards:05d}.safetensors"
            shard_path = temp_path / shard_file
            shard_size = shard_sizes[shard_idx - 1]
            print(f"  Creating {shard_file} ({shard_size / 1e9:.2f} GB)...")
            save_file(shard_dict, str(shard_path))
            shards.append((shard_file, list(shard_dict.keys())))

        # Create the index that maps each weight to its shard
        index = {
            "metadata": {
                "total_size": sum(t.numel() * t.element_size() for t in state_dict.values())
            },
            "weight_map": {}
        }

        for shard_file, keys in shards:
            for key in keys:
                index["weight_map"][key] = shard_file

        index_path = temp_path / "model.safetensors.index.json"
        with open(index_path, "w") as f:
            json.dump(index, f, indent=2)
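
        # For reference, the index written above has this shape (illustrative only):
        #   {"metadata": {"total_size": <bytes>},
        #    "weight_map": {"<tensor name>": "<shard file>", ...}}
        # which matches the layout transformers expects for sharded safetensors checkpoints.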

        print(f"\n{'='*60}")
        print("Uploading to Hugging Face...")
        print(f"{'='*60}")

        # Upload each shard file
        for shard_file, _ in shards:
            file_path = temp_path / shard_file
            print(f"\n📤 Uploading {shard_file}...")
            api.upload_file(
                path_or_fileobj=str(file_path),
                path_in_repo=shard_file,
                repo_id=repo_name,
                repo_type="model",
                commit_message=f"Add {shard_file}"
            )
            print("  ✓ Uploaded!")

        # Upload the index
        print("\n📤 Uploading model.safetensors.index.json...")
        api.upload_file(
            path_or_fileobj=str(index_path),
            path_in_repo="model.safetensors.index.json",
            repo_id=repo_name,
            repo_type="model",
            commit_message="Add model index"
        )
        print("  ✓ Uploaded!")

    print(f"\n{'='*60}")
    print("✅ SUCCESS! Model uploaded to:")
    print(f"  https://huggingface.co/{repo_name}")
    print(f"{'='*60}")


if __name__ == "__main__":
    print("="*60)
    print("Mineral Nano 1 - Direct HuggingFace Upload")
    print("="*60)

    # Verify configuration
    if "your-username" in REPO_NAME or "your_token" in HF_TOKEN:
        print("\n❌ ERROR: Please configure the script first!")
        print("\nEdit these lines at the top of the script:")
        print('    REPO_NAME = "your-username/mineral-nano-1"')
        print('    HF_TOKEN = "your_token_here"')
        print("\nGet your token from: https://huggingface.co/settings/tokens")
        exit(1)

    print(f"\nTarget repository: {REPO_NAME}")
    print("This will take 10-20 minutes...")

    # Create weights
    print("\n" + "="*60)
    print("STEP 1: Creating model weights")
    print("="*60)
    state_dict = create_mineral_nano_weights()

    # Upload to HF
    print("\n" + "="*60)
    print("STEP 2: Uploading to Hugging Face")
    print("="*60)
    upload_to_huggingface(state_dict, REPO_NAME, HF_TOKEN)

    print("\n✅ All done! Your model is live on Hugging Face!")