Luke-Bergen
/

Mineral-Nano-1

+#!/usr/bin/env python3
+"""
+Create SafeTensors files for Mineral Nano 1 Vision (Mid-Range 4.1B)
+This creates randomly initialized weights ready for upload to Hugging Face
+"""
+import torch
+import json
+from safetensors.torch import save_file
+from pathlib import Path
+import math
def initialize_weights(shape, init_type="normal", std=0.02):
    """Return a newly allocated tensor of ``shape`` filled per ``init_type``.

    Args:
        shape: Tensor shape, forwarded unchanged to the torch factory call.
        init_type: ``"normal"`` samples a standard normal scaled by ``std``;
            ``"uniform"`` samples uniformly from ``[-limit, limit)`` where
            ``limit`` is chosen so the distribution's standard deviation is
            ``std``; any other value produces an all-zero tensor.
        std: Target standard deviation for the two random initializers.

    Returns:
        A new ``torch.Tensor`` of the requested shape.
    """
    if init_type == "normal":
        return torch.randn(shape) * std
    if init_type == "uniform":
        # U(-a, a) has variance a**2 / 3, hence a = sqrt(3) * std.
        # (sqrt(3 * std) would only be right if `std` were a variance.)
        limit = math.sqrt(3.0) * std
        return torch.rand(shape) * 2 * limit - limit
    # Every other init_type falls back to zeros.
    return torch.zeros(shape)
def _report_progress(done, total, noun, every=5):
    """Print a progress line every ``every`` items and for the final item."""
    if done % every == 0 or done == total:
        print(f"  ✓ Completed {done}/{total} {noun}")


def _add_language_layer(state_dict, layer_idx, hidden_size, intermediate_size, kv_dim):
    """Insert one language-model layer (attention, MLP, norms) into state_dict."""
    prefix = f"model.layers.{layer_idx}"
    # Insertion order matters: the shard writer walks the dict in this order.
    linear_shapes = {
        "self_attn.q_proj": (hidden_size, hidden_size),
        "self_attn.k_proj": (kv_dim, hidden_size),
        "self_attn.v_proj": (kv_dim, hidden_size),
        "self_attn.o_proj": (hidden_size, hidden_size),
        "mlp.gate_proj": (intermediate_size, hidden_size),
        "mlp.up_proj": (intermediate_size, hidden_size),
        "mlp.down_proj": (hidden_size, intermediate_size),
    }
    for name, shape in linear_shapes.items():
        state_dict[f"{prefix}.{name}.weight"] = initialize_weights(shape, "normal", 0.02)
    # Norm layers here carry a weight only (no bias).
    state_dict[f"{prefix}.input_layernorm.weight"] = torch.ones(hidden_size)
    state_dict[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(hidden_size)


def _add_vision_layer(state_dict, layer_idx, vision_hidden, vision_intermediate):
    """Insert one vision-encoder layer (attention, MLP, norms) into state_dict."""
    prefix = f"vision_model.encoder.layers.{layer_idx}"
    linear_shapes = {
        "self_attn.q_proj": (vision_hidden, vision_hidden),
        "self_attn.k_proj": (vision_hidden, vision_hidden),
        "self_attn.v_proj": (vision_hidden, vision_hidden),
        "self_attn.out_proj": (vision_hidden, vision_hidden),
        "mlp.fc1": (vision_intermediate, vision_hidden),
        "mlp.fc2": (vision_hidden, vision_intermediate),
    }
    for name, shape in linear_shapes.items():
        state_dict[f"{prefix}.{name}.weight"] = initialize_weights(shape, "normal", 0.02)
        # Bias length equals the projection's output dimension.
        state_dict[f"{prefix}.{name}.bias"] = torch.zeros(shape[0])
    for norm_name in ("layer_norm1", "layer_norm2"):
        state_dict[f"{prefix}.{norm_name}.weight"] = torch.ones(vision_hidden)
        state_dict[f"{prefix}.{norm_name}.bias"] = torch.zeros(vision_hidden)


def create_mineral_nano_weights(
    *,
    vocab_size=64000,
    hidden_size=2048,
    intermediate_size=8192,
    num_layers=28,
    num_heads=32,
    num_kv_heads=8,
    vision_hidden=1536,
    vision_intermediate=6144,
    vision_layers=24,
    vision_patches=1024,
    patch_size=14,
):
    """Create all model weights for Mineral Nano 1 Vision.

    Builds a name -> tensor mapping in this insertion order: token
    embeddings, language-model layers, final norm, LM head, vision
    embeddings, vision encoder layers, vision post-layernorm, and the
    two-layer multi-modal projector. Linear and embedding weights are drawn
    via ``initialize_weights(..., "normal", 0.02)``; norm weights are ones and
    all biases are zeros.

    Every argument is keyword-only and defaults to the full-size
    configuration, so calling with no arguments behaves as before. Passing
    smaller values produces a structurally identical but tiny checkpoint,
    which is handy for smoke tests.

    Args:
        vocab_size: Rows of the token embedding and LM head.
        hidden_size: Language-model width.
        intermediate_size: Language-model MLP width.
        num_layers: Number of language-model layers.
        num_heads: Query heads; must divide ``hidden_size``.
        num_kv_heads: Key/value heads; k/v projections have
            ``num_kv_heads * (hidden_size // num_heads)`` output rows.
        vision_hidden: Vision-encoder width.
        vision_intermediate: Vision-encoder MLP width.
        vision_layers: Number of vision-encoder layers.
        vision_patches: Patch positions; one extra position row is added.
            The default is (448 / 14) ** 2 = 1024.
        patch_size: Side length of the square patch-embedding kernel.

    Returns:
        dict mapping parameter names to ``torch.Tensor`` objects.

    Raises:
        ValueError: If ``hidden_size`` is not divisible by ``num_heads``.
    """
    if hidden_size % num_heads != 0:
        raise ValueError(
            f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
        )

    # NOTE(review): summing the default shapes below gives roughly 2.65B
    # parameters (about 10.6 GB if tensors are float32), not 4.1B. The
    # messages are kept as-is because "4.1B" is also the model's published
    # name; confirm which is intended.
    print("Creating Mineral Nano 1 Vision SafeTensors (4.1B parameters)...")
    print("This will take a few minutes and use ~16GB RAM temporarily...")

    state_dict = {}
    kv_dim = num_kv_heads * (hidden_size // num_heads)

    # ============ LANGUAGE MODEL WEIGHTS ============
    print("\n[1/4] Creating language model embeddings...")
    state_dict["model.embed_tokens.weight"] = initialize_weights(
        (vocab_size, hidden_size), "normal", 0.02
    )

    print(f"[2/4] Creating language model layers ({num_layers} layers)...")
    for layer_idx in range(num_layers):
        _add_language_layer(state_dict, layer_idx, hidden_size, intermediate_size, kv_dim)
        _report_progress(layer_idx + 1, num_layers, "layers")

    state_dict["model.norm.weight"] = torch.ones(hidden_size)
    state_dict["lm_head.weight"] = initialize_weights(
        (vocab_size, hidden_size), "normal", 0.02
    )

    # ============ VISION ENCODER WEIGHTS ============
    print(f"[3/4] Creating vision encoder ({vision_layers} layers)...")
    state_dict["vision_model.embeddings.patch_embedding.weight"] = initialize_weights(
        (vision_hidden, 3, patch_size, patch_size), "normal", 0.02
    )
    state_dict["vision_model.embeddings.class_embedding"] = initialize_weights(
        (vision_hidden,), "normal", 0.02
    )
    # One position per patch plus one extra row (paired with class_embedding).
    state_dict["vision_model.embeddings.position_embedding.weight"] = initialize_weights(
        (vision_patches + 1, vision_hidden), "normal", 0.02
    )

    for layer_idx in range(vision_layers):
        _add_vision_layer(state_dict, layer_idx, vision_hidden, vision_intermediate)
        _report_progress(layer_idx + 1, vision_layers, "vision layers")

    state_dict["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    state_dict["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    # ============ MULTI-MODAL PROJECTOR ============
    print("[4/4] Creating vision-language projection...")
    # Maps vision features (vision_hidden) into the language width (hidden_size).
    state_dict["multi_modal_projector.linear_1.weight"] = initialize_weights(
        (hidden_size, vision_hidden), "normal", 0.02
    )
    state_dict["multi_modal_projector.linear_1.bias"] = torch.zeros(hidden_size)
    state_dict["multi_modal_projector.linear_2.weight"] = initialize_weights(
        (hidden_size, hidden_size), "normal", 0.02
    )
    state_dict["multi_modal_projector.linear_2.bias"] = torch.zeros(hidden_size)

    return state_dict
def _plan_shards(state_dict, max_shard_size):
    """Greedily group tensors, in dict order, into (tensors, byte_size) shards."""
    plans = []
    current = {}
    current_size = 0
    for key, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()
        # Close the running shard before it would overflow; a single tensor
        # larger than the limit still gets a shard of its own.
        if current and current_size + tensor_size > max_shard_size:
            plans.append((current, current_size))
            current = {}
            current_size = 0
        current[key] = tensor
        current_size += tensor_size
    if current:
        plans.append((current, current_size))
    return plans


def save_sharded_safetensors(
    state_dict,
    output_dir="./mineral-nano-weights",
    max_shard_size=4.5 * 1024 * 1024 * 1024,
):
    """Save model in sharded format for Hugging Face.

    Tensors are grouped in insertion order into shards of at most
    ``max_shard_size`` bytes. The shard layout is planned before anything is
    written so that each file name carries the real shard count
    (``model-00001-of-0000N.safetensors``) instead of a hard-coded total.
    A ``model.safetensors.index.json`` mapping every tensor name to its shard
    file is written alongside, and a summary is printed.

    Args:
        state_dict: Mapping of parameter name to ``torch.Tensor``.
        output_dir: Destination directory; created (with parents) if missing.
        max_shard_size: Upper bound on a shard's tensor bytes. Defaults to
            4.5 GiB.

    Returns:
        None. The results are the files written and the console output.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving SafeTensors files to {output_dir}/")

    plans = _plan_shards(state_dict, max_shard_size)
    num_shards = len(plans)

    shard_files = []
    weight_map = {}
    for shard_idx, (tensors, shard_size) in enumerate(plans, start=1):
        shard_file = f"model-{shard_idx:05d}-of-{num_shards:05d}.safetensors"
        print(f"  Saving {shard_file} ({shard_size / 1e9:.2f} GB)...")
        # Hugging Face loaders check the "format" metadata entry when
        # opening a safetensors checkpoint.
        save_file(tensors, output_path / shard_file, metadata={"format": "pt"})
        shard_files.append(shard_file)
        for key in tensors:
            weight_map[key] = shard_file

    total_params = sum(t.numel() for t in state_dict.values())
    total_size = sum(shard_size for _, shard_size in plans)

    index = {
        "metadata": {"total_size": total_size},
        "weight_map": weight_map,
    }
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2)
    print("  ✓ Saved index file: model.safetensors.index.json")

    print(f"\n{'='*60}")
    print("✓ SUCCESS! Model weights created:")
    print(f"  Total Parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    print(f"  Total Size: {total_size/1e9:.2f} GB")
    print(f"  Number of Shards: {num_shards}")
    print(f"  Output Directory: {output_path.absolute()}")
    print(f"{'='*60}")
    print("\nNext steps:")
    print("1. Upload these files to Hugging Face:")
    # List what was actually written rather than assuming two shards.
    for shard_file in shard_files:
        print(f"   - {shard_file}")
    print("   - model.safetensors.index.json")
    print("2. Your model will be ready to load!")
    print("\nNote: These are randomly initialized weights.")
    print("For a working model, you need to train or fine-tune.")
if __name__ == "__main__":
    # Script entry point: show a banner, build the weights, then write them.
    rule = "=" * 60
    banner_lines = (
        rule,
        "Mineral Nano 1 Vision - SafeTensors Generator",
        "Mid-Range 4.1B Parameter Model",
        rule,
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Generate every tensor in memory first ...
    state_dict = create_mineral_nano_weights()
    # ... then persist them as shards plus an index file.
    save_sharded_safetensors(state_dict)

    print("\n✓ All done! Your SafeTensors files are ready.")