File size: 11,189 Bytes
dbbb522
 
 
439bcc3
dbbb522
 
 
 
 
439bcc3
dbbb522
 
 
439bcc3
 
 
dbbb522
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439bcc3
 
dbbb522
 
 
 
 
439bcc3
 
dbbb522
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439bcc3
dbbb522
439bcc3
dbbb522
439bcc3
 
 
dbbb522
 
 
 
439bcc3
 
dbbb522
 
 
 
 
 
 
 
 
 
 
439bcc3
dbbb522
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#!/usr/bin/env python3
"""
Create and upload SafeTensors DIRECTLY to Hugging Face
Uses secure login - NO TOKEN IN CODE!
"""

import torch
import json
from safetensors.torch import save_file
from huggingface_hub import HfApi
import tempfile
from pathlib import Path

# ============ CONFIGURE ONLY THIS ============
REPO_NAME = "Luke-Bergen/mineral-nano-1"  # CHANGE THIS to your HF username!
# =============================================

def initialize_weights(shape, init_type="normal", std=0.02):
    """Return a new tensor of the given shape.

    Args:
        shape: Tensor dimensions, passed straight to the torch factory.
        init_type: ``"normal"`` draws standard-normal samples scaled by
            ``std``; any other value produces an all-zero tensor.
        std: Multiplier applied to the normal samples (ignored for zeros).
    """
    # Guard clause: everything that is not "normal" falls back to zeros.
    if init_type != "normal":
        return torch.zeros(shape)

    samples = torch.randn(shape)
    return samples * std

def create_mineral_nano_weights():
    """Build a freshly initialized state dict for Mineral Nano 1 Vision.

    The dict contains, in this insertion order (the order matters because
    callers that shard the dict walk it sequentially):

    1. language-model token embeddings,
    2. the decoder layers (attention, MLP and two norm weights each),
       followed by the final norm and ``lm_head``,
    3. the vision encoder (patch/class/position embeddings, encoder layers
       with biases, final layer norm),
    4. the two-layer vision-to-language projector.

    Projection/embedding weights come from ``initialize_weights(..., "normal",
    0.02)``; norm weights are ones and all biases are zeros.  Every tensor is
    fully materialized in memory, and the shapes below add up to roughly
    2.65 billion elements, so expect a correspondingly large allocation.

    Returns:
        dict: Mapping of parameter name to ``torch.Tensor``.
    """

    # The exact parameter count is computed and printed at the end; the
    # banner deliberately does not hard-code a number that could drift from
    # the configuration below.
    print("Creating Mineral Nano 1 Vision weights...")

    state_dict = {}

    # Configuration
    vocab_size = 64000
    hidden_size = 2048
    intermediate_size = 8192
    num_layers = 28
    num_heads = 32
    num_kv_heads = 8

    vision_hidden = 1536
    vision_intermediate = 6144
    vision_layers = 24
    vision_patches = 1024

    # Key/value projections use fewer heads than queries, so their output
    # width is num_kv_heads * head_dim rather than hidden_size.
    kv_size = num_kv_heads * (hidden_size // num_heads)

    def normal(*shape):
        """Draw a normally initialized weight of the given shape (std 0.02)."""
        return initialize_weights(shape, "normal", 0.02)

    # ============ LANGUAGE MODEL ============
    print("\n[1/4] Language model embeddings...")
    state_dict["model.embed_tokens.weight"] = normal(vocab_size, hidden_size)

    print(f"[2/4] Language model layers ({num_layers} layers)...")
    for layer_idx in range(num_layers):
        prefix = f"model.layers.{layer_idx}"

        # Attention
        state_dict[f"{prefix}.self_attn.q_proj.weight"] = normal(hidden_size, hidden_size)
        state_dict[f"{prefix}.self_attn.k_proj.weight"] = normal(kv_size, hidden_size)
        state_dict[f"{prefix}.self_attn.v_proj.weight"] = normal(kv_size, hidden_size)
        state_dict[f"{prefix}.self_attn.o_proj.weight"] = normal(hidden_size, hidden_size)

        # MLP
        state_dict[f"{prefix}.mlp.gate_proj.weight"] = normal(intermediate_size, hidden_size)
        state_dict[f"{prefix}.mlp.up_proj.weight"] = normal(intermediate_size, hidden_size)
        state_dict[f"{prefix}.mlp.down_proj.weight"] = normal(hidden_size, intermediate_size)

        # Norms
        state_dict[f"{prefix}.input_layernorm.weight"] = torch.ones(hidden_size)
        state_dict[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(hidden_size)

        # Report every fifth layer, and always the last one so the log
        # shows completion even when the count is not a multiple of five.
        done = layer_idx + 1
        if done % 5 == 0 or done == num_layers:
            print(f"  ✓ {done}/{num_layers} layers done")

    state_dict["model.norm.weight"] = torch.ones(hidden_size)
    state_dict["lm_head.weight"] = normal(vocab_size, hidden_size)

    print(f"[3/4] Vision encoder ({vision_layers} layers)...")

    # ============ VISION ENCODER ============
    state_dict["vision_model.embeddings.patch_embedding.weight"] = normal(
        vision_hidden, 3, 14, 14
    )
    state_dict["vision_model.embeddings.class_embedding"] = normal(vision_hidden)
    # One extra position on top of the patch positions.
    state_dict["vision_model.embeddings.position_embedding.weight"] = normal(
        vision_patches + 1, vision_hidden
    )

    for layer_idx in range(vision_layers):
        prefix = f"vision_model.encoder.layers.{layer_idx}"

        # Attention: each projection has a weight followed by a zero bias.
        for proj in ("q_proj", "k_proj", "v_proj", "out_proj"):
            state_dict[f"{prefix}.self_attn.{proj}.weight"] = normal(
                vision_hidden, vision_hidden
            )
            state_dict[f"{prefix}.self_attn.{proj}.bias"] = torch.zeros(vision_hidden)

        # MLP
        state_dict[f"{prefix}.mlp.fc1.weight"] = normal(vision_intermediate, vision_hidden)
        state_dict[f"{prefix}.mlp.fc1.bias"] = torch.zeros(vision_intermediate)
        state_dict[f"{prefix}.mlp.fc2.weight"] = normal(vision_hidden, vision_intermediate)
        state_dict[f"{prefix}.mlp.fc2.bias"] = torch.zeros(vision_hidden)

        # Norms
        for norm in ("layer_norm1", "layer_norm2"):
            state_dict[f"{prefix}.{norm}.weight"] = torch.ones(vision_hidden)
            state_dict[f"{prefix}.{norm}.bias"] = torch.zeros(vision_hidden)

        done = layer_idx + 1
        if done % 5 == 0 or done == vision_layers:
            print(f"  ✓ {done}/{vision_layers} vision layers done")

    state_dict["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    state_dict["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    print("[4/4] Vision-language projector...")

    # ============ PROJECTOR ============
    state_dict["multi_modal_projector.linear_1.weight"] = normal(hidden_size, vision_hidden)
    state_dict["multi_modal_projector.linear_1.bias"] = torch.zeros(hidden_size)
    state_dict["multi_modal_projector.linear_2.weight"] = normal(hidden_size, hidden_size)
    state_dict["multi_modal_projector.linear_2.bias"] = torch.zeros(hidden_size)

    total_params = sum(t.numel() for t in state_dict.values())
    print(f"\n✓ Created {total_params:,} parameters ({total_params/1e9:.2f}B)")

    return state_dict

def _group_into_shards(state_dict, max_shard_size):
    """Greedily pack tensors, in insertion order, into size-capped groups.

    Returns a list of ``(tensors, byte_size)`` pairs.  A group is closed as
    soon as adding the next tensor would exceed ``max_shard_size``; a single
    tensor larger than the cap still gets a group of its own.
    """
    groups = []
    current = {}
    current_size = 0

    for key, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()

        if current and current_size + tensor_size > max_shard_size:
            groups.append((current, current_size))
            current = {}
            current_size = 0

        current[key] = tensor
        current_size += tensor_size

    if current:
        groups.append((current, current_size))

    return groups


def upload_to_huggingface(state_dict, repo_name):
    """Shard ``state_dict`` into SafeTensors files and upload them.

    The tensors are split into shards of at most 4.5 GiB, written to a
    temporary directory together with a ``model.safetensors.index.json``
    that maps every parameter name to its shard, and then uploaded one file
    per commit to the model repository ``repo_name``.

    Shard files are named ``model-XXXXX-of-YYYYY.safetensors`` where ``YYYYY``
    is the real number of shards produced; the shard layout is therefore
    planned before any file is written.

    Args:
        state_dict: Mapping of parameter name to tensor (objects exposing
            ``numel()`` and ``element_size()`` that ``save_file`` accepts).
        repo_name: Target repository id, e.g. ``"user/model"``.

    No token is passed to ``HfApi``; it relies on credentials already
    stored on the machine (e.g. via ``huggingface-cli login``).  Errors from
    saving or uploading propagate to the caller.
    """

    print(f"\n{'='*60}")
    print(f"Uploading to Hugging Face: {repo_name}")
    print(f"{'='*60}")

    # Initialize HF API (uses saved token from huggingface-cli login)
    api = HfApi()

    # Plan the split up front so every filename can carry the true total.
    max_shard_size = 4.5 * 1024 * 1024 * 1024  # 4.5 GB
    planned = _group_into_shards(state_dict, max_shard_size)
    total_shards = len(planned)

    # Create temporary directory for files
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        print("\nCreating sharded files...")
        shard_files = []
        weight_map = {}

        for shard_idx, (tensors, shard_bytes) in enumerate(planned, start=1):
            shard_file = f"model-{shard_idx:05d}-of-{total_shards:05d}.safetensors"
            print(f"  Creating {shard_file} ({shard_bytes / 1e9:.2f} GB)...")
            save_file(tensors, str(temp_path / shard_file))

            shard_files.append(shard_file)
            for key in tensors:
                weight_map[key] = shard_file

        # Create index
        index = {
            "metadata": {
                "total_size": sum(shard_bytes for _, shard_bytes in planned)
            },
            "weight_map": weight_map,
        }

        index_path = temp_path / "model.safetensors.index.json"
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2)

        print(f"\n{'='*60}")
        print("Uploading to Hugging Face...")
        print(f"{'='*60}")

        # Upload all files
        for shard_file in shard_files:
            print(f"\n📤 Uploading {shard_file}...")
            api.upload_file(
                path_or_fileobj=str(temp_path / shard_file),
                path_in_repo=shard_file,
                repo_id=repo_name,
                repo_type="model",
                commit_message=f"Add {shard_file}"
            )
            print("   ✓ Uploaded!")

        # Upload the index last so it never references a shard that is
        # not yet in the repository.
        print("\n📤 Uploading model.safetensors.index.json...")
        api.upload_file(
            path_or_fileobj=str(index_path),
            path_in_repo="model.safetensors.index.json",
            repo_id=repo_name,
            repo_type="model",
            commit_message="Add model index"
        )
        print("   ✓ Uploaded!")

    print(f"\n{'='*60}")
    print("✅ SUCCESS! Model uploaded to:")
    print(f"   https://huggingface.co/{repo_name}")
    print(f"{'='*60}")

def main():
    """Run the script: check configuration, build the weights, upload them.

    Exits with status 1 when ``REPO_NAME`` still contains the
    ``your-username`` placeholder.  Otherwise waits for the user to confirm
    with ENTER before creating the weights and uploading them to
    ``REPO_NAME``.
    """
    print("="*60)
    print("Mineral Nano 1 - Direct HuggingFace Upload")
    print("="*60)

    # Verify configuration
    if "your-username" in REPO_NAME:
        print("\n❌ ERROR: Please configure the script first!")
        print("\nEdit this line at the top of the script:")
        print('  REPO_NAME = "your-username/mineral-nano-1"')
        print("\nChange 'your-username' to YOUR HuggingFace username")
        print("\nThen run: huggingface-cli login")
        print("(Paste your token when prompted)")
        # SystemExit instead of exit(): the latter is injected by the
        # ``site`` module and is not guaranteed to exist in every runtime.
        raise SystemExit(1)

    print(f"\nTarget repository: {REPO_NAME}")
    print("This will take 10-20 minutes...")
    print("\nMake sure you ran: huggingface-cli login")
    input("\nPress ENTER to continue or Ctrl+C to cancel...")

    # Create weights
    print("\n" + "="*60)
    print("STEP 1: Creating model weights")
    print("="*60)
    state_dict = create_mineral_nano_weights()

    # Upload to HF
    print("\n" + "="*60)
    print("STEP 2: Uploading to Hugging Face")
    print("="*60)
    upload_to_huggingface(state_dict, REPO_NAME)

    print("\n✅ All done! Your model is live on Hugging Face!")


if __name__ == "__main__":
    main()