Luke-Bergen committed on
Commit dbbb522 · verified · 1 Parent(s): 6c759b5

Create upload_directly_to_hf.py

Files changed (1)
  1. upload_directly_to_hf.py +294 -0
upload_directly_to_hf.py ADDED
@@ -0,0 +1,294 @@
#!/usr/bin/env python3
"""
Create and upload SafeTensors DIRECTLY to Hugging Face.
No local folder needed - the shards go straight to your repo.
"""

import torch
import json
from safetensors.torch import save_file
from huggingface_hub import HfApi, create_repo
import tempfile
from pathlib import Path

# ============ CONFIGURE THIS ============
REPO_NAME = "your-username/mineral-nano-1"  # CHANGE THIS to your HF username!
HF_TOKEN = "your_token_here"  # Get from huggingface.co/settings/tokens
# ========================================
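
# Optional hardening (a sketch, not part of the original script): instead of
# hardcoding the token above, it can be read from an environment variable so it
# never ends up in version control. This assumes you export HF_TOKEN in your shell:
#
#   import os
#   HF_TOKEN = os.environ.get("HF_TOKEN", HF_TOKEN)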

def initialize_weights(shape, init_type="normal", std=0.02):
    """Initialize a tensor with the requested initialization scheme."""
    if init_type == "normal":
        return torch.randn(shape) * std
    else:
        return torch.zeros(shape)
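
# Note: everything below is randomly initialized with torch.randn / torch.zeros,
# so the resulting checkpoint has the right shapes but contains no trained weights.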

def create_mineral_nano_weights():
    """Create all model weights for Mineral Nano 1 Vision"""

    print("Creating Mineral Nano 1 Vision weights...")

    state_dict = {}

    # Configuration
    vocab_size = 64000
    hidden_size = 2048
    intermediate_size = 8192
    num_layers = 28
    num_heads = 32
    num_kv_heads = 8

    vision_hidden = 1536
    vision_intermediate = 6144
    vision_layers = 24
    vision_patches = 1024
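
    # Derived shapes, for reference (these follow from the numbers above):
    #   head_dim = hidden_size // num_heads = 64, so the grouped-query K/V
    #   projections below are (num_kv_heads * head_dim, hidden_size) = (512, 2048).
    #   vision_patches = 1024 presumably corresponds to a 32 x 32 grid of
    #   14-pixel patches, i.e. a 448 x 448 input image.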

    # ============ LANGUAGE MODEL ============
    print("\n[1/4] Language model embeddings...")
    state_dict["model.embed_tokens.weight"] = initialize_weights(
        (vocab_size, hidden_size), "normal", 0.02
    )

    print("[2/4] Language model layers (28 layers)...")
    for layer_idx in range(num_layers):
        prefix = f"model.layers.{layer_idx}"

        # Attention
        state_dict[f"{prefix}.self_attn.q_proj.weight"] = initialize_weights(
            (hidden_size, hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.k_proj.weight"] = initialize_weights(
            (num_kv_heads * (hidden_size // num_heads), hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.v_proj.weight"] = initialize_weights(
            (num_kv_heads * (hidden_size // num_heads), hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.o_proj.weight"] = initialize_weights(
            (hidden_size, hidden_size), "normal", 0.02
        )

        # MLP
        state_dict[f"{prefix}.mlp.gate_proj.weight"] = initialize_weights(
            (intermediate_size, hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.mlp.up_proj.weight"] = initialize_weights(
            (intermediate_size, hidden_size), "normal", 0.02
        )
        state_dict[f"{prefix}.mlp.down_proj.weight"] = initialize_weights(
            (hidden_size, intermediate_size), "normal", 0.02
        )

        # Norms
        state_dict[f"{prefix}.input_layernorm.weight"] = torch.ones(hidden_size)
        state_dict[f"{prefix}.post_attention_layernorm.weight"] = torch.ones(hidden_size)

        if (layer_idx + 1) % 5 == 0:
            print(f"  ✓ {layer_idx + 1}/{num_layers} layers done")

    state_dict["model.norm.weight"] = torch.ones(hidden_size)
    state_dict["lm_head.weight"] = initialize_weights(
        (vocab_size, hidden_size), "normal", 0.02
    )
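
    # Note: lm_head.weight is created as its own tensor, i.e. it is not tied to
    # model.embed_tokens.weight; tying them would save ~131M parameters if desired.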

    print("[3/4] Vision encoder (24 layers)...")

    # ============ VISION ENCODER ============
    state_dict["vision_model.embeddings.patch_embedding.weight"] = initialize_weights(
        (vision_hidden, 3, 14, 14), "normal", 0.02
    )
    state_dict["vision_model.embeddings.class_embedding"] = initialize_weights(
        (vision_hidden,), "normal", 0.02
    )
    state_dict["vision_model.embeddings.position_embedding.weight"] = initialize_weights(
        (vision_patches + 1, vision_hidden), "normal", 0.02
    )

    for layer_idx in range(vision_layers):
        prefix = f"vision_model.encoder.layers.{layer_idx}"

        # Attention
        state_dict[f"{prefix}.self_attn.q_proj.weight"] = initialize_weights(
            (vision_hidden, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.q_proj.bias"] = torch.zeros(vision_hidden)
        state_dict[f"{prefix}.self_attn.k_proj.weight"] = initialize_weights(
            (vision_hidden, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.k_proj.bias"] = torch.zeros(vision_hidden)
        state_dict[f"{prefix}.self_attn.v_proj.weight"] = initialize_weights(
            (vision_hidden, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.v_proj.bias"] = torch.zeros(vision_hidden)
        state_dict[f"{prefix}.self_attn.out_proj.weight"] = initialize_weights(
            (vision_hidden, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.self_attn.out_proj.bias"] = torch.zeros(vision_hidden)

        # MLP
        state_dict[f"{prefix}.mlp.fc1.weight"] = initialize_weights(
            (vision_intermediate, vision_hidden), "normal", 0.02
        )
        state_dict[f"{prefix}.mlp.fc1.bias"] = torch.zeros(vision_intermediate)
        state_dict[f"{prefix}.mlp.fc2.weight"] = initialize_weights(
            (vision_hidden, vision_intermediate), "normal", 0.02
        )
        state_dict[f"{prefix}.mlp.fc2.bias"] = torch.zeros(vision_hidden)

        # Norms
        state_dict[f"{prefix}.layer_norm1.weight"] = torch.ones(vision_hidden)
        state_dict[f"{prefix}.layer_norm1.bias"] = torch.zeros(vision_hidden)
        state_dict[f"{prefix}.layer_norm2.weight"] = torch.ones(vision_hidden)
        state_dict[f"{prefix}.layer_norm2.bias"] = torch.zeros(vision_hidden)

        if (layer_idx + 1) % 5 == 0:
            print(f"  ✓ {layer_idx + 1}/{vision_layers} vision layers done")

    state_dict["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    state_dict["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    print("[4/4] Vision-language projector...")

    # ============ PROJECTOR ============
    state_dict["multi_modal_projector.linear_1.weight"] = initialize_weights(
        (hidden_size, vision_hidden), "normal", 0.02
    )
    state_dict["multi_modal_projector.linear_1.bias"] = torch.zeros(hidden_size)
    state_dict["multi_modal_projector.linear_2.weight"] = initialize_weights(
        (hidden_size, hidden_size), "normal", 0.02
    )
    state_dict["multi_modal_projector.linear_2.bias"] = torch.zeros(hidden_size)

    total_params = sum(t.numel() for t in state_dict.values())
    print(f"\n✓ Created {total_params:,} parameters ({total_params/1e9:.2f}B)")

    return state_dict
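
# Rough size check (an estimate, not from the original script): with the configuration
# above the state dict comes out to roughly 2.65B float32 parameters, i.e. about
# 10.6 GB on disk, so the 4.5 GB shard limit below should produce about three shards.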

def upload_to_huggingface(state_dict, repo_name, token):
    """Upload SafeTensors shards directly to Hugging Face"""

    print(f"\n{'='*60}")
    print(f"Uploading to Hugging Face: {repo_name}")
    print(f"{'='*60}")

    # Initialize the HF API and make sure the target repo exists
    api = HfApi(token=token)
    create_repo(repo_name, token=token, repo_type="model", exist_ok=True)

    # Create a temporary directory for the shard files
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)

        # Split the state dict into shards; the files are named only once the
        # total shard count is known, so the "-of-XXXXX" suffix is always correct
        print("\nCreating sharded files...")
        max_shard_size = 4.5 * 1024 * 1024 * 1024  # 4.5 GB per shard

        shard_dicts = [{}]
        shard_sizes = [0]

        for key, tensor in state_dict.items():
            tensor_size = tensor.numel() * tensor.element_size()

            if shard_sizes[-1] + tensor_size > max_shard_size and shard_dicts[-1]:
                # Start a new shard
                shard_dicts.append({})
                shard_sizes.append(0)

            shard_dicts[-1][key] = tensor
            shard_sizes[-1] += tensor_size

        num_shards = len(shard_dicts)
        shards = []
        for shard_idx, shard_dict in enumerate(shard_dicts, start=1):
            shard_file = f"model-{shard_idx:05d}-of-{num_shards:05d}.safetensors"
            shard_path = temp_path / shard_file
            shard_size = shard_sizes[shard_idx - 1]
            print(f"  Creating {shard_file} ({shard_size / 1e9:.2f} GB)...")
            save_file(shard_dict, str(shard_path))
            shards.append((shard_file, list(shard_dict.keys())))

        # Create the index that maps each weight to its shard
        index = {
            "metadata": {
                "total_size": sum(t.numel() * t.element_size() for t in state_dict.values())
            },
            "weight_map": {}
        }

        for shard_file, keys in shards:
            for key in keys:
                index["weight_map"][key] = shard_file

        index_path = temp_path / "model.safetensors.index.json"
        with open(index_path, "w") as f:
            json.dump(index, f, indent=2)
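
        # For reference, the index written above has this shape (illustrative only):
        #   {"metadata": {"total_size": <bytes>},
        #    "weight_map": {"<tensor name>": "<shard file>", ...}}
        # which matches the layout transformers expects for sharded safetensors checkpoints.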

        print(f"\n{'='*60}")
        print("Uploading to Hugging Face...")
        print(f"{'='*60}")

        # Upload each shard file
        for shard_file, _ in shards:
            file_path = temp_path / shard_file
            print(f"\n📤 Uploading {shard_file}...")
            api.upload_file(
                path_or_fileobj=str(file_path),
                path_in_repo=shard_file,
                repo_id=repo_name,
                repo_type="model",
                commit_message=f"Add {shard_file}"
            )
            print("  ✓ Uploaded!")

        # Upload the index
        print("\n📤 Uploading model.safetensors.index.json...")
        api.upload_file(
            path_or_fileobj=str(index_path),
            path_in_repo="model.safetensors.index.json",
            repo_id=repo_name,
            repo_type="model",
            commit_message="Add model index"
        )
        print("  ✓ Uploaded!")

    print(f"\n{'='*60}")
    print("✅ SUCCESS! Model uploaded to:")
    print(f"  https://huggingface.co/{repo_name}")
    print(f"{'='*60}")


if __name__ == "__main__":
    print("="*60)
    print("Mineral Nano 1 - Direct HuggingFace Upload")
    print("="*60)

    # Verify configuration
    if "your-username" in REPO_NAME or "your_token" in HF_TOKEN:
        print("\n❌ ERROR: Please configure the script first!")
        print("\nEdit these lines at the top of the script:")
        print('    REPO_NAME = "your-username/mineral-nano-1"')
        print('    HF_TOKEN = "your_token_here"')
        print("\nGet your token from: https://huggingface.co/settings/tokens")
        exit(1)

    print(f"\nTarget repository: {REPO_NAME}")
    print("This will take 10-20 minutes...")

    # Create weights
    print("\n" + "="*60)
    print("STEP 1: Creating model weights")
    print("="*60)
    state_dict = create_mineral_nano_weights()

    # Upload to HF
    print("\n" + "="*60)
    print("STEP 2: Uploading to Hugging Face")
    print("="*60)
    upload_to_huggingface(state_dict, REPO_NAME, HF_TOKEN)

    print("\n✅ All done! Your model is live on Hugging Face!")