Luke-Bergen committed on
Commit
6c759b5
·
verified ·
1 Parent(s): 3ed0100

Create create_safetensors.py

Browse files
Files changed (1) hide show
  1. create_safetensors.py +268 -0
create_safetensors.py ADDED
@@ -0,0 +1,268 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Create SafeTensors files for Mineral Nano 1 Vision (Mid-Range 4.1B)
4
+ This creates randomly initialized weights ready for upload to Hugging Face
5
+ """
6
+
7
+ import torch
8
+ import json
9
+ from safetensors.torch import save_file
10
+ from pathlib import Path
11
+ import math
12
+
13
def initialize_weights(shape, init_type="normal", std=0.02):
    """Return a newly allocated tensor of ``shape`` using the requested scheme.

    Args:
        shape: Shape (tuple of ints) forwarded to the torch factory function.
        init_type: ``"normal"`` draws from N(0, std**2); ``"uniform"`` draws
            from U(-a, a) with ``a = sqrt(3) * std`` so the samples also have
            standard deviation ``std``; any other value yields all zeros.
        std: Target standard deviation for the random schemes.

    Returns:
        A ``torch.Tensor`` of the given shape in torch's default dtype.
    """
    if init_type == "normal":
        return torch.randn(shape) * std
    if init_type == "uniform":
        # Var(U(-a, a)) = a**2 / 3, hence a = sqrt(3) * std. The previous
        # sqrt(3 * std) produced a bound far larger than intended.
        limit = math.sqrt(3.0) * std
        return torch.rand(shape) * 2 * limit - limit
    return torch.zeros(shape)
22
+
23
def create_mineral_nano_weights():
    """Build the full randomly initialized state dict for Mineral Nano 1 Vision.

    The returned dict maps parameter names to tensors for three groups, in
    this insertion order: the language model (token embeddings, decoder
    layers, final norm, LM head), the vision encoder (patch/class/position
    embeddings, encoder layers, post layer norm) and the two-layer
    multi-modal projector. Linear/embedding weights come from
    ``initialize_weights(..., "normal", 0.02)``, norm scales are ones and all
    biases are zeros.

    NOTE(review): the dimensions below appear to add up to roughly 2.7B
    parameters rather than the 4.1B quoted in the progress messages - confirm.

    Returns:
        dict[str, torch.Tensor]: parameter name -> tensor.
    """
    print("Creating Mineral Nano 1 Vision SafeTensors (4.1B parameters)...")
    print("This will take a few minutes and use ~16GB RAM temporarily...")

    # ---- Language model dimensions ----
    vocab_size = 64000
    hidden_size = 2048
    intermediate_size = 8192
    num_layers = 28
    num_heads = 32
    num_kv_heads = 8
    # K/V projections are narrower than Q: num_kv_heads * head_dim rows.
    kv_size = num_kv_heads * (hidden_size // num_heads)

    # ---- Vision encoder dimensions (24 attention heads, unused here) ----
    vision_hidden = 1536
    vision_intermediate = 6144
    vision_layers = 24
    vision_patches = 1024  # 448/14 = 32, 32*32 = 1024

    weights = {}

    def add_normal(name, shape):
        # Every random tensor in this model uses the same N(0, 0.02^2) init.
        weights[name] = initialize_weights(shape, "normal", 0.02)

    # ============ LANGUAGE MODEL WEIGHTS ============

    print("\n[1/4] Creating language model embeddings...")
    add_normal("model.embed_tokens.weight", (vocab_size, hidden_size))

    print("[2/4] Creating language model layers (28 layers)...")
    # (sub-module, weight shape) in the exact order they are emitted per layer.
    decoder_linears = (
        ("self_attn.q_proj", (hidden_size, hidden_size)),
        ("self_attn.k_proj", (kv_size, hidden_size)),
        ("self_attn.v_proj", (kv_size, hidden_size)),
        ("self_attn.o_proj", (hidden_size, hidden_size)),
        ("mlp.gate_proj", (intermediate_size, hidden_size)),
        ("mlp.up_proj", (intermediate_size, hidden_size)),
        ("mlp.down_proj", (hidden_size, intermediate_size)),
    )
    for idx in range(num_layers):
        base = f"model.layers.{idx}"
        for module, shape in decoder_linears:
            add_normal(f"{base}.{module}.weight", shape)
        for norm in ("input_layernorm", "post_attention_layernorm"):
            weights[f"{base}.{norm}.weight"] = torch.ones(hidden_size)

        finished = idx + 1
        if finished % 5 == 0:
            print(f" ✓ Completed {finished}/{num_layers} layers")

    # Final layer norm and LM head
    weights["model.norm.weight"] = torch.ones(hidden_size)
    add_normal("lm_head.weight", (vocab_size, hidden_size))

    print("[3/4] Creating vision encoder (24 layers)...")

    # ============ VISION ENCODER WEIGHTS ============

    embed_base = "vision_model.embeddings"
    add_normal(f"{embed_base}.patch_embedding.weight", (vision_hidden, 3, 14, 14))
    add_normal(f"{embed_base}.class_embedding", (vision_hidden,))
    # One extra position for the class token.
    add_normal(
        f"{embed_base}.position_embedding.weight",
        (vision_patches + 1, vision_hidden),
    )

    # (sub-module, weight shape); each is followed by a zero bias whose length
    # equals the weight's output dimension (shape[0]).
    encoder_linears = (
        ("self_attn.q_proj", (vision_hidden, vision_hidden)),
        ("self_attn.k_proj", (vision_hidden, vision_hidden)),
        ("self_attn.v_proj", (vision_hidden, vision_hidden)),
        ("self_attn.out_proj", (vision_hidden, vision_hidden)),
        ("mlp.fc1", (vision_intermediate, vision_hidden)),
        ("mlp.fc2", (vision_hidden, vision_intermediate)),
    )
    for idx in range(vision_layers):
        base = f"vision_model.encoder.layers.{idx}"
        for module, shape in encoder_linears:
            add_normal(f"{base}.{module}.weight", shape)
            weights[f"{base}.{module}.bias"] = torch.zeros(shape[0])
        for norm in ("layer_norm1", "layer_norm2"):
            weights[f"{base}.{norm}.weight"] = torch.ones(vision_hidden)
            weights[f"{base}.{norm}.bias"] = torch.zeros(vision_hidden)

        finished = idx + 1
        if finished % 5 == 0:
            print(f" ✓ Completed {finished}/{vision_layers} vision layers")

    # Vision post-layernorm
    weights["vision_model.post_layernorm.weight"] = torch.ones(vision_hidden)
    weights["vision_model.post_layernorm.bias"] = torch.zeros(vision_hidden)

    print("[4/4] Creating vision-language projection...")

    # ============ MULTI-MODAL PROJECTOR ============

    # Maps vision features (vision_hidden) into the language width (hidden_size).
    projector = (
        ("linear_1", (hidden_size, vision_hidden)),
        ("linear_2", (hidden_size, hidden_size)),
    )
    for module, shape in projector:
        add_normal(f"multi_modal_projector.{module}.weight", shape)
        weights[f"multi_modal_projector.{module}.bias"] = torch.zeros(hidden_size)

    return weights
176
+
177
def save_sharded_safetensors(
    state_dict,
    output_dir="./mineral-nano-weights",
    max_shard_size=4.5 * 1024 * 1024 * 1024,
):
    """Write ``state_dict`` as sharded SafeTensors files plus an index JSON.

    Tensors are packed greedily, in dict order, into shards whose byte size
    does not exceed ``max_shard_size`` (a single tensor larger than the limit
    gets a shard of its own). Shards are planned first and written second so
    every file name carries the real shard count
    (``model-00001-of-0000N.safetensors``) instead of a hard-coded total.

    Args:
        state_dict: Mapping of parameter name -> ``torch.Tensor``.
        output_dir: Destination directory; created (with parents) if missing.
        max_shard_size: Upper bound in bytes for one shard (default 4.5 GiB).

    Side effects:
        Writes the shard files and ``model.safetensors.index.json`` into
        ``output_dir`` and prints progress and a summary to stdout.
    """
    output_path = Path(output_dir)
    # parents=True so nested destinations such as "out/run1/weights" work.
    output_path.mkdir(parents=True, exist_ok=True)

    print(f"\nSaving SafeTensors files to {output_dir}/")

    # ---- Pass 1: decide which keys go into which shard ----
    planned = []  # list of (keys, size_in_bytes)
    current_keys = []
    current_size = 0
    total_params = 0
    total_size = 0

    for key, tensor in state_dict.items():
        tensor_size = tensor.numel() * tensor.element_size()
        total_params += tensor.numel()
        total_size += tensor_size

        # Close the current shard only if it already holds something;
        # otherwise an oversized tensor would produce an empty shard.
        if current_keys and current_size + tensor_size > max_shard_size:
            planned.append((current_keys, current_size))
            current_keys = []
            current_size = 0

        current_keys.append(key)
        current_size += tensor_size

    if current_keys:
        planned.append((current_keys, current_size))

    # ---- Pass 2: write shards now that the total count is known ----
    num_shards = len(planned)
    weight_map = {}
    shard_files = []

    for shard_idx, (keys, shard_size) in enumerate(planned, start=1):
        shard_file = f"model-{shard_idx:05d}-of-{num_shards:05d}.safetensors"
        print(f" Saving {shard_file} ({shard_size / 1e9:.2f} GB)...")
        # The "format" metadata entry is what Hugging Face loaders look for
        # to identify the file as a PyTorch checkpoint.
        save_file(
            {key: state_dict[key] for key in keys},
            output_path / shard_file,
            metadata={"format": "pt"},
        )
        shard_files.append(shard_file)
        for key in keys:
            weight_map[key] = shard_file

    # ---- Index file mapping every parameter to its shard ----
    index = {
        "metadata": {"total_size": total_size},
        "weight_map": weight_map,
    }
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(index, f, indent=2)

    print(f" ✓ Saved index file: model.safetensors.index.json")

    print(f"\n{'='*60}")
    print(f"✓ SUCCESS! Model weights created:")
    print(f" Total Parameters: {total_params:,} ({total_params/1e9:.2f}B)")
    print(f" Total Size: {total_size/1e9:.2f} GB")
    print(f" Number of Shards: {num_shards}")
    print(f" Output Directory: {output_path.absolute()}")
    print(f"{'='*60}")
    print(f"\nNext steps:")
    print(f"1. Upload these files to Hugging Face:")
    # List the files that were actually written rather than assuming two.
    for shard_file in shard_files:
        print(f" - {shard_file}")
    print(f" - model.safetensors.index.json")
    print(f"2. Your model will be ready to load!")
    print(f"\nNote: These are randomly initialized weights.")
    print(f"For a working model, you need to train or fine-tune.")
255
+
256
if __name__ == "__main__":
    # Script entry point: print a banner, build the weights, write them out.
    banner = "=" * 60
    print(banner)
    print("Mineral Nano 1 Vision - SafeTensors Generator")
    print("Mid-Range 4.1B Parameter Model")
    print(banner)

    # Build every tensor in memory, then shard and save to the default
    # output directory.
    weights = create_mineral_nano_weights()
    save_sharded_safetensors(weights)

    print("\n✓ All done! Your SafeTensors files are ready.")