JacobLinCool committed
Commit a4ab071 · verified · 1 parent: 7bec83f

Training in progress, step 12
config.json ADDED
@@ -0,0 +1,227 @@
+{
+  "architectures": [
+    "Gemma3nForConditionalGeneration"
+  ],
+  "audio_config": {
+    "conf_attention_chunk_size": 12,
+    "conf_attention_context_left": 13,
+    "conf_attention_context_right": 0,
+    "conf_attention_logit_cap": 50.0,
+    "conf_conv_kernel_size": 5,
+    "conf_num_attention_heads": 8,
+    "conf_num_hidden_layers": 12,
+    "conf_positional_bias_size": 256,
+    "conf_reduction_factor": 4,
+    "conf_residual_weight": 0.5,
+    "gradient_clipping": 10000000000.0,
+    "hidden_size": 1536,
+    "input_feat_size": 128,
+    "model_type": "gemma3n_audio",
+    "rms_norm_eps": 1e-06,
+    "sscp_conv_channel_size": [
+      128,
+      32
+    ],
+    "sscp_conv_eps": 0.001,
+    "sscp_conv_group_norm_eps": 0.001,
+    "sscp_conv_kernel_size": [
+      [
+        3,
+        3
+      ],
+      [
+        3,
+        3
+      ]
+    ],
+    "sscp_conv_stride_size": [
+      [
+        2,
+        2
+      ],
+      [
+        2,
+        2
+      ]
+    ],
+    "torch_dtype": "float32",
+    "vocab_offset": 262272,
+    "vocab_size": 128
+  },
+  "audio_soft_tokens_per_image": 188,
+  "audio_token_id": 262273,
+  "boa_token_id": 256000,
+  "boi_token_id": 255999,
+  "eoa_token_id": 262272,
+  "eoi_token_id": 262144,
+  "eos_token_id": [
+    1,
+    106
+  ],
+  "image_token_id": 262145,
+  "initializer_range": 0.02,
+  "model_type": "gemma3n",
+  "text_config": {
+    "activation_sparsity_pattern": [
+      0.95,
+      0.95,
+      0.95,
+      0.95,
+      0.95,
+      0.95,
+      0.95,
+      0.95,
+      0.95,
+      0.95,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0
+    ],
+    "altup_active_idx": 0,
+    "altup_coef_clip": 120.0,
+    "altup_correct_scale": true,
+    "altup_lr_multiplier": 1.0,
+    "altup_num_inputs": 4,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "final_logit_softcapping": 30.0,
+    "head_dim": 256,
+    "hidden_activation": "gelu_pytorch_tanh",
+    "hidden_size": 2048,
+    "hidden_size_per_layer_input": 256,
+    "initializer_range": 0.02,
+    "intermediate_size": [
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384,
+      16384
+    ],
+    "laurel_rank": 64,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 32768,
+    "model_type": "gemma3n_text",
+    "num_attention_heads": 8,
+    "num_hidden_layers": 35,
+    "num_key_value_heads": 2,
+    "num_kv_shared_layers": 15,
+    "query_pre_attn_scalar": 256,
+    "rms_norm_eps": 1e-06,
+    "rope_local_base_freq": 10000.0,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sliding_window": 512,
+    "torch_dtype": "float32",
+    "use_cache": true,
+    "vocab_size": 262400,
+    "vocab_size_per_layer_input": 262144
+  },
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.53.0",
+  "vision_config": {
+    "architecture": "mobilenetv5_300m_enc",
+    "do_pooling": true,
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "label_names": [
+      "LABEL_0",
+      "LABEL_1"
+    ],
+    "model_args": null,
+    "model_type": "gemma3n_vision",
+    "num_classes": 2,
+    "rms_norm_eps": 1e-06,
+    "torch_dtype": "float32",
+    "vocab_offset": 262144,
+    "vocab_size": 128
+  },
+  "vision_soft_tokens_per_image": 256
+}
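
For reference, the added config.json can be inspected locally with transformers. A minimal sketch, assuming the checkpoint directory path is a placeholder and a transformers version that includes Gemma3n support (the config records 4.53.0):

    from transformers import AutoConfig

    # "path/to/this/checkpoint" is a placeholder for wherever this commit is checked out.
    config = AutoConfig.from_pretrained("path/to/this/checkpoint")

    print(config.model_type)                     # "gemma3n"
    print(config.text_config.num_hidden_layers)  # 35
    print(config.text_config.hidden_size)        # 2048
    print(config.vision_config.architecture)     # "mobilenetv5_300m_enc"
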
logs/events.out.tfevents.1751043711.8fa2e96bced5.388750.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9cd59204b1e2bc285498766608e6b123b4f45aa2e190e8e95f9b8f3c0b1a66fa
+size 12407
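
This file, like the safetensors shards and training_args.bin below, is stored as a Git LFS pointer: the repository keeps only the spec version, the sha256 object id, and the byte size, while the actual payload lives in LFS storage. A small sketch of checking a downloaded file against its pointer (the local path is illustrative):

    import hashlib

    def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
        """Stream a file and return its sha256 hex digest."""
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                h.update(chunk)
        return h.hexdigest()

    # Expected oid copied from the pointer file above.
    expected = "9cd59204b1e2bc285498766608e6b123b4f45aa2e190e8e95f9b8f3c0b1a66fa"
    actual = sha256_of("logs/events.out.tfevents.1751043711.8fa2e96bced5.388750.0")
    assert actual == expected, "downloaded file does not match the LFS pointer"
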
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:922775bd06af0d6346378edf1b3b7b62e6103e707d15657d193ae3c5ce8952bc
+size 4967933712
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5814a18af2209f961e31413218d1548e1cc506798bf7fd25d042a97b249072b3
+size 4569247128
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e662b9751231e4af5933fd1ed292f23f765f6668540a57901ff45ba6da728823
+size 4995580968
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4269ea034a1fbec961faa501643da001e6e3086caac8f732acf9cf3266033d6f
+size 1167419608
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
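
model.safetensors.index.json maps each parameter name to the shard that contains it, following the standard sharded-safetensors index layout (a "metadata" block with "total_size" plus a "weight_map" dict). A short sketch that summarizes how the tensors are spread across the four shards added above; the local path is illustrative:

    import json
    from collections import Counter

    with open("model.safetensors.index.json") as f:
        index = json.load(f)

    print(index["metadata"]["total_size"])  # total checkpoint size in bytes

    # Count how many tensors live in each shard file.
    shard_counts = Counter(index["weight_map"].values())
    for shard, n_tensors in sorted(shard_counts.items()):
        print(f"{shard}: {n_tensors} tensors")
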
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3dec283e75b72e538f1cfa1ae8ef0ce7da8ec54fbbfc71eb9139d3275703da1c
+size 5969
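
training_args.bin is the pickled training-arguments object that the transformers Trainer saves alongside a checkpoint. A minimal sketch of inspecting it, assuming the file comes from a trusted source (it is a Python pickle) and a recent torch where weights_only must be disabled explicitly:

    import torch

    # Only load pickles you trust; weights_only=False allows arbitrary Python objects.
    args = torch.load("training_args.bin", weights_only=False)

    print(type(args).__name__)  # typically TrainingArguments (or a subclass)
    print(args.output_dir)
    print(args.learning_rate)
    print(args.per_device_train_batch_size)
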