patrickvonplaten committed on
Commit
0f52c65
·
verified ·
1 Parent(s): 5e07710

Delete params.json

Browse files
Files changed (1) hide show
  1. params.json +0 -112
params.json DELETED
@@ -1,112 +0,0 @@
1
- {
2
- "dim": 12288,
3
- "n_layers": 88,
4
- "head_dim": 128,
5
- "hidden_dim": 28672,
6
- "n_heads": 96,
7
- "n_kv_heads": 8,
8
- "use_biases": false,
9
- "causal": true,
10
- "rope_theta": 1000000000.0,
11
- "norm_eps": 1e-05,
12
- "init": "DEFAULT",
13
- "dropout": 0.0,
14
- "vocab_size": 32768,
15
- "model_parallel": 1,
16
- "is_sequence_parallel": false,
17
- "context_parallel": 1,
18
- "tied_embeddings": false,
19
- "model_pipelining": 1,
20
- "virtual_model_pipelining": 1,
21
- "efficient_attn": true,
22
- "fused_rms_norm": true,
23
- "ragged_attention": null,
24
- "checkpoint": true,
25
- "use_cache": false,
26
- "max_concurrent_tokens": 65536,
27
- "rms_norm": "PRE",
28
- "cust_bwd": false,
29
- "recompute_w1_every": 0,
30
- "recompute_w3_every": 0,
31
- "recompute_attn_every": 0,
32
- "freeze_nonembedding": false,
33
- "fsdp2": false,
34
- "zero2": false,
35
- "fsdp_optimize_backward_concat_if_pp": true,
36
- "cutlass": false,
37
- "attn_tanh_gating": null,
38
- "softmax_tanh_gating": null,
39
- "deterministic_flash_attn": false,
40
- "moe": null,
41
- "mamba": null,
42
- "multimodal": {
43
- "vis_encoder_id": "ViT-g-16-1024-rope2d",
44
- "encoder_type": "vit_packed_sequence",
45
- "vision_encoder_args": {
46
- "hidden_size": 1408,
47
- "num_channels": 3,
48
- "max_image_size": 1024,
49
- "patch_size": 16,
50
- "inference_args": {
51
- "centering": "default",
52
- "interpolation": "bicubic",
53
- "eval_image_size": 1024,
54
- "variable_image_size": true,
55
- "spatial_merge_size": 1,
56
- "use_cv2_resize": true
57
- },
58
- "stride": null,
59
- "residual_connection": null,
60
- "pos_embed": "rope-2D",
61
- "rope_theta": 10000.0,
62
- "quick_gelu": true,
63
- "ln_pre": true,
64
- "pool_type": "tok",
65
- "final_ln_after_pool": false,
66
- "intermediate_size": 6144,
67
- "projection_dim": 1024,
68
- "num_hidden_layers": 40,
69
- "num_attention_heads": 16,
70
- "qk_norm": false,
71
- "checkpoint": true
72
- },
73
- "vis_encoder_pretraining": null,
74
- "pretrained_model_path": null,
75
- "bos_token_id": 1,
76
- "image_token_id": 10,
77
- "image_newline_id": -1,
78
- "image_end_id": -1,
79
- "load_pretrained_encoder": false,
80
- "mm_projector_id": "mlp",
81
- "use_mlp_bias": false,
82
- "add_layer_norm": false,
83
- "self_attention_args": {
84
- "freeze_vis_encoder": false,
85
- "freeze_lang_decoder": false
86
- },
87
- "cross_attention_args": null,
88
- "use_cross_attention": false,
89
- "spatial_merge_size": 1
90
- },
91
- "quantization": {
92
- "qformat_weight": "fp8_e4m3",
93
- "qformat_emb": null,
94
- "qformat_classif": null,
95
- "qformat_norm": null,
96
- "qformat_act": "fp8_e4m3",
97
- "qformat_kv": "fp8_e4m3",
98
- "qformat_image_tokens": "fp8_e4m3",
99
- "qscheme_weight": "TENSOR",
100
- "qscheme_emb": "TENSOR",
101
- "qscheme_classif": "TENSOR",
102
- "qscheme_act": "TENSOR",
103
- "calibration_steps": 200,
104
- "block_size": 128,
105
- "tie_qkv": true,
106
- "tie_w123_moe": false,
107
- "save_act_logs": false
108
- },
109
- "layer_drop": null,
110
- "override_parameters_str": "",
111
- "max_seq_len": 32768
112
- }