danelcsb committed on
Commit
59fbebe
·
verified ·
1 Parent(s): f7030e8

Upload model

Browse files
Files changed (2) hide show
  1. config.json +41 -32
  2. model.safetensors +2 -2
config.json CHANGED
@@ -2,10 +2,8 @@
2
  "architectures": [
3
  "Sam2VideoModel"
4
  ],
5
- "binarize_mask_from_pts_for_mem_enc": true,
6
  "enable_occlusion_spatial_embedding": true,
7
  "enable_temporal_pos_encoding_for_object_pointers": true,
8
- "fill_hole_area": 8,
9
  "image_size": 1024,
10
  "initializer_range": 0.02,
11
  "mask_decoder_config": {
@@ -13,7 +11,6 @@
13
  "dynamic_multimask_stability_delta": 0.05,
14
  "dynamic_multimask_stability_thresh": 0.98,
15
  "dynamic_multimask_via_stability": true,
16
- "feed_forward_hidden_act": "relu",
17
  "hidden_act": "gelu",
18
  "hidden_size": 256,
19
  "iou_head_depth": 3,
@@ -22,8 +19,7 @@
22
  "model_type": "",
23
  "num_attention_heads": 8,
24
  "num_hidden_layers": 2,
25
- "num_multimask_outputs": 3,
26
- "two_way_transformer_activation": "relu"
27
  },
28
  "mask_downsampler_embed_dim": 256,
29
  "mask_downsampler_hidden_act": "gelu",
@@ -32,9 +28,6 @@
32
  "mask_downsampler_stride": 2,
33
  "mask_downsampler_total_stride": 16,
34
  "max_object_pointers_in_encoder": 16,
35
- "memory_attention_apply_pe_at_cross_attn_keys": true,
36
- "memory_attention_apply_pe_at_cross_attn_queries": false,
37
- "memory_attention_apply_pe_at_self_attn": false,
38
  "memory_attention_downsample_rate": 1,
39
  "memory_attention_dropout": 0.1,
40
  "memory_attention_feed_forward_hidden_act": "relu",
@@ -52,21 +45,17 @@
52
  "memory_encoder_output_channels": 64,
53
  "memory_fuser_embed_dim": 256,
54
  "memory_fuser_hidden_act": "gelu",
 
55
  "memory_fuser_kernel_size": 7,
56
  "memory_fuser_layer_scale_init_value": 1e-06,
57
  "memory_fuser_num_layers": 2,
58
  "memory_fuser_padding": 3,
59
- "memory_fuser_use_depthwise_conv": true,
60
- "model_type": "sam2",
61
  "multimask_max_pt_num": 1,
62
  "multimask_min_pt_num": 0,
63
  "multimask_output_for_tracking": true,
64
  "multimask_output_in_sam": true,
65
- "non_overlap_masks": false,
66
- "non_overlap_masks_for_mem_enc": false,
67
  "num_maskmem": 7,
68
- "preserve_temporal_direction_in_object_pointers": true,
69
- "project_temporal_pos_encoding_in_object_pointers": true,
70
  "prompt_encoder_config": {
71
  "hidden_act": "gelu",
72
  "hidden_size": 256,
@@ -81,7 +70,7 @@
81
  "sigmoid_bias_for_mem_enc": -10.0,
82
  "sigmoid_scale_for_mem_enc": 20.0,
83
  "torch_dtype": "float32",
84
- "transformers_version": "4.54.0.dev0",
85
  "vision_config": {
86
  "backbone_channel_list": [
87
  1152,
@@ -95,15 +84,25 @@
95
  "architectures": null,
96
  "bad_words_ids": null,
97
  "begin_suppress_tokens": null,
 
 
 
 
 
 
98
  "bos_token_id": null,
99
  "chunk_size_feed_forward": 0,
100
  "cross_attention_hidden_size": null,
101
  "decoder_start_token_id": null,
102
- "dim_mul": 2.0,
103
  "diversity_penalty": 0.0,
104
  "do_sample": false,
105
- "drop_path_rate": 0.0,
106
  "early_stopping": false,
 
 
 
 
 
 
107
  "encoder_no_repeat_ngram_size": 0,
108
  "eos_token_id": null,
109
  "exponential_decay_length_penalty": null,
@@ -115,14 +114,16 @@
115
  33,
116
  43
117
  ],
118
- "head_mul": 2.0,
119
  "hidden_act": "gelu",
120
  "hidden_size": 144,
121
  "id2label": {
122
  "0": "LABEL_0",
123
  "1": "LABEL_1"
124
  },
125
- "image_size": 1024,
 
 
 
126
  "initializer_range": 0.02,
127
  "is_decoder": false,
128
  "is_encoder_decoder": false,
@@ -134,9 +135,16 @@
134
  "length_penalty": 1.0,
135
  "max_length": 20,
136
  "min_length": 0,
 
137
  "model_type": "sam2_hiera_det_model",
138
  "no_repeat_ngram_size": 0,
139
- "num_attention_heads": 2,
 
 
 
 
 
 
140
  "num_beam_groups": 1,
141
  "num_beams": 1,
142
  "num_channels": 3,
@@ -146,9 +154,18 @@
146
  "output_hidden_states": false,
147
  "output_scores": false,
148
  "pad_token_id": null,
149
- "patch_kernel_size": 7,
150
- "patch_padding": 3,
151
- "patch_stride": 4,
 
 
 
 
 
 
 
 
 
152
  "prefix": null,
153
  "problem_type": null,
154
  "pruned_heads": {},
@@ -161,12 +178,6 @@
161
  "return_dict": true,
162
  "return_dict_in_generate": false,
163
  "sep_token_id": null,
164
- "stages": [
165
- 2,
166
- 6,
167
- 36,
168
- 4
169
- ],
170
  "suppress_tokens": null,
171
  "task_specific_params": null,
172
  "temperature": 1.0,
@@ -184,7 +195,7 @@
184
  7,
185
  7
186
  ],
187
- "window_spec": [
188
  8,
189
  4,
190
  16,
@@ -206,7 +217,6 @@
206
  ]
207
  ],
208
  "fpn_hidden_size": 256,
209
- "fpn_interpolation_mode": "nearest",
210
  "fpn_kernel_size": 1,
211
  "fpn_padding": 0,
212
  "fpn_stride": 1,
@@ -214,7 +224,6 @@
214
  2,
215
  3
216
  ],
217
- "fuse_type": "sum",
218
  "hidden_act": "gelu",
219
  "initializer_range": 0.02,
220
  "layer_norm_eps": 1e-06,
 
2
  "architectures": [
3
  "Sam2VideoModel"
4
  ],
 
5
  "enable_occlusion_spatial_embedding": true,
6
  "enable_temporal_pos_encoding_for_object_pointers": true,
 
7
  "image_size": 1024,
8
  "initializer_range": 0.02,
9
  "mask_decoder_config": {
 
11
  "dynamic_multimask_stability_delta": 0.05,
12
  "dynamic_multimask_stability_thresh": 0.98,
13
  "dynamic_multimask_via_stability": true,
 
14
  "hidden_act": "gelu",
15
  "hidden_size": 256,
16
  "iou_head_depth": 3,
 
19
  "model_type": "",
20
  "num_attention_heads": 8,
21
  "num_hidden_layers": 2,
22
+ "num_multimask_outputs": 3
 
23
  },
24
  "mask_downsampler_embed_dim": 256,
25
  "mask_downsampler_hidden_act": "gelu",
 
28
  "mask_downsampler_stride": 2,
29
  "mask_downsampler_total_stride": 16,
30
  "max_object_pointers_in_encoder": 16,
 
 
 
31
  "memory_attention_downsample_rate": 1,
32
  "memory_attention_dropout": 0.1,
33
  "memory_attention_feed_forward_hidden_act": "relu",
 
45
  "memory_encoder_output_channels": 64,
46
  "memory_fuser_embed_dim": 256,
47
  "memory_fuser_hidden_act": "gelu",
48
+ "memory_fuser_intermediate_dim": 1024,
49
  "memory_fuser_kernel_size": 7,
50
  "memory_fuser_layer_scale_init_value": 1e-06,
51
  "memory_fuser_num_layers": 2,
52
  "memory_fuser_padding": 3,
53
+ "model_type": "sam2_video",
 
54
  "multimask_max_pt_num": 1,
55
  "multimask_min_pt_num": 0,
56
  "multimask_output_for_tracking": true,
57
  "multimask_output_in_sam": true,
 
 
58
  "num_maskmem": 7,
 
 
59
  "prompt_encoder_config": {
60
  "hidden_act": "gelu",
61
  "hidden_size": 256,
 
70
  "sigmoid_bias_for_mem_enc": -10.0,
71
  "sigmoid_scale_for_mem_enc": 20.0,
72
  "torch_dtype": "float32",
73
+ "transformers_version": "4.56.0.dev0",
74
  "vision_config": {
75
  "backbone_channel_list": [
76
  1152,
 
84
  "architectures": null,
85
  "bad_words_ids": null,
86
  "begin_suppress_tokens": null,
87
+ "blocks_per_stage": [
88
+ 2,
89
+ 6,
90
+ 36,
91
+ 4
92
+ ],
93
  "bos_token_id": null,
94
  "chunk_size_feed_forward": 0,
95
  "cross_attention_hidden_size": null,
96
  "decoder_start_token_id": null,
 
97
  "diversity_penalty": 0.0,
98
  "do_sample": false,
 
99
  "early_stopping": false,
100
+ "embed_dim_per_stage": [
101
+ 144,
102
+ 288,
103
+ 576,
104
+ 1152
105
+ ],
106
  "encoder_no_repeat_ngram_size": 0,
107
  "eos_token_id": null,
108
  "exponential_decay_length_penalty": null,
 
114
  33,
115
  43
116
  ],
 
117
  "hidden_act": "gelu",
118
  "hidden_size": 144,
119
  "id2label": {
120
  "0": "LABEL_0",
121
  "1": "LABEL_1"
122
  },
123
+ "image_size": [
124
+ 1024,
125
+ 1024
126
+ ],
127
  "initializer_range": 0.02,
128
  "is_decoder": false,
129
  "is_encoder_decoder": false,
 
135
  "length_penalty": 1.0,
136
  "max_length": 20,
137
  "min_length": 0,
138
+ "mlp_ratio": 4.0,
139
  "model_type": "sam2_hiera_det_model",
140
  "no_repeat_ngram_size": 0,
141
+ "num_attention_heads": 1,
142
+ "num_attention_heads_per_stage": [
143
+ 2,
144
+ 4,
145
+ 8,
146
+ 16
147
+ ],
148
  "num_beam_groups": 1,
149
  "num_beams": 1,
150
  "num_channels": 3,
 
154
  "output_hidden_states": false,
155
  "output_scores": false,
156
  "pad_token_id": null,
157
+ "patch_kernel_size": [
158
+ 7,
159
+ 7
160
+ ],
161
+ "patch_padding": [
162
+ 3,
163
+ 3
164
+ ],
165
+ "patch_stride": [
166
+ 4,
167
+ 4
168
+ ],
169
  "prefix": null,
170
  "problem_type": null,
171
  "pruned_heads": {},
 
178
  "return_dict": true,
179
  "return_dict_in_generate": false,
180
  "sep_token_id": null,
 
 
 
 
 
 
181
  "suppress_tokens": null,
182
  "task_specific_params": null,
183
  "temperature": 1.0,
 
195
  7,
196
  7
197
  ],
198
+ "window_size_per_stage": [
199
  8,
200
  4,
201
  16,
 
217
  ]
218
  ],
219
  "fpn_hidden_size": 256,
 
220
  "fpn_kernel_size": 1,
221
  "fpn_padding": 0,
222
  "fpn_stride": 1,
 
224
  2,
225
  3
226
  ],
 
227
  "hidden_act": "gelu",
228
  "initializer_range": 0.02,
229
  "layer_norm_eps": 1e-06,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d281251cae94754e9c7a2313d45b5f2420cb97024e379f7da3fbcb217a8aabe6
3
- size 897897680
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc407dce21301fd94abb395c5099b4f2c455fdc8a8f261ac3d0ea6d4cd197230
3
+ size 897897416