morgendave commited on
Commit
953435e
·
verified ·
1 Parent(s): 337b343

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Llama4ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_chunk_size": null,
7
+ "attention_dropout": 0.0,
8
+ "attn_scale": 0.1,
9
+ "attn_temperature_tuning": true,
10
+ "attn_temperature_tuning_floor": true,
11
+ "bos_token_id": 200000,
12
+ "cache_implementation": "static",
13
+ "draft_vocab_size": 202048,
14
+ "eos_token_id": 200001,
15
+ "floor_scale": 8192,
16
+ "for_llm_compressor": false,
17
+ "head_dim": 128,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 5120,
20
+ "initializer_range": 0.02,
21
+ "interleave_moe_layer_step": 0,
22
+ "intermediate_size": 8192,
23
+ "intermediate_size_mlp": 16384,
24
+ "max_position_embeddings": 262144,
25
+ "model_type": "llama4_text",
26
+ "moe_layers": [],
27
+ "no_rope_layers": [
28
+ 1,
29
+ 1,
30
+ 1
31
+ ],
32
+ "num_attention_heads": 40,
33
+ "num_experts_per_tok": 1,
34
+ "num_hidden_layers": 3,
35
+ "num_key_value_heads": 8,
36
+ "num_local_experts": 0,
37
+ "output_router_logits": false,
38
+ "pad_token_id": 200018,
39
+ "rms_norm_eps": 1e-05,
40
+ "rope_scaling": {
41
+ "factor": 16,
42
+ "high_freq_factor": 1,
43
+ "low_freq_factor": 1.0,
44
+ "original_max_position_embeddings": 8192,
45
+ "rope_type": "llama3"
46
+ },
47
+ "rope_theta": 500000.0,
48
+ "router_aux_loss_coef": 0.001,
49
+ "router_jitter_noise": 0.0,
50
+ "sliding_window": null,
51
+ "tie_word_embeddings": false,
52
+ "torch_dtype": "bfloat16",
53
+ "transformers_version": "4.52.0.dev0",
54
+ "use_cache": true,
55
+ "use_qk_norm": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 202048,
58
+ "yoco_global_kv_layer": null,
59
+ "yoco_local_kv_layer": null
60
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 200000,
4
+ "cache_implementation": "static",
5
+ "eos_token_id": 200001,
6
+ "pad_token_id": 200018,
7
+ "transformers_version": "4.52.0.dev0"
8
+ }
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32aa9e8e518b9c9dcc6e8fd71253d73781697e63176f9d9c2798ff7420e2c945
3
+ size 4061341136
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db44ae776dad0ffe96d1c71f2dad1c88f643deb1d07efc90d38d54e04668f57b
3
+ size 2068971648
model.safetensors.index.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 6130309120
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00002-of-00002.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.fc.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.1.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.1.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.2.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.2.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.2.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.norm.weight": "model-00001-of-00002.safetensors"
37
+ }
38
+ }
params.json.raw ADDED
@@ -0,0 +1,668 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "aggregate_gradients_by_tokens": true,
3
+ "alternate_pp_config": true,
4
+ "async_batch_iterator": false,
5
+ "async_batch_iterator_timeout_s": 600,
6
+ "async_checkpointing": true,
7
+ "async_eval_ngpus": -1,
8
+ "attach_debugpy": false,
9
+ "background_nccl_init": false,
10
+ "batch_p2p_communication": false,
11
+ "batch_size": 2,
12
+ "cached_file_unique_prefix": "",
13
+ "checkpoint": {
14
+ "async_checkpointing_staging_method": "async_copy_async_serialize",
15
+ "barrier_timeout_secs": 120,
16
+ "checkpoint_barrier_type": "sc",
17
+ "checkpoint_gc_use_rmdir": true,
18
+ "checkpoint_groups": false,
19
+ "checkpoint_server_max_attempts": 10,
20
+ "checkpoint_server_num_chunks": 10,
21
+ "checkpoint_server_num_threads": 20,
22
+ "checkpoint_server_op_timeout_secs": 10.0,
23
+ "checkpoint_server_threads": 10,
24
+ "checkpoint_server_timeout_secs": 60.0,
25
+ "dump_freq_ephemeral": -1,
26
+ "eager_init_staging_buffer": false,
27
+ "live_checkpointing": false,
28
+ "on_demand_checkpointing": false,
29
+ "sleep_interval": 10,
30
+ "staging_block_every_n_tensors": -1,
31
+ "timeout_all_shard_exists": 300,
32
+ "timeout_barrier_init_secs": 300,
33
+ "timeout_execution": 1800,
34
+ "timeout_folder_exists": 300,
35
+ "timeout_process_init_secs": 60,
36
+ "use_checkpoint_barrier_tcpstore_libuv": true,
37
+ "use_checkpoint_barrier_wait_for_all_files": true,
38
+ "use_checkpoint_barrier_wait_for_dir": false,
39
+ "use_checkpointing_process": true,
40
+ "use_shm_manager_for_async_cp": false,
41
+ "wait_for_tensor_timeout_s": 120
42
+ },
43
+ "checkpoint_dump_dir": "/mnt/wsfuse/outputs/TI-draft-17bx16MoE-N_3-grtb7hxqrsf75c",
44
+ "collect_et": false,
45
+ "context_parallel_size": 1,
46
+ "data": "",
47
+ "dataloader": {
48
+ "always_trim_text": true,
49
+ "concurrency_timeout_s": 300,
50
+ "concurrent": false,
51
+ "datamix": "",
52
+ "enable_packing": true,
53
+ "image": {
54
+ "image_height": 336,
55
+ "image_width": 336,
56
+ "max_num_chunks": 16,
57
+ "patch_height": 14,
58
+ "patch_width": 14,
59
+ "ps_ratio": 0.5,
60
+ "resize_to_max_canvas": false,
61
+ "suppress_dataloader_errors": false,
62
+ "use_dynamic_transform": true,
63
+ "use_pixel_shuffle": true,
64
+ "use_tile_separator_tags": true
65
+ },
66
+ "load_only_tp_zero": false,
67
+ "logging_config": {
68
+ "log_buffer_size": false,
69
+ "log_every_n_steps": 10,
70
+ "log_first_batch": false,
71
+ "log_full_dataloader_state": false,
72
+ "log_metadata": false
73
+ },
74
+ "max_world_size": null,
75
+ "mix_mode": {},
76
+ "modality_datamix": {
77
+ "image": "/mnt/wsfuse/nikhilmeht/0321/sft/lichengyu/datamixes/llama4/multi_image/round3_sft_mim1_cleaned.json:0.0582033824,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_stem_math_50p_pruned_cleaned.json:0.1513287942,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_chart_50p_pruned_mitigated_blurred_cleaned.json:0.0753349440,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_natural_image_50p_pruned_mitigated_blurred_cleaned.json:0.0753349440,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_stem_non_math_50p_pruned_mitigated_blurred_cleaned.json:0.0527124973,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_diagram_50p_pruned_cleaned.json:0.0169119262,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_table_50p_pruned_cleaned.json:0.0672084340,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_doc_50p_pruned_cleaned.json:0.0419503624,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_infographics_50p_pruned_mitigated_blurred_cleaned.json:0.1172853064,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_others_50p_pruned_cleaned.json:0.0597408302,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_multilingual_50p_pruned_mitigated_blurred_cleaned.json:0.0839007248,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_coding_50p_pruned_cleaned.json:0.0002196354,/mnt/wsfuse/nikhilmeht/0321/sft/trangleminh/datamixes/metaai_diverse_vr7500_frr29k_cse3k_pri2700_cleaned.json:0.0224028113,/mnt/wsfuse/nikhilmeht/0321/sft/szha/data_mix/llama4_video_3p_20250211_cleaned.json:0.0296507797,/mnt/wsfuse/nikhilmeht/0321/sft/nextgen_mm/datasets/zhouxy/datamix/refcoco50kx2resizex1_multbox10k_augprompt_vgblurfix10k_o365dedup3f80k_cleaned.json:0.0912836009,/mnt/wsfuse/nikhilmeht/0321/sft/pengchuanzhang/l4_sft_r4_datamix/jsons/grounding/vcr_distill_val_new_cleaned.json:0.0382832843,meta_ai_i18n_syn_gen_image_sft_v2_03_08_allcountry:0.0098835932,gen_ai_mmllm_llama4_video_1p_sft_blurred_fb_ig_v6_no_audio_reencoded_32frames_20250213_mitigated_20250307:0.0665495278,meta_ai_image_sft_dataflywheel_train_hw_sythetic_data_v2_10k:0.0048319789,gen_ai_mmllm_llama4_multi_image_1p_sft_ig_v5_mitigated:0.0665495278",
78
+ "speech": "",
79
+ "text": "/mnt/wsfuse/users/ashish/yonder3_m20/sag/sft:0.0010000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/yonder2_planner_v5:0.0016000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/yonder2_3_simple_qa_cw_search_trigger_filter:0.0000050000,/mnt/wsfuse/users/ashish/yonder3_m20/agentic_search/yonder2_3_multi_turn_from_mase_no_fdd:0.0000500000,/mnt/wsfuse/users/ashish/yonder3_m20/agentic_search/yonder2_3_sag:0.0004000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/mlt_search_triggering:0.0001013947,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/mlt_no_search_triggering:0.0001783212,/mnt/wsfuse/zihangm/files_from_gsheet/general_helpfulness_english/biography_questions:0.0000300000,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_legal_issues:0.0000119361,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_medical_issues:0.0000119361,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_wellbeing_issues_social_media:0.0000119361,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/hard_legal_mcq_w_reasoning_sft:0.0053461996,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/autoif_filtered_prompt_v1_v2_filtered_pass_75_format_filter_valid_link_fix_code:0.0036983832,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/surge_precise_if_critic_rewrite_iter1_perfect_frr_tone_filter_format_clean_chunk:0.0446290047,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/surge_precise_if_critic_rlhf6pt5_rs_perfect_frr_tone_filter_format_clean_chunk:0.0764934234,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/scale_weak_areas:0.0006326655,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/knn_mitigation_1shot_v1:0.0177627691,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_original_perfect_response_format_clean_hard_chunked:0.0055737439,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_original_perfect_response_format_clean_medium_chunked:0.0710798804,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_sys_prompt_rewrite_iter1_format_clean_hard_chunked:0.0040405952,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_sys_prompt_rewrite_iter1_format_clean_medium_chunked:0.0345643785,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_general_steerability_2024_train_chunk:0.0005767622,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R3/general_helpfulness_english/table_yonder_oss_helpfulness_syngen_preachy_tone_sft_partition_is_partition_eq_true:0.0002796436,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/surge_factuality:0.0100271872,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/synthetic_factuality:0.0202778248,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/manual_factuality:0.0010659686,/mnt/wsfuse/users/ashish/yonder3_m20/factuality/factual_rm_scored_dataset_eq_prod_dataset_v4_w_sys_v2:0.0017008537,/mnt/wsfuse/users/ashish/yonder3_m20/factuality/factual_rm_scored_dataset_eq_dpo_prompt_combined_deduped_w_sys_v2:0.0021703659,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_medium:0.0097702019,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_hard:0.0067510071,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_chat:0.0334625173,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/surge_reasoning:0.0030568351,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/round3_3p_decontaminated_rlhf6_mcq_rscot_50_cjka_fix_nomath_nobio:0.0535004838,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/up_synthetic_verbal_reasoning_405_highq185_few_shot_000:0.0047810467,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/legal_mbe_bar_few_shot_000_format_v3:0.0001168333,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/cpa_few_shot_000_cot_000_format_final_v3:0.0000758074,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/moral_decontaminate:0.0002203347,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/lovish_round6_v2_format_clean:0.0003277076,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/ctg_clean_downsampled0_5_mcq_no_geeks_v3:0.0020298452,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_aime_esbs_100_0_75_M30_SCORED_DECONTAM:0.0002617977,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_harp_esbs_100_0_75_M30_SCORED_DECONTAM:0.0024037928,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_math_train_esbs_100_0_75_M30_SCORED_DECONTAM:0.0101189715,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_omni-math_esbs_100_0_75_M30_SCORED_DECONTAM:0.0014753711,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v1_esbs_100_0_75_M30_SCORED_DECONTAM:0.0152844564,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v2_esbs_100_0_75_M30_SCORED_DECONTAM:0.0174577049,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v3_esbs_100_0_75_M30_SCORED_DECONTAM:0.0096021202,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v4_esbs_100_0_75_M30_SCORED_DECONTAM:0.0072409144,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_aime_N20_VERIFIED_SCORED_DECONTAM:0.0001247012,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_aops_N20_VERIFIED_SCORED_DECONTAM:0.0044268003,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_harp_N20_VERIFIED_SCORED_DECONTAM:0.0014522667,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_math_N20_VERIFIED_SCORED_DECONTAM:0.0067446853,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_omni_math_N20_VERIFIED_SCORED_DECONTAM:0.0007385598,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v1_N20_VERIFIED_SCORED_DECONTAM:0.0108277141,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v2_N20_VERIFIED_SCORED_DECONTAM:0.0131738396,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v3_N20_VERIFIED_SCORED_DECONTAM:0.0074489788,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v4_N20_VERIFIED_SCORED_DECONTAM:0.0045074783,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/250114_r1ab_data:0.0013468764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/brainly_arpg_weak_area_mcq_mitigated_final:0.0011736716,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/reasoning_sft_mcq_final_sbs:0.0017399476,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/250106_r1_data_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0072726361,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/careers360_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0080841308,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/afanti_40k_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0127163728,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sql_sft:0.0012350136,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/scale_code_chunk:0.0000529914,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/coding/sft_data_surge_tree_sitter_top_3:0.0702169837,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sft_data_scale_tree_sitter_top_3:0.0086631491,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/turing_tree_sitter_top_3:0.0003084886,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_synthetic_code:0.0430912862,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/suchin_synthetic_code:0.0186100313,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_synthetic_javascript:0.0096131839,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sten_surge_coding_with_exec_mulitpl_synth_240613_v2_format_clean:0.0039608666,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_coding_synthetic_stackoverflow_inspired_samll_multipl_translation_v2_format_clean:0.0119571667,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sten_coding_generated_problem_stack_overflow_L3_405B_self_healing_principled_tests_v8_good_v2_format_clean:0.0074501629,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_filtered_inductive_reasoning:0.0009037746,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_275k_cruxeval_output:0.0009181477,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_275k_cruxeval_input:0.0005912409,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/debug_v5:0.0002433103,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/generated_rl_taco_merged_v2_nostep:0.0389798015,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/sujan-model:0.0173581110,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/shiqi-model:0.0173581110,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_easy:0.0059567374,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_medium:0.0069906931,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_hard:0.0062303026,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/multilingual_r6_5_mix_rm_mathv1:0.0467876285,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/surge_sft_hindi_romanized:0.0001327595,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/scale_sft_final_format_clean_chunk:0.0014430382,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/multilingual/bio:0.0007215191,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT3R2/multilingual/new_rs_souped_multilinugal_critic_rewrite_data_format_clean:0.0056681995,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/rus_v7:0.0063918702,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/zho_v5:0.0065836263,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r2/multilingual/multi_if_sft_data:0.0021400000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/tool_sft_fixlc_false_positives:0.0052821011,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/sft_stack_new:0.0091056645,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/tooling/surge_simple_format_clean_chunk:0.0026361677,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/surge_complex_format_clean_chunk:0.0038867533,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/synthetic_format_clean_chunk:0.0008346811,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/apibank_new:0.0002613764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/plugins_new:0.0005888150,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/scale_tools:0.0000051701,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/ablate_system_prompt_gorilla_format_clean:0.0000063190,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/apibank_json_new_ipython_fix:0.0002613764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/plugins_json_new_ipython_fix:0.0005888150,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/stack_json_fixed_new_v2:0.0091056645,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/swapping_functions_in_two_prompt:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/removing_params_from_ground_truth_function:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/missing_function_in_prompt:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/removing_one_required_param_from_ground_truth_function:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/int:0.0001667479,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/str:0.0001667479,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/skipping_default_params_v1:0.0001417321,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/default_param_model_gen_data_v2:0.0020826816,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/tau_oct26:0.0000035651,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/vr_pdo_p1_lang_0702_chunk:0.0000174005,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tool_safety_sft_vr_image_gen_v3_500_format_clean_chunk:0.0000259024,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tool_safety_multi_turn_sft_vr_web_search_format_clean_chunk:0.0000908000,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tone_edited_refusal_suppression_chunk:0.0002535260,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/table_yonder_safety_syngen_preachy_tone_sft_partition_is_partition_eq_true:0.0000510113,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_safety_tools_1k_chunk:0.0000393920,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_safety_prompt_pair_bt_pilot_chunk:0.0000677883,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_mlg_pdo_translation_chunk:0.0002083527,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_mlg_data_format_clean_chunk:0.0007095666,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_borderline_tools_chunk:0.0000532217,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_borderline:0.0013781542,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/scale_pair_prompt_0625_chunk:0.0046856109,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/sc_redteam_format_clean_chunk:0.0000061214,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_teaming_mutated_refusal_lcx_v2_final_chunk:0.0000038542,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_teaming_mtl_mlg_v2_chunk:0.0001079172,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_syn_v3_en_0823_vr_final_chunk:0.0000238053,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_syn_v2_en_0823_vr_final_chunk:0.0000569059,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/multilingual_safety_sft_reannotated_mtl_es_vi_hi_v1_format_clean_chunk:0.0000168904,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/mh_crsv2_xi_052424_mix_3_chunk:0.0003656941,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/l3p_ssh_cse_multi_chunk:0.0003633702,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/halo_cse_cleaned_chunk:0.0000357079,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/halo_cse_813147913798717_chunk:0.0004524132,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/frr_pdo_p1_lang_0702_chunk:0.0000209146,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/election_can_response_format_clean_chunk:0.0000266959,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cybersec_sft_vr_mitre_format_clean_chunk:0.0000507279,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cybersec_sft_frr_mitre_v2_format_clean_chunk:0.0000592864,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/crs_t0_t1_mtl_synthesized_9k_chunk:0.0004737812,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/crs_t0_t1_en_synthesized_vr_4k_chunk:0.0000683551,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/safety/crs_t0_t1_en_synthesized_frr_6k_chunk:0.0003408119,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/covalent_cse_mlg_adversarial_chunk:0.0003261887,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cbrne_sft_format_clean_chunk:0.0001082005,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/borderline_rs_strict_frr_mix5_chunk:0.0009482426,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/kevinyao/proposal/sft/new_qrs_format_cleaned_v4:0.0005078454",
80
+ "video": ""
81
+ },
82
+ "pad_mode": {
83
+ "pad_value": 0,
84
+ "seq_len": 8192
85
+ },
86
+ "pin_memory": true,
87
+ "prefetch_factor": null,
88
+ "progress_reporter_log_interval": 0,
89
+ "rng_mode": {},
90
+ "shuffle_seed": 1337,
91
+ "simulate_training_budget": null,
92
+ "speech": null,
93
+ "split_mode": {
94
+ "discard_text_only": true,
95
+ "keep_interval": 2,
96
+ "keep_strategy": "put_back"
97
+ },
98
+ "tail_token_mode": {},
99
+ "video": {
100
+ "decoder_type": "decord",
101
+ "max_num_chunks": null,
102
+ "max_video_length": -1,
103
+ "min_video_length": -1,
104
+ "num_frames_per_video": 32,
105
+ "resize_to_max_canvas": false,
106
+ "sampling_fps": null,
107
+ "use_dynamic_transform": false,
108
+ "use_ffmpeg_key_frames": false,
109
+ "use_fps": false,
110
+ "use_tile_separator_tags": false
111
+ },
112
+ "weights_update_config": {
113
+ "image_loss_weight": 0.75,
114
+ "image_weight_schedule": [
115
+ [
116
+ 0,
117
+ 0.15
118
+ ],
119
+ [
120
+ 10000,
121
+ 0.5
122
+ ]
123
+ ],
124
+ "speech_weight_schedule": null,
125
+ "text_loss_weight": null,
126
+ "text_weight_schedule": [
127
+ [
128
+ 0,
129
+ 1.0
130
+ ],
131
+ [
132
+ 10000,
133
+ 0.5
134
+ ]
135
+ ],
136
+ "video_loss_weight": null,
137
+ "video_weight_schedule": null
138
+ },
139
+ "workers_per_gpu": 1
140
+ },
141
+ "dataset_iteration_limits": null,
142
+ "deallocate_pipeline_outputs": false,
143
+ "disable_logging": false,
144
+ "disable_workers_print": false,
145
+ "dtype": "bf16",
146
+ "dummy_nccl_init": true,
147
+ "dump_dir": "/mnt/wsfuse/outputs/TI-draft-17bx16MoE-N_3-grtb7hxqrsf75c",
148
+ "dump_dir_tree_type": "sharded",
149
+ "dump_freq": 50,
150
+ "dump_profile_traces": true,
151
+ "eager_init": true,
152
+ "enable_anomaly_detection": false,
153
+ "enable_deterministic_training": false,
154
+ "enable_loss_tracker": true,
155
+ "enable_ods": true,
156
+ "enable_pynvml": false,
157
+ "et_end_itr": 15,
158
+ "et_start_itr": 12,
159
+ "eval_freq": -1,
160
+ "exp_id": "",
161
+ "exp_name": "",
162
+ "expert_parallel_size": 1,
163
+ "finetuning_dir": "",
164
+ "fp32_reduce_scatter": "all",
165
+ "gc_collect_freq": 1000,
166
+ "gpu_check_level": -1,
167
+ "increase_seq": null,
168
+ "instruct": {
169
+ "no_loss_prompt": false,
170
+ "no_loss_truncated": false
171
+ },
172
+ "instruct_data": "",
173
+ "iter_jsonl": {
174
+ "buffer_size": 64,
175
+ "same_data": false
176
+ },
177
+ "iter_multi": {
178
+ "buffer_size": 64,
179
+ "ignore_extra_chunks": true,
180
+ "iterate_chunk_by_chunk": false,
181
+ "max_precompute": 20,
182
+ "multiprocess": true
183
+ },
184
+ "iter_text_airstore": {
185
+ "airstore_max_holding_bundles_limit": 10000,
186
+ "airstore_max_resharding_factor": 128,
187
+ "airstore_sample_prefetch_limit": 10,
188
+ "airstore_seed": 727,
189
+ "dataloader_workers_per_gpu": 1,
190
+ "load_only_pp_zero": false,
191
+ "load_only_tp_zero": false,
192
+ "max_world_size": 8192,
193
+ "pin_memory": true,
194
+ "prefetch_factor": 2,
195
+ "simulate_training_budget": null,
196
+ "unique_token_fraction": null
197
+ },
198
+ "iter_type": "multi",
199
+ "keep_eval_checkpoints": false,
200
+ "keep_n_last_checkpoints": -1,
201
+ "load_optimizer_on_finetuning": false,
202
+ "log": {
203
+ "disable_scalars_tb_write": false,
204
+ "log_loss_tracker_to_scuba": false,
205
+ "log_scalar_default_log_level": "INFO",
206
+ "log_scalar_freq": 100,
207
+ "log_scalar_freq_overrides": "fp8:1000, router/modality:1000, router/dataset:1000, params:0,act:0,grads:0,grads_fsdpv2:0,debug:0,verbose_debug:0",
208
+ "log_scalar_log_level_overrides": "",
209
+ "log_scalar_version": 2.0,
210
+ "log_scalars": false,
211
+ "log_scalars_to_ods": false,
212
+ "log_scalars_to_scuba": false,
213
+ "log_tb": true,
214
+ "log_tensors": false,
215
+ "log_tensors_to_scuba": false,
216
+ "online_wandb": false,
217
+ "online_wandb_project": null,
218
+ "online_wandb_team": null,
219
+ "reduce_scalars": false
220
+ },
221
+ "log_all_steps": true,
222
+ "log_batch_checksum": true,
223
+ "log_dataloader_state": false,
224
+ "log_freq": 1,
225
+ "log_position_in_data_queue": true,
226
+ "log_updates": true,
227
+ "logitwriter": {
228
+ "compression_algo": "zstd",
229
+ "enable": false,
230
+ "index_dtype": "int32",
231
+ "logit_dtype": "float32",
232
+ "same_day_logits_backup": false,
233
+ "speech_topk": 100,
234
+ "topk": 128,
235
+ "write_lse": true
236
+ },
237
+ "loss_logging_freq": 10,
238
+ "loss_rescaling": false,
239
+ "max_image_tiles_per_gpu": 2000,
240
+ "mb_recompute_attn": true,
241
+ "mb_recompute_fc1_fc3": true,
242
+ "mem_snapshot_max_entries": 100000,
243
+ "mem_snapshot_profiling_duration": -1,
244
+ "mem_snapshot_start_step": -1,
245
+ "mem_snapshot_stop_step": -1,
246
+ "memory_efficient_pipeline": false,
247
+ "model": {
248
+ "alpha_depth": "disabled",
249
+ "alpha_lrm": 1.0,
250
+ "alpha_on_resid": false,
251
+ "alpha_separate": false,
252
+ "alpha_wdm": 1.0,
253
+ "attn_bias_type": "block_causal",
254
+ "attn_dropout": 0,
255
+ "attn_out_dropout": 0,
256
+ "attn_temperature_tuning_floor_scale": null,
257
+ "attn_temperature_tuning_layers": null,
258
+ "attn_temperature_tuning_q_scale_constant": null,
259
+ "attn_to_keep": "all",
260
+ "batchify_local_attention_len": null,
261
+ "cp_attn_perdoc": false,
262
+ "cp_attn_save_global_kv": true,
263
+ "custom_bwd": false,
264
+ "custom_bwd_sum_first_then_comms": true,
265
+ "dialog_len": null,
266
+ "dim": 5120,
267
+ "efficient_attn": "auto",
268
+ "efficient_output": false,
269
+ "enable_fsdpv2": true,
270
+ "enable_tp_overlapping": false,
271
+ "enable_weight_sharding_in_pp": false,
272
+ "enable_wgrad_sharding_in_pp": true,
273
+ "eos_id": 200001,
274
+ "every_n_layers_nope": null,
275
+ "experts_choice_moe": {
276
+ "auto_scale_F": true,
277
+ "capacity_factor": 1.0,
278
+ "clamp_above_std": false,
279
+ "compute_moe_in_fp64": false,
280
+ "drop_and_pad": false,
281
+ "enable_lb_free": false,
282
+ "enable_lb_loss": false,
283
+ "enable_router_zloss": false,
284
+ "eval_threshold_std_mult": 0.0,
285
+ "eval_with_expert_activation_model": false,
286
+ "eval_with_saved_stats": true,
287
+ "eval_with_top_k": false,
288
+ "expert_act_grad_prop_coeff": 0,
289
+ "expert_act_init_std": 0.5,
290
+ "expert_act_loss_coeff": 0.0001,
291
+ "expert_act_silu": false,
292
+ "expert_act_threshold": 0,
293
+ "expert_activation_model": false,
294
+ "fc1_clamp": null,
295
+ "fc2_clamp": null,
296
+ "fc3_clamp": null,
297
+ "fix_datasource_router_score": "",
298
+ "fix_image_router_score": null,
299
+ "fix_speech_router_score": null,
300
+ "force_looped_impl": false,
301
+ "fused_shuffle": true,
302
+ "input_scaling": false,
303
+ "input_scaling_max_clamp": 2.0,
304
+ "input_scaling_min_clamp": -2.0,
305
+ "interleave_moe_layer_step": 1,
306
+ "is_enabled": false,
307
+ "lb_free_coeff": 0.0,
308
+ "lb_loss_coeff": null,
309
+ "max_experts_per_token": null,
310
+ "moe_init_scale": 1.0,
311
+ "mult_moe_weight_grads": null,
312
+ "norm_expert_output": null,
313
+ "num_experts": 16,
314
+ "overlap_token_comm": true,
315
+ "postgate_experts": false,
316
+ "recompute_capacity_factor": null,
317
+ "routed_dropout": 0.0,
318
+ "router_clamp": null,
319
+ "router_kld_reg": 0.0,
320
+ "router_padding_coeff": null,
321
+ "router_score_gating": "sigmoid",
322
+ "router_zloss_coeff": 2.4643796217322647e-19,
323
+ "running_stats_ema": 0.99,
324
+ "running_stats_sync_freq": 100,
325
+ "saved_thresholds_are_post_sigmoid": false,
326
+ "sharding_strategy": "tp",
327
+ "shuffle_before_assign": false,
328
+ "shuffle_freq": 1,
329
+ "shuffle_group_size": 32,
330
+ "shuffle_level": 3,
331
+ "shuffle_with_random_order": true,
332
+ "shuffle_within_dp": true,
333
+ "sigmoid_in_fp32": true,
334
+ "skip_local_shuffle": false,
335
+ "std_margin": 15,
336
+ "std_margin_skip_last": true,
337
+ "std_penalty_coeff": 0.0,
338
+ "top_k": 1,
339
+ "use_fixed_topk": false,
340
+ "use_fixed_topk_bsz": 1,
341
+ "use_fsdp": true,
342
+ "use_shared_expert": true,
343
+ "use_te_in_moe": false,
344
+ "use_token_choice": true,
345
+ "zero_clamp_grads": true,
346
+ "zero_router_grads": false
347
+ },
348
+ "ffn_dim_multiplier": 1.2,
349
+ "ffn_exp": 4.0,
350
+ "ffn_in_dropout": 0,
351
+ "ffn_out_dropout": 0,
352
+ "flex_score_mod": "",
353
+ "fp8_amax_compute_algo": "max",
354
+ "fp8_amax_history_len": 1024,
355
+ "fp8_early_bf16_weight_release": false,
356
+ "fp8_fuse_wgrad_accumulation": false,
357
+ "fp8_grad_output_dynamic_scale": false,
358
+ "fp8_input_dynamic_scale": false,
359
+ "fp8_interval": 1,
360
+ "fp8_margin": 0,
361
+ "fp8_rowwise": false,
362
+ "fp8_wgrad": false,
363
+ "freeze_decoder": false,
364
+ "freeze_patterns": null,
365
+ "freeze_vision_encoder": false,
366
+ "fsdp_checkpoint_wrap_layer_frequency": 1,
367
+ "fsdpv1_flatten_params": true,
368
+ "fsdpv2_cast_root_forward_inputs": false,
369
+ "fsdpv2_cpu_offload_percentage": null,
370
+ "fsdpv2_enable_cpu_offload": false,
371
+ "fsdpv2_use_per_pg_streams": true,
372
+ "fsdpv2_wrap_pp_model_chunk_only": false,
373
+ "fuse_sequence_parallel": true,
374
+ "global_attn_cfg": "all",
375
+ "head_dim": null,
376
+ "high_freq_factor": 1,
377
+ "hsdp_replicate_num": 1,
378
+ "init": {
379
+ "coeff_std": null,
380
+ "depth_last": false,
381
+ "fixed_std": null,
382
+ "no_init": false,
383
+ "router_coeff_std": 0.1,
384
+ "truncate_std_mult": 2.0,
385
+ "use_depth": "current",
386
+ "use_gaussian": true
387
+ },
388
+ "layer_ckpt": "all",
389
+ "lc_rope_len": 0,
390
+ "lc_rope_prob": 0.0,
391
+ "less_layer_first_pp_stage": 0,
392
+ "less_layer_last_pp_stage": 0,
393
+ "local_attention_window_len": null,
394
+ "loss_parallel": true,
395
+ "max_length": 2048,
396
+ "metap": {
397
+ "base_width": 1024.0,
398
+ "coeff_std": 1.0,
399
+ "m_emb": 1.0,
400
+ "metap_mode": "ntp",
401
+ "tie_router_bulk_coeff_std": false,
402
+ "use_metap": false
403
+ },
404
+ "modalities": {
405
+ "freeze_llm": false,
406
+ "image": {
407
+ "enable_projection": true,
408
+ "encoder_name": "llama4_flash_encoder",
409
+ "encoder_params": null,
410
+ "freeze_vision_encoder": true,
411
+ "image_height": 336,
412
+ "image_width": 336,
413
+ "patch_height": 14,
414
+ "patch_width": 14,
415
+ "ps_ratio": 0.5,
416
+ "recompute_transformer": true,
417
+ "return_intermediate": null,
418
+ "use_cached_embeddings": false,
419
+ "use_dynamic_transform": true,
420
+ "vision_adapter_type": "pixel_shuffle_mlp",
421
+ "vision_encoder_ckpt_path": "/mnt/wsfuse/nextgen_mm/vision_encoders/llama4_flash_encoder_final_1023_ema",
422
+ "vision_encoding_batch_size": null
423
+ },
424
+ "speech": {
425
+ "append_quantization_output": false,
426
+ "data_format_args": {
427
+ "disallow_text_free_seg": true,
428
+ "emit_text_right_after_sys_start": true,
429
+ "enable_speech_text_hybrid": false,
430
+ "hybrid_generation_mode": "single_token_emit",
431
+ "hybrid_understanding_mode": "streaming",
432
+ "jitter_system_prompt": false,
433
+ "jitter_system_prompt_today_date": false,
434
+ "num_words_in_unit": 1,
435
+ "speech_delay": 1,
436
+ "system_text_lookahead": 0,
437
+ "tool_token_delay_ms_max": 100,
438
+ "tool_token_delay_ms_min": 0,
439
+ "transfer_dates_to_template": false,
440
+ "turn_start_with_white_space": false,
441
+ "user_text_delay": 6
442
+ },
443
+ "discrete_codebooks_size": 65536,
444
+ "enable_aux_user_output": false,
445
+ "enable_full_duplex": true,
446
+ "enable_output": true,
447
+ "encoder_device": "cpu",
448
+ "freeze_speech_encoder": true,
449
+ "is_tokenizer": true,
450
+ "load_tokenizer": true,
451
+ "share_speech_emb": false,
452
+ "speech_encoder": "v2_2411",
453
+ "speech_encoder_ckpt_dir": null,
454
+ "speech_extend_vocab_size": 256,
455
+ "speech_feature_dim": 640,
456
+ "speech_output_control_format": "",
457
+ "speech_projection_dim": 1536,
458
+ "speech_separate_softmax": true,
459
+ "speech_train_audio_end": false,
460
+ "speech_train_audio_start": false,
461
+ "target_speaker_table_size": 0,
462
+ "use_discrete_codes": true,
463
+ "use_embedding": true,
464
+ "use_fp32_for_speech_output": true,
465
+ "use_fp64": true,
466
+ "use_projection": false,
467
+ "user_embedding_by_concat": false,
468
+ "user_embedding_by_permutation": true,
469
+ "user_projection_use_mlp": false
470
+ },
471
+ "use_image": false,
472
+ "use_speech": false,
473
+ "use_video": false
474
+ },
475
+ "multiple_of": 2048,
476
+ "n_heads": 40,
477
+ "n_kv_heads": 8,
478
+ "n_layers": 3,
479
+ "non_linearity": "swiglu",
480
+ "nope_no_qk_norm": true,
481
+ "norm_affine": true,
482
+ "norm_eps": 1e-05,
483
+ "norm_type": "rmsnorm",
484
+ "num_unfrozen_layers": 0,
485
+ "output_size": 202048,
486
+ "parallel_decoding": {
487
+ "enable_fc_parallelism": true,
488
+ "fc_with_bias": false,
489
+ "first_and_last_norm_required": true,
490
+ "has_parallel_decoding": true,
491
+ "parallel_decoding_type": "EAGLE",
492
+ "share_input_output_embed_with_target": true
493
+ },
494
+ "parallel_output_norm": true,
495
+ "peft_args": null,
496
+ "pp_use_tensor_pool": false,
497
+ "pre_norm": true,
498
+ "prefetch_weight_latency": 1.0,
499
+ "qat_args": null,
500
+ "qk_norm_across_heads": false,
501
+ "qk_norm_affine": false,
502
+ "recompute_attn": false,
503
+ "recompute_fc1_out": true,
504
+ "recompute_fc3_out": true,
505
+ "recompute_q_norm": false,
506
+ "rope_attn_scale": false,
507
+ "rope_scale_factor": 16,
508
+ "rope_theta": 500000.0,
509
+ "rope_use_fp32_in_outer_product": true,
510
+ "sequence_parallel": true,
511
+ "share_emb": false,
512
+ "stochastic_depth_p_attn": 0,
513
+ "stochastic_depth_p_ffn": 0,
514
+ "te_use_fsdp_mixed_precision": true,
515
+ "use_flex_attn": false,
516
+ "use_fp8": false,
517
+ "use_qk_norm": true,
518
+ "use_rope": true,
519
+ "use_scaled_rope": true,
520
+ "use_te_layers": true,
521
+ "vocab_parallel": true,
522
+ "vocab_size": 202048
523
+ },
524
+ "model_parallel_size": 8,
525
+ "model_precheck": false,
526
+ "nan_detector_steps": 0,
527
+ "no_final_ckpt": false,
528
+ "num_layers_per_virtual_pipeline_stage": null,
529
+ "num_microbatches_with_partial_activation_checkpoints": 1,
530
+ "number_of_manifold_servers_per_host": 8,
531
+ "old_mp": -1,
532
+ "old_world_size": -1,
533
+ "optim": {
534
+ "annealing_step": 10000,
535
+ "beta1": 0.9,
536
+ "beta2": 0.95,
537
+ "clip": 1.0,
538
+ "cosine_theta": 1.0,
539
+ "cycle_length": 1.0,
540
+ "decay_length_fraction": 0.1,
541
+ "epsilon": 1e-08,
542
+ "exp_factor": 0.5,
543
+ "fused": null,
544
+ "grad_accumulate_steps": 1,
545
+ "independent_weight_decay": false,
546
+ "lr": 0.0002,
547
+ "lr_min_ratio": 0.1,
548
+ "modality_order": "text,vision,speech,speech_full_duplex",
549
+ "non_nope_lr_mult": null,
550
+ "nope_lr_mult": null,
551
+ "scheduler": "constant",
552
+ "start_annealing_step": -1,
553
+ "use_fp32_copy_optim": true,
554
+ "vision_encoder_lr": null,
555
+ "vision_projection_lr": null,
556
+ "warmup": 400,
557
+ "weight_decay": 0.1
558
+ },
559
+ "optimize_backward_concat": false,
560
+ "overlap_p2p_communication": true,
561
+ "paft": {
562
+ "all_reduce_timeout_grow_ms": 300000,
563
+ "all_reduce_timeout_ms": 60000,
564
+ "ctran_port_base": 18700,
565
+ "enable": false,
566
+ "ib_exchange_port_base": 18600,
567
+ "max_quorum_num_retries": 5,
568
+ "max_step_retries": 5,
569
+ "min_replicas_to_run": null,
570
+ "qp_connect_timeout_ms": null,
571
+ "replica_collective_timeout_s": 600,
572
+ "send_recv_timeout_ms": 5000,
573
+ "startup_sleep_ms": 10000,
574
+ "test_only_barrier_timeout_s": 180,
575
+ "test_only_skip_ftar": false
576
+ },
577
+ "periodic_gpu_check": false,
578
+ "pg_tuning_options_from_yaml": "",
579
+ "pipeline_parallel_microbatch_size": 1,
580
+ "pipeline_parallel_size": 1,
581
+ "pipeline_strategy": "dora",
582
+ "power_consumer": {
583
+ "enable": false,
584
+ "run_delay_steps": 0,
585
+ "run_duration_steps": 100,
586
+ "run_freq": 1000,
587
+ "run_mode": "periodic"
588
+ },
589
+ "pp_num_warm_up_microbatch_ratio": 1.0,
590
+ "profile_acc_events": false,
591
+ "profile_barrier_timeout_s": 0,
592
+ "profile_freq": -1,
593
+ "profile_num_steps_active": 1,
594
+ "profile_record_shapes": true,
595
+ "profile_with_stack": false,
596
+ "py_spy_args": {
597
+ "active_seconds": 600,
598
+ "format": "flamegraph",
599
+ "freq": -1,
600
+ "rank0_only": true,
601
+ "rate": 50,
602
+ "start_offset": 10
603
+ },
604
+ "recompute_all_mb": false,
605
+ "reshard_after_forward": true,
606
+ "restore_dataloader_position": false,
607
+ "root_dump_dir": "/mnt/wsfuse/outputs/xldumps",
608
+ "runtime_nccl_timeout_s": 600,
609
+ "sample_across_datasets": true,
610
+ "seq_len": 8192,
611
+ "skip_evals_during_training": true,
612
+ "slurm": {
613
+ "global_rank": 0,
614
+ "is_slurm_job": false,
615
+ "role_index": 0,
616
+ "role_rank": 0,
617
+ "role_replica_count": 1,
618
+ "role_world_size": 256,
619
+ "world_size": 256
620
+ },
621
+ "speech_loss": {
622
+ "aux_aligned_text_loss": false,
623
+ "aux_user_loss_weight": 0.9,
624
+ "dual_channel_aux_user_loss_weight": 0.9,
625
+ "enable": false,
626
+ "force_simulated_sys_loss": true,
627
+ "full_duplex_dual_loss_mode": "sample",
628
+ "kind": "separate_softmax",
629
+ "log_logits": false,
630
+ "maybe_tool_token_loss_weight": null,
631
+ "perfect_silence_id": null,
632
+ "speech_loss_weight": null,
633
+ "system_floors_weight": null,
634
+ "system_perfect_silence_weight": 0.0,
635
+ "system_text_escape_audio_weight": null,
636
+ "user_text_escape_audio_weight": null
637
+ },
638
+ "steps": 1050000,
639
+ "text_only_steps": null,
640
+ "tokenizer": {
641
+ "path": "/mnt/wsfuse/tokenizers/tiktoken/l4_200k_base",
642
+ "version": "llama4_tiktoken_v6"
643
+ },
644
+ "tokenizer_dir": "/mnt/wsfuse/tokenizers/tiktoken",
645
+ "torch_seed": 0,
646
+ "unlimited_steps": false,
647
+ "use_sum_loss": false,
648
+ "valid": {
649
+ "batch_size": 32,
650
+ "debug": false,
651
+ "majority_voting": 0,
652
+ "n_batches": 100,
653
+ "ppl_files_str": "",
654
+ "prompt_path": "",
655
+ "random_fewshots": false,
656
+ "seed": 42,
657
+ "seq_len": 2048,
658
+ "skip_sanity_check": false,
659
+ "tasks_root_dir": "",
660
+ "tasks_str": "",
661
+ "temperature": 0.0,
662
+ "top_k": 0,
663
+ "top_p": 0.0,
664
+ "use_sampling": false,
665
+ "write_eval": false
666
+ },
667
+ "z_loss_multiplier": 0.0
668
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "image_processor_type": "Llama4ImageProcessorFast",
17
+ "image_std": [
18
+ 0.5,
19
+ 0.5,
20
+ 0.5
21
+ ],
22
+ "input_data_format": null,
23
+ "max_patches": 16,
24
+ "processor_class": "Llama4Processor",
25
+ "resample": 2,
26
+ "rescale_factor": 0.00392156862745098,
27
+ "resize_to_max_canvas": false,
28
+ "return_tensors": null,
29
+ "size": {
30
+ "height": 336,
31
+ "width": 336
32
+ }
33
+ }
processor_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "fake_image_token": "<|image|>",
3
+ "image_token": "<|image|>",
4
+ "patch_size": 14,
5
+ "processor_class": "Llama4Processor"
6
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|begin_of_text|>",
3
+ "eos_token": "<|end_of_text|>",
4
+ "pad_token": "<|finetune_right_pad_id|>"
5
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22e009b4fcb58eddbabf347e71b9881ea1e6eb72d44e5ea9477c7587df68fd8d
3
+ size 27948580
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff