| { | |
| "aggregate_gradients_by_tokens": true, | |
| "alternate_pp_config": true, | |
| "async_batch_iterator": false, | |
| "async_batch_iterator_timeout_s": 600, | |
| "async_checkpointing": true, | |
| "async_eval_ngpus": -1, | |
| "attach_debugpy": false, | |
| "background_nccl_init": false, | |
| "batch_p2p_communication": false, | |
| "batch_size": 2, | |
| "cached_file_unique_prefix": "", | |
| "checkpoint": { | |
| "async_checkpointing_staging_method": "async_copy_async_serialize", | |
| "barrier_timeout_secs": 120, | |
| "checkpoint_barrier_type": "sc", | |
| "checkpoint_gc_use_rmdir": true, | |
| "checkpoint_groups": false, | |
| "checkpoint_server_max_attempts": 10, | |
| "checkpoint_server_num_chunks": 10, | |
| "checkpoint_server_num_threads": 20, | |
| "checkpoint_server_op_timeout_secs": 10.0, | |
| "checkpoint_server_threads": 10, | |
| "checkpoint_server_timeout_secs": 60.0, | |
| "dump_freq_ephemeral": -1, | |
| "eager_init_staging_buffer": false, | |
| "live_checkpointing": false, | |
| "on_demand_checkpointing": false, | |
| "sleep_interval": 10, | |
| "staging_block_every_n_tensors": -1, | |
| "timeout_all_shard_exists": 300, | |
| "timeout_barrier_init_secs": 300, | |
| "timeout_execution": 1800, | |
| "timeout_folder_exists": 300, | |
| "timeout_process_init_secs": 60, | |
| "use_checkpoint_barrier_tcpstore_libuv": true, | |
| "use_checkpoint_barrier_wait_for_all_files": true, | |
| "use_checkpoint_barrier_wait_for_dir": false, | |
| "use_checkpointing_process": true, | |
| "use_shm_manager_for_async_cp": false, | |
| "wait_for_tensor_timeout_s": 120 | |
| }, | |
| "checkpoint_dump_dir": "/mnt/wsfuse/outputs/TI-draft-17bx16MoE-N_3-grtb7hxqrsf75c", | |
| "collect_et": false, | |
| "context_parallel_size": 1, | |
| "data": "", | |
| "dataloader": { | |
| "always_trim_text": true, | |
| "concurrency_timeout_s": 300, | |
| "concurrent": false, | |
| "datamix": "", | |
| "enable_packing": true, | |
| "image": { | |
| "image_height": 336, | |
| "image_width": 336, | |
| "max_num_chunks": 16, | |
| "patch_height": 14, | |
| "patch_width": 14, | |
| "ps_ratio": 0.5, | |
| "resize_to_max_canvas": false, | |
| "suppress_dataloader_errors": false, | |
| "use_dynamic_transform": true, | |
| "use_pixel_shuffle": true, | |
| "use_tile_separator_tags": true | |
| }, | |
| "load_only_tp_zero": false, | |
| "logging_config": { | |
| "log_buffer_size": false, | |
| "log_every_n_steps": 10, | |
| "log_first_batch": false, | |
| "log_full_dataloader_state": false, | |
| "log_metadata": false | |
| }, | |
| "max_world_size": null, | |
| "mix_mode": {}, | |
| "modality_datamix": { | |
| "image": "/mnt/wsfuse/nikhilmeht/0321/sft/lichengyu/datamixes/llama4/multi_image/round3_sft_mim1_cleaned.json:0.0582033824,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_stem_math_50p_pruned_cleaned.json:0.1513287942,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_chart_50p_pruned_mitigated_blurred_cleaned.json:0.0753349440,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_natural_image_50p_pruned_mitigated_blurred_cleaned.json:0.0753349440,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_stem_non_math_50p_pruned_mitigated_blurred_cleaned.json:0.0527124973,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_diagram_50p_pruned_cleaned.json:0.0169119262,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_table_50p_pruned_cleaned.json:0.0672084340,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_doc_50p_pruned_cleaned.json:0.0419503624,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_infographics_50p_pruned_mitigated_blurred_cleaned.json:0.1172853064,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_others_50p_pruned_cleaned.json:0.0597408302,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_multilingual_50p_pruned_mitigated_blurred_cleaned.json:0.0839007248,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_coding_50p_pruned_cleaned.json:0.0002196354,/mnt/wsfuse/nikhilmeht/0321/sft/trangleminh/datamixes/metaai_diverse_vr7500_frr29k_cse3k_pri2700_cleaned.json:0.0224028113,/mnt/wsfuse/nikhilmeht/0321/sft/szha/data_mix/llama4_video_3p_20250211_cleaned.json:0.0296507797,/mnt/wsfuse/nikhilmeht/0321/sft/nextgen_mm/datasets/zhouxy/datamix/refcoco50kx2resizex1_multbox10k_augprompt_vgblurfix10k_o365dedup3f80k_cleaned.json:0.0912836009,/mnt/wsfuse/nikhilmeht/0321/sft/pengchuanzhang/l4_sft_r4_datamix/jsons/grounding/vcr_distill_val_new_cleaned.json:0.0382832843,meta_ai_i18n_syn_gen_image_sft_v2_03_08_allcountry:0.0098835932,gen_ai_mmllm_llama4_video_1p_sft_blurred_fb_ig_v6_no_audio_reencoded_32frames_20250213_mitigated_20250307:0.0665495278,meta_ai_image_sft_dataflywheel_train_hw_sythetic_data_v2_10k:0.0048319789,gen_ai_mmllm_llama4_multi_image_1p_sft_ig_v5_mitigated:0.0665495278", | |
| "speech": "", | |
| "text": "/mnt/wsfuse/users/ashish/yonder3_m20/sag/sft:0.0010000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/yonder2_planner_v5:0.0016000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/yonder2_3_simple_qa_cw_search_trigger_filter:0.0000050000,/mnt/wsfuse/users/ashish/yonder3_m20/agentic_search/yonder2_3_multi_turn_from_mase_no_fdd:0.0000500000,/mnt/wsfuse/users/ashish/yonder3_m20/agentic_search/yonder2_3_sag:0.0004000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/mlt_search_triggering:0.0001013947,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/mlt_no_search_triggering:0.0001783212,/mnt/wsfuse/zihangm/files_from_gsheet/general_helpfulness_english/biography_questions:0.0000300000,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_legal_issues:0.0000119361,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_medical_issues:0.0000119361,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_wellbeing_issues_social_media:0.0000119361,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/hard_legal_mcq_w_reasoning_sft:0.0053461996,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/autoif_filtered_prompt_v1_v2_filtered_pass_75_format_filter_valid_link_fix_code:0.0036983832,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/surge_precise_if_critic_rewrite_iter1_perfect_frr_tone_filter_format_clean_chunk:0.0446290047,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/surge_precise_if_critic_rlhf6pt5_rs_perfect_frr_tone_filter_format_clean_chunk:0.0764934234,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/scale_weak_areas:0.0006326655,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/knn_mitigation_1shot_v1:0.0177627691,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_original_perfect_response_format_clean_hard_chunked:0.0055737439,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_original_perfect_response_format_clean_medium_chunked:0.0710798804,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_sys_prompt_rewrite_iter1_format_clean_hard_chunked:0.0040405952,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_sys_prompt_rewrite_iter1_format_clean_medium_chunked:0.0345643785,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_general_steerability_2024_train_chunk:0.0005767622,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R3/general_helpfulness_english/table_yonder_oss_helpfulness_syngen_preachy_tone_sft_partition_is_partition_eq_true:0.0002796436,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/surge_factuality:0.0100271872,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/synthetic_factuality:0.0202778248,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/manual_factuality:0.0010659686,/mnt/wsfuse/users/ashish/yonder3_m20/factuality/factual_rm_scored_dataset_eq_prod_dataset_v4_w_sys_v2:0.0017008537,/mnt/wsfuse/users/ashish/yonder3_m20/factuality/factual_rm_scored_dataset_eq_dpo_prompt_combined_deduped_w_sys_v2:0.0021703659,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_medium:0.0097702019,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_hard:0.0067510071,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_chat:0.0334625173,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/surge_reasoning:0.0030568351,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/round3_3p_decontaminated_rlhf6_mcq_rscot_50_cjka_fix_nomath_nobio:0.0535004838,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/up_synthetic_verbal_reasoning_405_highq185_few_shot_000:0.0047810467,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/legal_mbe_bar_few_shot_000_format_v3:0.0001168333,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/cpa_few_shot_000_cot_000_format_final_v3:0.0000758074,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/moral_decontaminate:0.0002203347,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/lovish_round6_v2_format_clean:0.0003277076,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/ctg_clean_downsampled0_5_mcq_no_geeks_v3:0.0020298452,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_aime_esbs_100_0_75_M30_SCORED_DECONTAM:0.0002617977,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_harp_esbs_100_0_75_M30_SCORED_DECONTAM:0.0024037928,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_math_train_esbs_100_0_75_M30_SCORED_DECONTAM:0.0101189715,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_omni-math_esbs_100_0_75_M30_SCORED_DECONTAM:0.0014753711,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v1_esbs_100_0_75_M30_SCORED_DECONTAM:0.0152844564,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v2_esbs_100_0_75_M30_SCORED_DECONTAM:0.0174577049,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v3_esbs_100_0_75_M30_SCORED_DECONTAM:0.0096021202,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v4_esbs_100_0_75_M30_SCORED_DECONTAM:0.0072409144,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_aime_N20_VERIFIED_SCORED_DECONTAM:0.0001247012,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_aops_N20_VERIFIED_SCORED_DECONTAM:0.0044268003,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_harp_N20_VERIFIED_SCORED_DECONTAM:0.0014522667,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_math_N20_VERIFIED_SCORED_DECONTAM:0.0067446853,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_omni_math_N20_VERIFIED_SCORED_DECONTAM:0.0007385598,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v1_N20_VERIFIED_SCORED_DECONTAM:0.0108277141,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v2_N20_VERIFIED_SCORED_DECONTAM:0.0131738396,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v3_N20_VERIFIED_SCORED_DECONTAM:0.0074489788,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v4_N20_VERIFIED_SCORED_DECONTAM:0.0045074783,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/250114_r1ab_data:0.0013468764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/brainly_arpg_weak_area_mcq_mitigated_final:0.0011736716,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/reasoning_sft_mcq_final_sbs:0.0017399476,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/250106_r1_data_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0072726361,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/careers360_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0080841308,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/afanti_40k_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0127163728,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sql_sft:0.0012350136,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/scale_code_chunk:0.0000529914,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/coding/sft_data_surge_tree_sitter_top_3:0.0702169837,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sft_data_scale_tree_sitter_top_3:0.0086631491,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/turing_tree_sitter_top_3:0.0003084886,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_synthetic_code:0.0430912862,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/suchin_synthetic_code:0.0186100313,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_synthetic_javascript:0.0096131839,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sten_surge_coding_with_exec_mulitpl_synth_240613_v2_format_clean:0.0039608666,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_coding_synthetic_stackoverflow_inspired_samll_multipl_translation_v2_format_clean:0.0119571667,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sten_coding_generated_problem_stack_overflow_L3_405B_self_healing_principled_tests_v8_good_v2_format_clean:0.0074501629,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_filtered_inductive_reasoning:0.0009037746,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_275k_cruxeval_output:0.0009181477,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_275k_cruxeval_input:0.0005912409,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/debug_v5:0.0002433103,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/generated_rl_taco_merged_v2_nostep:0.0389798015,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/sujan-model:0.0173581110,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/shiqi-model:0.0173581110,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_easy:0.0059567374,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_medium:0.0069906931,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_hard:0.0062303026,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/multilingual_r6_5_mix_rm_mathv1:0.0467876285,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/surge_sft_hindi_romanized:0.0001327595,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/scale_sft_final_format_clean_chunk:0.0014430382,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/multilingual/bio:0.0007215191,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT3R2/multilingual/new_rs_souped_multilinugal_critic_rewrite_data_format_clean:0.0056681995,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/rus_v7:0.0063918702,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/zho_v5:0.0065836263,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r2/multilingual/multi_if_sft_data:0.0021400000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/tool_sft_fixlc_false_positives:0.0052821011,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/sft_stack_new:0.0091056645,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/tooling/surge_simple_format_clean_chunk:0.0026361677,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/surge_complex_format_clean_chunk:0.0038867533,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/synthetic_format_clean_chunk:0.0008346811,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/apibank_new:0.0002613764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/plugins_new:0.0005888150,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/scale_tools:0.0000051701,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/ablate_system_prompt_gorilla_format_clean:0.0000063190,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/apibank_json_new_ipython_fix:0.0002613764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/plugins_json_new_ipython_fix:0.0005888150,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/stack_json_fixed_new_v2:0.0091056645,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/swapping_functions_in_two_prompt:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/removing_params_from_ground_truth_function:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/missing_function_in_prompt:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/removing_one_required_param_from_ground_truth_function:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/int:0.0001667479,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/str:0.0001667479,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/skipping_default_params_v1:0.0001417321,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/default_param_model_gen_data_v2:0.0020826816,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/tau_oct26:0.0000035651,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/vr_pdo_p1_lang_0702_chunk:0.0000174005,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tool_safety_sft_vr_image_gen_v3_500_format_clean_chunk:0.0000259024,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tool_safety_multi_turn_sft_vr_web_search_format_clean_chunk:0.0000908000,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tone_edited_refusal_suppression_chunk:0.0002535260,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/table_yonder_safety_syngen_preachy_tone_sft_partition_is_partition_eq_true:0.0000510113,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_safety_tools_1k_chunk:0.0000393920,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_safety_prompt_pair_bt_pilot_chunk:0.0000677883,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_mlg_pdo_translation_chunk:0.0002083527,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_mlg_data_format_clean_chunk:0.0007095666,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_borderline_tools_chunk:0.0000532217,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_borderline:0.0013781542,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/scale_pair_prompt_0625_chunk:0.0046856109,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/sc_redteam_format_clean_chunk:0.0000061214,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_teaming_mutated_refusal_lcx_v2_final_chunk:0.0000038542,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_teaming_mtl_mlg_v2_chunk:0.0001079172,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_syn_v3_en_0823_vr_final_chunk:0.0000238053,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_syn_v2_en_0823_vr_final_chunk:0.0000569059,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/multilingual_safety_sft_reannotated_mtl_es_vi_hi_v1_format_clean_chunk:0.0000168904,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/mh_crsv2_xi_052424_mix_3_chunk:0.0003656941,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/l3p_ssh_cse_multi_chunk:0.0003633702,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/halo_cse_cleaned_chunk:0.0000357079,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/halo_cse_813147913798717_chunk:0.0004524132,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/frr_pdo_p1_lang_0702_chunk:0.0000209146,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/election_can_response_format_clean_chunk:0.0000266959,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cybersec_sft_vr_mitre_format_clean_chunk:0.0000507279,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cybersec_sft_frr_mitre_v2_format_clean_chunk:0.0000592864,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/crs_t0_t1_mtl_synthesized_9k_chunk:0.0004737812,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/crs_t0_t1_en_synthesized_vr_4k_chunk:0.0000683551,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/safety/crs_t0_t1_en_synthesized_frr_6k_chunk:0.0003408119,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/covalent_cse_mlg_adversarial_chunk:0.0003261887,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cbrne_sft_format_clean_chunk:0.0001082005,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/borderline_rs_strict_frr_mix5_chunk:0.0009482426,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/kevinyao/proposal/sft/new_qrs_format_cleaned_v4:0.0005078454", | |
| "video": "" | |
| }, | |
| "pad_mode": { | |
| "pad_value": 0, | |
| "seq_len": 8192 | |
| }, | |
| "pin_memory": true, | |
| "prefetch_factor": null, | |
| "progress_reporter_log_interval": 0, | |
| "rng_mode": {}, | |
| "shuffle_seed": 1337, | |
| "simulate_training_budget": null, | |
| "speech": null, | |
| "split_mode": { | |
| "discard_text_only": true, | |
| "keep_interval": 2, | |
| "keep_strategy": "put_back" | |
| }, | |
| "tail_token_mode": {}, | |
| "video": { | |
| "decoder_type": "decord", | |
| "max_num_chunks": null, | |
| "max_video_length": -1, | |
| "min_video_length": -1, | |
| "num_frames_per_video": 32, | |
| "resize_to_max_canvas": false, | |
| "sampling_fps": null, | |
| "use_dynamic_transform": false, | |
| "use_ffmpeg_key_frames": false, | |
| "use_fps": false, | |
| "use_tile_separator_tags": false | |
| }, | |
| "weights_update_config": { | |
| "image_loss_weight": 0.75, | |
| "image_weight_schedule": [ | |
| [ | |
| 0, | |
| 0.15 | |
| ], | |
| [ | |
| 10000, | |
| 0.5 | |
| ] | |
| ], | |
| "speech_weight_schedule": null, | |
| "text_loss_weight": null, | |
| "text_weight_schedule": [ | |
| [ | |
| 0, | |
| 1.0 | |
| ], | |
| [ | |
| 10000, | |
| 0.5 | |
| ] | |
| ], | |
| "video_loss_weight": null, | |
| "video_weight_schedule": null | |
| }, | |
| "workers_per_gpu": 1 | |
| }, | |
| "dataset_iteration_limits": null, | |
| "deallocate_pipeline_outputs": false, | |
| "disable_logging": false, | |
| "disable_workers_print": false, | |
| "dtype": "bf16", | |
| "dummy_nccl_init": true, | |
| "dump_dir": "/mnt/wsfuse/outputs/TI-draft-17bx16MoE-N_3-grtb7hxqrsf75c", | |
| "dump_dir_tree_type": "sharded", | |
| "dump_freq": 50, | |
| "dump_profile_traces": true, | |
| "eager_init": true, | |
| "enable_anomaly_detection": false, | |
| "enable_deterministic_training": false, | |
| "enable_loss_tracker": true, | |
| "enable_ods": true, | |
| "enable_pynvml": false, | |
| "et_end_itr": 15, | |
| "et_start_itr": 12, | |
| "eval_freq": -1, | |
| "exp_id": "", | |
| "exp_name": "", | |
| "expert_parallel_size": 1, | |
| "finetuning_dir": "", | |
| "fp32_reduce_scatter": "all", | |
| "gc_collect_freq": 1000, | |
| "gpu_check_level": -1, | |
| "increase_seq": null, | |
| "instruct": { | |
| "no_loss_prompt": false, | |
| "no_loss_truncated": false | |
| }, | |
| "instruct_data": "", | |
| "iter_jsonl": { | |
| "buffer_size": 64, | |
| "same_data": false | |
| }, | |
| "iter_multi": { | |
| "buffer_size": 64, | |
| "ignore_extra_chunks": true, | |
| "iterate_chunk_by_chunk": false, | |
| "max_precompute": 20, | |
| "multiprocess": true | |
| }, | |
| "iter_text_airstore": { | |
| "airstore_max_holding_bundles_limit": 10000, | |
| "airstore_max_resharding_factor": 128, | |
| "airstore_sample_prefetch_limit": 10, | |
| "airstore_seed": 727, | |
| "dataloader_workers_per_gpu": 1, | |
| "load_only_pp_zero": false, | |
| "load_only_tp_zero": false, | |
| "max_world_size": 8192, | |
| "pin_memory": true, | |
| "prefetch_factor": 2, | |
| "simulate_training_budget": null, | |
| "unique_token_fraction": null | |
| }, | |
| "iter_type": "multi", | |
| "keep_eval_checkpoints": false, | |
| "keep_n_last_checkpoints": -1, | |
| "load_optimizer_on_finetuning": false, | |
| "log": { | |
| "disable_scalars_tb_write": false, | |
| "log_loss_tracker_to_scuba": false, | |
| "log_scalar_default_log_level": "INFO", | |
| "log_scalar_freq": 100, | |
| "log_scalar_freq_overrides": "fp8:1000, router/modality:1000, router/dataset:1000, params:0,act:0,grads:0,grads_fsdpv2:0,debug:0,verbose_debug:0", | |
| "log_scalar_log_level_overrides": "", | |
| "log_scalar_version": 2.0, | |
| "log_scalars": false, | |
| "log_scalars_to_ods": false, | |
| "log_scalars_to_scuba": false, | |
| "log_tb": true, | |
| "log_tensors": false, | |
| "log_tensors_to_scuba": false, | |
| "online_wandb": false, | |
| "online_wandb_project": null, | |
| "online_wandb_team": null, | |
| "reduce_scalars": false | |
| }, | |
| "log_all_steps": true, | |
| "log_batch_checksum": true, | |
| "log_dataloader_state": false, | |
| "log_freq": 1, | |
| "log_position_in_data_queue": true, | |
| "log_updates": true, | |
| "logitwriter": { | |
| "compression_algo": "zstd", | |
| "enable": false, | |
| "index_dtype": "int32", | |
| "logit_dtype": "float32", | |
| "same_day_logits_backup": false, | |
| "speech_topk": 100, | |
| "topk": 128, | |
| "write_lse": true | |
| }, | |
| "loss_logging_freq": 10, | |
| "loss_rescaling": false, | |
| "max_image_tiles_per_gpu": 2000, | |
| "mb_recompute_attn": true, | |
| "mb_recompute_fc1_fc3": true, | |
| "mem_snapshot_max_entries": 100000, | |
| "mem_snapshot_profiling_duration": -1, | |
| "mem_snapshot_start_step": -1, | |
| "mem_snapshot_stop_step": -1, | |
| "memory_efficient_pipeline": false, | |
| "model": { | |
| "alpha_depth": "disabled", | |
| "alpha_lrm": 1.0, | |
| "alpha_on_resid": false, | |
| "alpha_separate": false, | |
| "alpha_wdm": 1.0, | |
| "attn_bias_type": "block_causal", | |
| "attn_dropout": 0, | |
| "attn_out_dropout": 0, | |
| "attn_temperature_tuning_floor_scale": null, | |
| "attn_temperature_tuning_layers": null, | |
| "attn_temperature_tuning_q_scale_constant": null, | |
| "attn_to_keep": "all", | |
| "batchify_local_attention_len": null, | |
| "cp_attn_perdoc": false, | |
| "cp_attn_save_global_kv": true, | |
| "custom_bwd": false, | |
| "custom_bwd_sum_first_then_comms": true, | |
| "dialog_len": null, | |
| "dim": 5120, | |
| "efficient_attn": "auto", | |
| "efficient_output": false, | |
| "enable_fsdpv2": true, | |
| "enable_tp_overlapping": false, | |
| "enable_weight_sharding_in_pp": false, | |
| "enable_wgrad_sharding_in_pp": true, | |
| "eos_id": 200001, | |
| "every_n_layers_nope": null, | |
| "experts_choice_moe": { | |
| "auto_scale_F": true, | |
| "capacity_factor": 1.0, | |
| "clamp_above_std": false, | |
| "compute_moe_in_fp64": false, | |
| "drop_and_pad": false, | |
| "enable_lb_free": false, | |
| "enable_lb_loss": false, | |
| "enable_router_zloss": false, | |
| "eval_threshold_std_mult": 0.0, | |
| "eval_with_expert_activation_model": false, | |
| "eval_with_saved_stats": true, | |
| "eval_with_top_k": false, | |
| "expert_act_grad_prop_coeff": 0, | |
| "expert_act_init_std": 0.5, | |
| "expert_act_loss_coeff": 0.0001, | |
| "expert_act_silu": false, | |
| "expert_act_threshold": 0, | |
| "expert_activation_model": false, | |
| "fc1_clamp": null, | |
| "fc2_clamp": null, | |
| "fc3_clamp": null, | |
| "fix_datasource_router_score": "", | |
| "fix_image_router_score": null, | |
| "fix_speech_router_score": null, | |
| "force_looped_impl": false, | |
| "fused_shuffle": true, | |
| "input_scaling": false, | |
| "input_scaling_max_clamp": 2.0, | |
| "input_scaling_min_clamp": -2.0, | |
| "interleave_moe_layer_step": 1, | |
| "is_enabled": false, | |
| "lb_free_coeff": 0.0, | |
| "lb_loss_coeff": null, | |
| "max_experts_per_token": null, | |
| "moe_init_scale": 1.0, | |
| "mult_moe_weight_grads": null, | |
| "norm_expert_output": null, | |
| "num_experts": 16, | |
| "overlap_token_comm": true, | |
| "postgate_experts": false, | |
| "recompute_capacity_factor": null, | |
| "routed_dropout": 0.0, | |
| "router_clamp": null, | |
| "router_kld_reg": 0.0, | |
| "router_padding_coeff": null, | |
| "router_score_gating": "sigmoid", | |
| "router_zloss_coeff": 2.4643796217322647e-19, | |
| "running_stats_ema": 0.99, | |
| "running_stats_sync_freq": 100, | |
| "saved_thresholds_are_post_sigmoid": false, | |
| "sharding_strategy": "tp", | |
| "shuffle_before_assign": false, | |
| "shuffle_freq": 1, | |
| "shuffle_group_size": 32, | |
| "shuffle_level": 3, | |
| "shuffle_with_random_order": true, | |
| "shuffle_within_dp": true, | |
| "sigmoid_in_fp32": true, | |
| "skip_local_shuffle": false, | |
| "std_margin": 15, | |
| "std_margin_skip_last": true, | |
| "std_penalty_coeff": 0.0, | |
| "top_k": 1, | |
| "use_fixed_topk": false, | |
| "use_fixed_topk_bsz": 1, | |
| "use_fsdp": true, | |
| "use_shared_expert": true, | |
| "use_te_in_moe": false, | |
| "use_token_choice": true, | |
| "zero_clamp_grads": true, | |
| "zero_router_grads": false | |
| }, | |
| "ffn_dim_multiplier": 1.2, | |
| "ffn_exp": 4.0, | |
| "ffn_in_dropout": 0, | |
| "ffn_out_dropout": 0, | |
| "flex_score_mod": "", | |
| "fp8_amax_compute_algo": "max", | |
| "fp8_amax_history_len": 1024, | |
| "fp8_early_bf16_weight_release": false, | |
| "fp8_fuse_wgrad_accumulation": false, | |
| "fp8_grad_output_dynamic_scale": false, | |
| "fp8_input_dynamic_scale": false, | |
| "fp8_interval": 1, | |
| "fp8_margin": 0, | |
| "fp8_rowwise": false, | |
| "fp8_wgrad": false, | |
| "freeze_decoder": false, | |
| "freeze_patterns": null, | |
| "freeze_vision_encoder": false, | |
| "fsdp_checkpoint_wrap_layer_frequency": 1, | |
| "fsdpv1_flatten_params": true, | |
| "fsdpv2_cast_root_forward_inputs": false, | |
| "fsdpv2_cpu_offload_percentage": null, | |
| "fsdpv2_enable_cpu_offload": false, | |
| "fsdpv2_use_per_pg_streams": true, | |
| "fsdpv2_wrap_pp_model_chunk_only": false, | |
| "fuse_sequence_parallel": true, | |
| "global_attn_cfg": "all", | |
| "head_dim": null, | |
| "high_freq_factor": 1, | |
| "hsdp_replicate_num": 1, | |
| "init": { | |
| "coeff_std": null, | |
| "depth_last": false, | |
| "fixed_std": null, | |
| "no_init": false, | |
| "router_coeff_std": 0.1, | |
| "truncate_std_mult": 2.0, | |
| "use_depth": "current", | |
| "use_gaussian": true | |
| }, | |
| "layer_ckpt": "all", | |
| "lc_rope_len": 0, | |
| "lc_rope_prob": 0.0, | |
| "less_layer_first_pp_stage": 0, | |
| "less_layer_last_pp_stage": 0, | |
| "local_attention_window_len": null, | |
| "loss_parallel": true, | |
| "max_length": 2048, | |
| "metap": { | |
| "base_width": 1024.0, | |
| "coeff_std": 1.0, | |
| "m_emb": 1.0, | |
| "metap_mode": "ntp", | |
| "tie_router_bulk_coeff_std": false, | |
| "use_metap": false | |
| }, | |
| "modalities": { | |
| "freeze_llm": false, | |
| "image": { | |
| "enable_projection": true, | |
| "encoder_name": "llama4_flash_encoder", | |
| "encoder_params": null, | |
| "freeze_vision_encoder": true, | |
| "image_height": 336, | |
| "image_width": 336, | |
| "patch_height": 14, | |
| "patch_width": 14, | |
| "ps_ratio": 0.5, | |
| "recompute_transformer": true, | |
| "return_intermediate": null, | |
| "use_cached_embeddings": false, | |
| "use_dynamic_transform": true, | |
| "vision_adapter_type": "pixel_shuffle_mlp", | |
| "vision_encoder_ckpt_path": "/mnt/wsfuse/nextgen_mm/vision_encoders/llama4_flash_encoder_final_1023_ema", | |
| "vision_encoding_batch_size": null | |
| }, | |
| "speech": { | |
| "append_quantization_output": false, | |
| "data_format_args": { | |
| "disallow_text_free_seg": true, | |
| "emit_text_right_after_sys_start": true, | |
| "enable_speech_text_hybrid": false, | |
| "hybrid_generation_mode": "single_token_emit", | |
| "hybrid_understanding_mode": "streaming", | |
| "jitter_system_prompt": false, | |
| "jitter_system_prompt_today_date": false, | |
| "num_words_in_unit": 1, | |
| "speech_delay": 1, | |
| "system_text_lookahead": 0, | |
| "tool_token_delay_ms_max": 100, | |
| "tool_token_delay_ms_min": 0, | |
| "transfer_dates_to_template": false, | |
| "turn_start_with_white_space": false, | |
| "user_text_delay": 6 | |
| }, | |
| "discrete_codebooks_size": 65536, | |
| "enable_aux_user_output": false, | |
| "enable_full_duplex": true, | |
| "enable_output": true, | |
| "encoder_device": "cpu", | |
| "freeze_speech_encoder": true, | |
| "is_tokenizer": true, | |
| "load_tokenizer": true, | |
| "share_speech_emb": false, | |
| "speech_encoder": "v2_2411", | |
| "speech_encoder_ckpt_dir": null, | |
| "speech_extend_vocab_size": 256, | |
| "speech_feature_dim": 640, | |
| "speech_output_control_format": "", | |
| "speech_projection_dim": 1536, | |
| "speech_separate_softmax": true, | |
| "speech_train_audio_end": false, | |
| "speech_train_audio_start": false, | |
| "target_speaker_table_size": 0, | |
| "use_discrete_codes": true, | |
| "use_embedding": true, | |
| "use_fp32_for_speech_output": true, | |
| "use_fp64": true, | |
| "use_projection": false, | |
| "user_embedding_by_concat": false, | |
| "user_embedding_by_permutation": true, | |
| "user_projection_use_mlp": false | |
| }, | |
| "use_image": false, | |
| "use_speech": false, | |
| "use_video": false | |
| }, | |
| "multiple_of": 2048, | |
| "n_heads": 40, | |
| "n_kv_heads": 8, | |
| "n_layers": 3, | |
| "non_linearity": "swiglu", | |
| "nope_no_qk_norm": true, | |
| "norm_affine": true, | |
| "norm_eps": 1e-05, | |
| "norm_type": "rmsnorm", | |
| "num_unfrozen_layers": 0, | |
| "output_size": 202048, | |
| "parallel_decoding": { | |
| "enable_fc_parallelism": true, | |
| "fc_with_bias": false, | |
| "first_and_last_norm_required": true, | |
| "has_parallel_decoding": true, | |
| "parallel_decoding_type": "EAGLE", | |
| "share_input_output_embed_with_target": true | |
| }, | |
| "parallel_output_norm": true, | |
| "peft_args": null, | |
| "pp_use_tensor_pool": false, | |
| "pre_norm": true, | |
| "prefetch_weight_latency": 1.0, | |
| "qat_args": null, | |
| "qk_norm_across_heads": false, | |
| "qk_norm_affine": false, | |
| "recompute_attn": false, | |
| "recompute_fc1_out": true, | |
| "recompute_fc3_out": true, | |
| "recompute_q_norm": false, | |
| "rope_attn_scale": false, | |
| "rope_scale_factor": 16, | |
| "rope_theta": 500000.0, | |
| "rope_use_fp32_in_outer_product": true, | |
| "sequence_parallel": true, | |
| "share_emb": false, | |
| "stochastic_depth_p_attn": 0, | |
| "stochastic_depth_p_ffn": 0, | |
| "te_use_fsdp_mixed_precision": true, | |
| "use_flex_attn": false, | |
| "use_fp8": false, | |
| "use_qk_norm": true, | |
| "use_rope": true, | |
| "use_scaled_rope": true, | |
| "use_te_layers": true, | |
| "vocab_parallel": true, | |
| "vocab_size": 202048 | |
| }, | |
| "model_parallel_size": 8, | |
| "model_precheck": false, | |
| "nan_detector_steps": 0, | |
| "no_final_ckpt": false, | |
| "num_layers_per_virtual_pipeline_stage": null, | |
| "num_microbatches_with_partial_activation_checkpoints": 1, | |
| "number_of_manifold_servers_per_host": 8, | |
| "old_mp": -1, | |
| "old_world_size": -1, | |
| "optim": { | |
| "annealing_step": 10000, | |
| "beta1": 0.9, | |
| "beta2": 0.95, | |
| "clip": 1.0, | |
| "cosine_theta": 1.0, | |
| "cycle_length": 1.0, | |
| "decay_length_fraction": 0.1, | |
| "epsilon": 1e-08, | |
| "exp_factor": 0.5, | |
| "fused": null, | |
| "grad_accumulate_steps": 1, | |
| "independent_weight_decay": false, | |
| "lr": 0.0002, | |
| "lr_min_ratio": 0.1, | |
| "modality_order": "text,vision,speech,speech_full_duplex", | |
| "non_nope_lr_mult": null, | |
| "nope_lr_mult": null, | |
| "scheduler": "constant", | |
| "start_annealing_step": -1, | |
| "use_fp32_copy_optim": true, | |
| "vision_encoder_lr": null, | |
| "vision_projection_lr": null, | |
| "warmup": 400, | |
| "weight_decay": 0.1 | |
| }, | |
| "optimize_backward_concat": false, | |
| "overlap_p2p_communication": true, | |
| "paft": { | |
| "all_reduce_timeout_grow_ms": 300000, | |
| "all_reduce_timeout_ms": 60000, | |
| "ctran_port_base": 18700, | |
| "enable": false, | |
| "ib_exchange_port_base": 18600, | |
| "max_quorum_num_retries": 5, | |
| "max_step_retries": 5, | |
| "min_replicas_to_run": null, | |
| "qp_connect_timeout_ms": null, | |
| "replica_collective_timeout_s": 600, | |
| "send_recv_timeout_ms": 5000, | |
| "startup_sleep_ms": 10000, | |
| "test_only_barrier_timeout_s": 180, | |
| "test_only_skip_ftar": false | |
| }, | |
| "periodic_gpu_check": false, | |
| "pg_tuning_options_from_yaml": "", | |
| "pipeline_parallel_microbatch_size": 1, | |
| "pipeline_parallel_size": 1, | |
| "pipeline_strategy": "dora", | |
| "power_consumer": { | |
| "enable": false, | |
| "run_delay_steps": 0, | |
| "run_duration_steps": 100, | |
| "run_freq": 1000, | |
| "run_mode": "periodic" | |
| }, | |
| "pp_num_warm_up_microbatch_ratio": 1.0, | |
| "profile_acc_events": false, | |
| "profile_barrier_timeout_s": 0, | |
| "profile_freq": -1, | |
| "profile_num_steps_active": 1, | |
| "profile_record_shapes": true, | |
| "profile_with_stack": false, | |
| "py_spy_args": { | |
| "active_seconds": 600, | |
| "format": "flamegraph", | |
| "freq": -1, | |
| "rank0_only": true, | |
| "rate": 50, | |
| "start_offset": 10 | |
| }, | |
| "recompute_all_mb": false, | |
| "reshard_after_forward": true, | |
| "restore_dataloader_position": false, | |
| "root_dump_dir": "/mnt/wsfuse/outputs/xldumps", | |
| "runtime_nccl_timeout_s": 600, | |
| "sample_across_datasets": true, | |
| "seq_len": 8192, | |
| "skip_evals_during_training": true, | |
| "slurm": { | |
| "global_rank": 0, | |
| "is_slurm_job": false, | |
| "role_index": 0, | |
| "role_rank": 0, | |
| "role_replica_count": 1, | |
| "role_world_size": 256, | |
| "world_size": 256 | |
| }, | |
| "speech_loss": { | |
| "aux_aligned_text_loss": false, | |
| "aux_user_loss_weight": 0.9, | |
| "dual_channel_aux_user_loss_weight": 0.9, | |
| "enable": false, | |
| "force_simulated_sys_loss": true, | |
| "full_duplex_dual_loss_mode": "sample", | |
| "kind": "separate_softmax", | |
| "log_logits": false, | |
| "maybe_tool_token_loss_weight": null, | |
| "perfect_silence_id": null, | |
| "speech_loss_weight": null, | |
| "system_floors_weight": null, | |
| "system_perfect_silence_weight": 0.0, | |
| "system_text_escape_audio_weight": null, | |
| "user_text_escape_audio_weight": null | |
| }, | |
| "steps": 1050000, | |
| "text_only_steps": null, | |
| "tokenizer": { | |
| "path": "/mnt/wsfuse/tokenizers/tiktoken/l4_200k_base", | |
| "version": "llama4_tiktoken_v6" | |
| }, | |
| "tokenizer_dir": "/mnt/wsfuse/tokenizers/tiktoken", | |
| "torch_seed": 0, | |
| "unlimited_steps": false, | |
| "use_sum_loss": false, | |
| "valid": { | |
| "batch_size": 32, | |
| "debug": false, | |
| "majority_voting": 0, | |
| "n_batches": 100, | |
| "ppl_files_str": "", | |
| "prompt_path": "", | |
| "random_fewshots": false, | |
| "seed": 42, | |
| "seq_len": 2048, | |
| "skip_sanity_check": false, | |
| "tasks_root_dir": "", | |
| "tasks_str": "", | |
| "temperature": 0.0, | |
| "top_k": 0, | |
| "top_p": 0.0, | |
| "use_sampling": false, | |
| "write_eval": false | |
| }, | |
| "z_loss_multiplier": 0.0 | |
| } |