morgendave's picture
Upload folder using huggingface_hub
58ba52e verified
{
"aggregate_gradients_by_tokens": true,
"alternate_pp_config": true,
"async_batch_iterator": false,
"async_batch_iterator_timeout_s": 600,
"async_checkpointing": true,
"async_eval_ngpus": -1,
"attach_debugpy": false,
"background_nccl_init": false,
"batch_p2p_communication": false,
"batch_size": 2,
"cached_file_unique_prefix": "",
"checkpoint": {
"async_checkpointing_staging_method": "async_copy_async_serialize",
"barrier_timeout_secs": 120,
"checkpoint_barrier_type": "sc",
"checkpoint_gc_use_rmdir": true,
"checkpoint_groups": false,
"checkpoint_server_max_attempts": 10,
"checkpoint_server_num_chunks": 10,
"checkpoint_server_num_threads": 20,
"checkpoint_server_op_timeout_secs": 10.0,
"checkpoint_server_threads": 10,
"checkpoint_server_timeout_secs": 60.0,
"dump_freq_ephemeral": -1,
"eager_init_staging_buffer": false,
"live_checkpointing": false,
"on_demand_checkpointing": false,
"sleep_interval": 10,
"staging_block_every_n_tensors": -1,
"timeout_all_shard_exists": 300,
"timeout_barrier_init_secs": 300,
"timeout_execution": 1800,
"timeout_folder_exists": 300,
"timeout_process_init_secs": 60,
"use_checkpoint_barrier_tcpstore_libuv": true,
"use_checkpoint_barrier_wait_for_all_files": true,
"use_checkpoint_barrier_wait_for_dir": false,
"use_checkpointing_process": true,
"use_shm_manager_for_async_cp": false,
"wait_for_tensor_timeout_s": 120
},
"checkpoint_dump_dir": "/mnt/wsfuse/outputs/TI-draft-17bx16MoE-N_3-grtb7hxqrsf75c",
"collect_et": false,
"context_parallel_size": 1,
"data": "",
"dataloader": {
"always_trim_text": true,
"concurrency_timeout_s": 300,
"concurrent": false,
"datamix": "",
"enable_packing": true,
"image": {
"image_height": 336,
"image_width": 336,
"max_num_chunks": 16,
"patch_height": 14,
"patch_width": 14,
"ps_ratio": 0.5,
"resize_to_max_canvas": false,
"suppress_dataloader_errors": false,
"use_dynamic_transform": true,
"use_pixel_shuffle": true,
"use_tile_separator_tags": true
},
"load_only_tp_zero": false,
"logging_config": {
"log_buffer_size": false,
"log_every_n_steps": 10,
"log_first_batch": false,
"log_full_dataloader_state": false,
"log_metadata": false
},
"max_world_size": null,
"mix_mode": {},
"modality_datamix": {
"image": "/mnt/wsfuse/nikhilmeht/0321/sft/lichengyu/datamixes/llama4/multi_image/round3_sft_mim1_cleaned.json:0.0582033824,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_stem_math_50p_pruned_cleaned.json:0.1513287942,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_chart_50p_pruned_mitigated_blurred_cleaned.json:0.0753349440,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_natural_image_50p_pruned_mitigated_blurred_cleaned.json:0.0753349440,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_stem_non_math_50p_pruned_mitigated_blurred_cleaned.json:0.0527124973,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_diagram_50p_pruned_cleaned.json:0.0169119262,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_table_50p_pruned_cleaned.json:0.0672084340,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_doc_50p_pruned_cleaned.json:0.0419503624,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_structured_infographics_50p_pruned_mitigated_blurred_cleaned.json:0.1172853064,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_others_50p_pruned_cleaned.json:0.0597408302,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_multilingual_50p_pruned_mitigated_blurred_cleaned.json:0.0839007248,/mnt/wsfuse/nikhilmeht/0321/sft/tarun05/l4_sft_r5_datamix/jsons/128e_randomSample/l4_sft_r4_coding_50p_pruned_cleaned.json:0.0002196354,/mnt/wsfuse/nikhilmeht/0321/sft/trangleminh/datamixes/metaai_diverse_vr7500_frr29k_cse3k_pri2700_cleaned.json:0.0224028113,/mnt/wsfuse/nikhilmeht/0321/sft/szha/data_mix/llama4_video_3p_20250211_cleaned.json:0.0296507797,/mnt/wsfuse/nikhilmeht/0321/sft/nextgen_mm/datasets/zhouxy/datamix/refcoco50kx2resizex1_multbox10k_augprompt_vgblurfix10k_o365dedup3f80k_cleaned.json:0.0912836009,/mnt/wsfuse/nikhilmeht/0321/sft/pengchuanzhang/l4_sft_r4_datamix/jsons/grounding/vcr_distill_val_new_cleaned.json:0.0382832843,meta_ai_i18n_syn_gen_image_sft_v2_03_08_allcountry:0.0098835932,gen_ai_mmllm_llama4_video_1p_sft_blurred_fb_ig_v6_no_audio_reencoded_32frames_20250213_mitigated_20250307:0.0665495278,meta_ai_image_sft_dataflywheel_train_hw_sythetic_data_v2_10k:0.0048319789,gen_ai_mmllm_llama4_multi_image_1p_sft_ig_v5_mitigated:0.0665495278",
"speech": "",
"text": "/mnt/wsfuse/users/ashish/yonder3_m20/sag/sft:0.0010000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/yonder2_planner_v5:0.0016000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/yonder2_3_simple_qa_cw_search_trigger_filter:0.0000050000,/mnt/wsfuse/users/ashish/yonder3_m20/agentic_search/yonder2_3_multi_turn_from_mase_no_fdd:0.0000500000,/mnt/wsfuse/users/ashish/yonder3_m20/agentic_search/yonder2_3_sag:0.0004000000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/mlt_search_triggering:0.0001013947,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r1/agentic_search/mlt_no_search_triggering:0.0001783212,/mnt/wsfuse/zihangm/files_from_gsheet/general_helpfulness_english/biography_questions:0.0000300000,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_legal_issues:0.0000119361,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_medical_issues:0.0000119361,/mnt/wsfuse/users/ashish/yonder3/general_helpfulness_english/qrs_wellbeing_issues_social_media:0.0000119361,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/hard_legal_mcq_w_reasoning_sft:0.0053461996,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/autoif_filtered_prompt_v1_v2_filtered_pass_75_format_filter_valid_link_fix_code:0.0036983832,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/surge_precise_if_critic_rewrite_iter1_perfect_frr_tone_filter_format_clean_chunk:0.0446290047,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/general_helpfulness_english/surge_precise_if_critic_rlhf6pt5_rs_perfect_frr_tone_filter_format_clean_chunk:0.0764934234,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/scale_weak_areas:0.0006326655,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/knn_mitigation_1shot_v1:0.0177627691,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_original_perfect_response_format_clean_hard_chunked:0.0055737439,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_original_perfect_response_format_clean_medium_chunked:0.0710798804,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_sys_prompt_rewrite_iter1_format_clean_hard_chunked:0.0040405952,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_helpful_if_critic_format_sys_prompt_rewrite_iter1_format_clean_medium_chunked:0.0345643785,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R2/general_helpfulness_english/surge_general_steerability_2024_train_chunk:0.0005767622,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT3R3/general_helpfulness_english/table_yonder_oss_helpfulness_syngen_preachy_tone_sft_partition_is_partition_eq_true:0.0002796436,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/surge_factuality:0.0100271872,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/synthetic_factuality:0.0202778248,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/factuality/manual_factuality:0.0010659686,/mnt/wsfuse/users/ashish/yonder3_m20/factuality/factual_rm_scored_dataset_eq_prod_dataset_v4_w_sys_v2:0.0017008537,/mnt/wsfuse/users/ashish/yonder3_m20/factuality/factual_rm_scored_dataset_eq_dpo_prompt_combined_deduped_w_sys_v2:0.0021703659,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_medium:0.0097702019,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_hard:0.0067510071,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/Yonder3_keywords_jittering/YonderTrains/12M/LT2R3/lmsys/lmsys_chat:0.0334625173,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/surge_reasoning:0.0030568351,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/round3_3p_decontaminated_rlhf6_mcq_rscot_50_cjka_fix_nomath_nobio:0.0535004838,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/up_synthetic_verbal_reasoning_405_highq185_few_shot_000:0.0047810467,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/legal_mbe_bar_few_shot_000_format_v3:0.0001168333,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/cpa_few_shot_000_cot_000_format_final_v3:0.0000758074,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/moral_decontaminate:0.0002203347,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/lovish_round6_v2_format_clean:0.0003277076,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/ctg_clean_downsampled0_5_mcq_no_geeks_v3:0.0020298452,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_aime_esbs_100_0_75_M30_SCORED_DECONTAM:0.0002617977,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_harp_esbs_100_0_75_M30_SCORED_DECONTAM:0.0024037928,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_math_train_esbs_100_0_75_M30_SCORED_DECONTAM:0.0101189715,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_omni-math_esbs_100_0_75_M30_SCORED_DECONTAM:0.0014753711,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v1_esbs_100_0_75_M30_SCORED_DECONTAM:0.0152844564,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v2_esbs_100_0_75_M30_SCORED_DECONTAM:0.0174577049,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v3_esbs_100_0_75_M30_SCORED_DECONTAM:0.0096021202,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data-v2___70b_robert___t=1.2___top_p=1.0___min_p=0.03___top_k=30___250106_scale-v4_esbs_100_0_75_M30_SCORED_DECONTAM:0.0072409144,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_aime_N20_VERIFIED_SCORED_DECONTAM:0.0001247012,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_aops_N20_VERIFIED_SCORED_DECONTAM:0.0044268003,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_harp_N20_VERIFIED_SCORED_DECONTAM:0.0014522667,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_math_N20_VERIFIED_SCORED_DECONTAM:0.0067446853,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_omni_math_N20_VERIFIED_SCORED_DECONTAM:0.0007385598,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v1_N20_VERIFIED_SCORED_DECONTAM:0.0108277141,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v2_N20_VERIFIED_SCORED_DECONTAM:0.0131738396,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v3_N20_VERIFIED_SCORED_DECONTAM:0.0074489788,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/___mnt___wsfuse___lovish___fennel___post-training___sft-data___viktorkerkez___arpg___unclustered___n20___250105_scale_v4_N20_VERIFIED_SCORED_DECONTAM:0.0045074783,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/250114_r1ab_data:0.0013468764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/brainly_arpg_weak_area_mcq_mitigated_final:0.0011736716,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/reasoning_sft_mcq_final_sbs:0.0017399476,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/250106_r1_data_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0072726361,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/careers360_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0080841308,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderPrune/sft_data/hierarchal_clustering_LT3R3_reasoning/sft_dataset_12M_d100_f0_m_rmlt3r2_dmix_l77_sel_on_rewardsf_lrc_2_code_patch_tool_clean_sf_15300/LT3R3_reasoning/reasoning/afanti_40k_M10_RM=oprm-8nodes-lw6wdvf1clmv1c_shuffle=True:0.0127163728,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sql_sft:0.0012350136,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/scale_code_chunk:0.0000529914,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/coding/sft_data_surge_tree_sitter_top_3:0.0702169837,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sft_data_scale_tree_sitter_top_3:0.0086631491,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/turing_tree_sitter_top_3:0.0003084886,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_synthetic_code:0.0430912862,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/suchin_synthetic_code:0.0186100313,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_synthetic_javascript:0.0096131839,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sten_surge_coding_with_exec_mulitpl_synth_240613_v2_format_clean:0.0039608666,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/olivier_coding_synthetic_stackoverflow_inspired_samll_multipl_translation_v2_format_clean:0.0119571667,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/sten_coding_generated_problem_stack_overflow_L3_405B_self_healing_principled_tests_v8_good_v2_format_clean:0.0074501629,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_filtered_inductive_reasoning:0.0009037746,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_275k_cruxeval_output:0.0009181477,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/iopairs_snippets_275k_cruxeval_input:0.0005912409,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/coding/debug_v5:0.0002433103,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/generated_rl_taco_merged_v2_nostep:0.0389798015,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/sujan-model:0.0173581110,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/shiqi-model:0.0173581110,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_easy:0.0059567374,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_medium:0.0069906931,/mnt/wsfuse/zihangm/Yonder3_jittering/coding/code_taco_hard:0.0062303026,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/multilingual_r6_5_mix_rm_mathv1:0.0467876285,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/surge_sft_hindi_romanized:0.0001327595,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/scale_sft_final_format_clean_chunk:0.0014430382,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/multilingual/bio:0.0007215191,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT3R2/multilingual/new_rs_souped_multilinugal_critic_rewrite_data_format_clean:0.0056681995,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/rus_v7:0.0063918702,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/multilingual/zho_v5:0.0065836263,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/Yonder2_r2/multilingual/multi_if_sft_data:0.0021400000,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/tool_sft_fixlc_false_positives:0.0052821011,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/sft_stack_new:0.0091056645,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/12M/LT2R3/tooling/surge_simple_format_clean_chunk:0.0026361677,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/surge_complex_format_clean_chunk:0.0038867533,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/synthetic_format_clean_chunk:0.0008346811,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/apibank_new:0.0002613764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/plugins_new:0.0005888150,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/scale_tools:0.0000051701,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/ablate_system_prompt_gorilla_format_clean:0.0000063190,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/apibank_json_new_ipython_fix:0.0002613764,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/plugins_json_new_ipython_fix:0.0005888150,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/stack_json_fixed_new_v2:0.0091056645,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/swapping_functions_in_two_prompt:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/removing_params_from_ground_truth_function:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/missing_function_in_prompt:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/removing_one_required_param_from_ground_truth_function:0.0016674786,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/int:0.0001667479,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/str:0.0001667479,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/skipping_default_params_v1:0.0001417321,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/default_param_model_gen_data_v2:0.0020826816,/mnt/wsfuse/zihangm/Yonder3_jittering/YonderTrains/12M/LT2R3/tooling/tau_oct26:0.0000035651,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/vr_pdo_p1_lang_0702_chunk:0.0000174005,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tool_safety_sft_vr_image_gen_v3_500_format_clean_chunk:0.0000259024,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tool_safety_multi_turn_sft_vr_web_search_format_clean_chunk:0.0000908000,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/tone_edited_refusal_suppression_chunk:0.0002535260,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/table_yonder_safety_syngen_preachy_tone_sft_partition_is_partition_eq_true:0.0000510113,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_safety_tools_1k_chunk:0.0000393920,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_safety_prompt_pair_bt_pilot_chunk:0.0000677883,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_mlg_pdo_translation_chunk:0.0002083527,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_mlg_data_format_clean_chunk:0.0007095666,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_borderline_tools_chunk:0.0000532217,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/surge_borderline:0.0013781542,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/scale_pair_prompt_0625_chunk:0.0046856109,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/sc_redteam_format_clean_chunk:0.0000061214,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_teaming_mutated_refusal_lcx_v2_final_chunk:0.0000038542,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_teaming_mtl_mlg_v2_chunk:0.0001079172,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_syn_v3_en_0823_vr_final_chunk:0.0000238053,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/red_syn_v2_en_0823_vr_final_chunk:0.0000569059,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/multilingual_safety_sft_reannotated_mtl_es_vi_hi_v1_format_clean_chunk:0.0000168904,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/mh_crsv2_xi_052424_mix_3_chunk:0.0003656941,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/l3p_ssh_cse_multi_chunk:0.0003633702,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/halo_cse_cleaned_chunk:0.0000357079,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/halo_cse_813147913798717_chunk:0.0004524132,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/frr_pdo_p1_lang_0702_chunk:0.0000209146,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/election_can_response_format_clean_chunk:0.0000266959,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cybersec_sft_vr_mitre_format_clean_chunk:0.0000507279,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cybersec_sft_frr_mitre_v2_format_clean_chunk:0.0000592864,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/crs_t0_t1_mtl_synthesized_9k_chunk:0.0004737812,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/crs_t0_t1_en_synthesized_vr_4k_chunk:0.0000683551,/mnt/wsfuse/users/ashish/yonder3/y3m20_keyword_filtered/safety/crs_t0_t1_en_synthesized_frr_6k_chunk:0.0003408119,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/covalent_cse_mlg_adversarial_chunk:0.0003261887,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/cbrne_sft_format_clean_chunk:0.0001082005,/mnt/wsfuse/ankit61/safety_yonder1_map/Yonder3/safety/borderline_rs_strict_frr_mix5_chunk:0.0009482426,/mnt/wsfuse/parvin7/keyword_filtering_y3v2_with_cutoff_dataset_output_v3/kevinyao/proposal/sft/new_qrs_format_cleaned_v4:0.0005078454",
"video": ""
},
"pad_mode": {
"pad_value": 0,
"seq_len": 8192
},
"pin_memory": true,
"prefetch_factor": null,
"progress_reporter_log_interval": 0,
"rng_mode": {},
"shuffle_seed": 1337,
"simulate_training_budget": null,
"speech": null,
"split_mode": {
"discard_text_only": true,
"keep_interval": 2,
"keep_strategy": "put_back"
},
"tail_token_mode": {},
"video": {
"decoder_type": "decord",
"max_num_chunks": null,
"max_video_length": -1,
"min_video_length": -1,
"num_frames_per_video": 32,
"resize_to_max_canvas": false,
"sampling_fps": null,
"use_dynamic_transform": false,
"use_ffmpeg_key_frames": false,
"use_fps": false,
"use_tile_separator_tags": false
},
"weights_update_config": {
"image_loss_weight": 0.75,
"image_weight_schedule": [
[
0,
0.15
],
[
10000,
0.5
]
],
"speech_weight_schedule": null,
"text_loss_weight": null,
"text_weight_schedule": [
[
0,
1.0
],
[
10000,
0.5
]
],
"video_loss_weight": null,
"video_weight_schedule": null
},
"workers_per_gpu": 1
},
"dataset_iteration_limits": null,
"deallocate_pipeline_outputs": false,
"disable_logging": false,
"disable_workers_print": false,
"dtype": "bf16",
"dummy_nccl_init": true,
"dump_dir": "/mnt/wsfuse/outputs/TI-draft-17bx16MoE-N_3-grtb7hxqrsf75c",
"dump_dir_tree_type": "sharded",
"dump_freq": 50,
"dump_profile_traces": true,
"eager_init": true,
"enable_anomaly_detection": false,
"enable_deterministic_training": false,
"enable_loss_tracker": true,
"enable_ods": true,
"enable_pynvml": false,
"et_end_itr": 15,
"et_start_itr": 12,
"eval_freq": -1,
"exp_id": "",
"exp_name": "",
"expert_parallel_size": 1,
"finetuning_dir": "",
"fp32_reduce_scatter": "all",
"gc_collect_freq": 1000,
"gpu_check_level": -1,
"increase_seq": null,
"instruct": {
"no_loss_prompt": false,
"no_loss_truncated": false
},
"instruct_data": "",
"iter_jsonl": {
"buffer_size": 64,
"same_data": false
},
"iter_multi": {
"buffer_size": 64,
"ignore_extra_chunks": true,
"iterate_chunk_by_chunk": false,
"max_precompute": 20,
"multiprocess": true
},
"iter_text_airstore": {
"airstore_max_holding_bundles_limit": 10000,
"airstore_max_resharding_factor": 128,
"airstore_sample_prefetch_limit": 10,
"airstore_seed": 727,
"dataloader_workers_per_gpu": 1,
"load_only_pp_zero": false,
"load_only_tp_zero": false,
"max_world_size": 8192,
"pin_memory": true,
"prefetch_factor": 2,
"simulate_training_budget": null,
"unique_token_fraction": null
},
"iter_type": "multi",
"keep_eval_checkpoints": false,
"keep_n_last_checkpoints": -1,
"load_optimizer_on_finetuning": false,
"log": {
"disable_scalars_tb_write": false,
"log_loss_tracker_to_scuba": false,
"log_scalar_default_log_level": "INFO",
"log_scalar_freq": 100,
"log_scalar_freq_overrides": "fp8:1000, router/modality:1000, router/dataset:1000, params:0,act:0,grads:0,grads_fsdpv2:0,debug:0,verbose_debug:0",
"log_scalar_log_level_overrides": "",
"log_scalar_version": 2.0,
"log_scalars": false,
"log_scalars_to_ods": false,
"log_scalars_to_scuba": false,
"log_tb": true,
"log_tensors": false,
"log_tensors_to_scuba": false,
"online_wandb": false,
"online_wandb_project": null,
"online_wandb_team": null,
"reduce_scalars": false
},
"log_all_steps": true,
"log_batch_checksum": true,
"log_dataloader_state": false,
"log_freq": 1,
"log_position_in_data_queue": true,
"log_updates": true,
"logitwriter": {
"compression_algo": "zstd",
"enable": false,
"index_dtype": "int32",
"logit_dtype": "float32",
"same_day_logits_backup": false,
"speech_topk": 100,
"topk": 128,
"write_lse": true
},
"loss_logging_freq": 10,
"loss_rescaling": false,
"max_image_tiles_per_gpu": 2000,
"mb_recompute_attn": true,
"mb_recompute_fc1_fc3": true,
"mem_snapshot_max_entries": 100000,
"mem_snapshot_profiling_duration": -1,
"mem_snapshot_start_step": -1,
"mem_snapshot_stop_step": -1,
"memory_efficient_pipeline": false,
"model": {
"alpha_depth": "disabled",
"alpha_lrm": 1.0,
"alpha_on_resid": false,
"alpha_separate": false,
"alpha_wdm": 1.0,
"attn_bias_type": "block_causal",
"attn_dropout": 0,
"attn_out_dropout": 0,
"attn_temperature_tuning_floor_scale": null,
"attn_temperature_tuning_layers": null,
"attn_temperature_tuning_q_scale_constant": null,
"attn_to_keep": "all",
"batchify_local_attention_len": null,
"cp_attn_perdoc": false,
"cp_attn_save_global_kv": true,
"custom_bwd": false,
"custom_bwd_sum_first_then_comms": true,
"dialog_len": null,
"dim": 5120,
"efficient_attn": "auto",
"efficient_output": false,
"enable_fsdpv2": true,
"enable_tp_overlapping": false,
"enable_weight_sharding_in_pp": false,
"enable_wgrad_sharding_in_pp": true,
"eos_id": 200001,
"every_n_layers_nope": null,
"experts_choice_moe": {
"auto_scale_F": true,
"capacity_factor": 1.0,
"clamp_above_std": false,
"compute_moe_in_fp64": false,
"drop_and_pad": false,
"enable_lb_free": false,
"enable_lb_loss": false,
"enable_router_zloss": false,
"eval_threshold_std_mult": 0.0,
"eval_with_expert_activation_model": false,
"eval_with_saved_stats": true,
"eval_with_top_k": false,
"expert_act_grad_prop_coeff": 0,
"expert_act_init_std": 0.5,
"expert_act_loss_coeff": 0.0001,
"expert_act_silu": false,
"expert_act_threshold": 0,
"expert_activation_model": false,
"fc1_clamp": null,
"fc2_clamp": null,
"fc3_clamp": null,
"fix_datasource_router_score": "",
"fix_image_router_score": null,
"fix_speech_router_score": null,
"force_looped_impl": false,
"fused_shuffle": true,
"input_scaling": false,
"input_scaling_max_clamp": 2.0,
"input_scaling_min_clamp": -2.0,
"interleave_moe_layer_step": 1,
"is_enabled": false,
"lb_free_coeff": 0.0,
"lb_loss_coeff": null,
"max_experts_per_token": null,
"moe_init_scale": 1.0,
"mult_moe_weight_grads": null,
"norm_expert_output": null,
"num_experts": 16,
"overlap_token_comm": true,
"postgate_experts": false,
"recompute_capacity_factor": null,
"routed_dropout": 0.0,
"router_clamp": null,
"router_kld_reg": 0.0,
"router_padding_coeff": null,
"router_score_gating": "sigmoid",
"router_zloss_coeff": 2.4643796217322647e-19,
"running_stats_ema": 0.99,
"running_stats_sync_freq": 100,
"saved_thresholds_are_post_sigmoid": false,
"sharding_strategy": "tp",
"shuffle_before_assign": false,
"shuffle_freq": 1,
"shuffle_group_size": 32,
"shuffle_level": 3,
"shuffle_with_random_order": true,
"shuffle_within_dp": true,
"sigmoid_in_fp32": true,
"skip_local_shuffle": false,
"std_margin": 15,
"std_margin_skip_last": true,
"std_penalty_coeff": 0.0,
"top_k": 1,
"use_fixed_topk": false,
"use_fixed_topk_bsz": 1,
"use_fsdp": true,
"use_shared_expert": true,
"use_te_in_moe": false,
"use_token_choice": true,
"zero_clamp_grads": true,
"zero_router_grads": false
},
"ffn_dim_multiplier": 1.2,
"ffn_exp": 4.0,
"ffn_in_dropout": 0,
"ffn_out_dropout": 0,
"flex_score_mod": "",
"fp8_amax_compute_algo": "max",
"fp8_amax_history_len": 1024,
"fp8_early_bf16_weight_release": false,
"fp8_fuse_wgrad_accumulation": false,
"fp8_grad_output_dynamic_scale": false,
"fp8_input_dynamic_scale": false,
"fp8_interval": 1,
"fp8_margin": 0,
"fp8_rowwise": false,
"fp8_wgrad": false,
"freeze_decoder": false,
"freeze_patterns": null,
"freeze_vision_encoder": false,
"fsdp_checkpoint_wrap_layer_frequency": 1,
"fsdpv1_flatten_params": true,
"fsdpv2_cast_root_forward_inputs": false,
"fsdpv2_cpu_offload_percentage": null,
"fsdpv2_enable_cpu_offload": false,
"fsdpv2_use_per_pg_streams": true,
"fsdpv2_wrap_pp_model_chunk_only": false,
"fuse_sequence_parallel": true,
"global_attn_cfg": "all",
"head_dim": null,
"high_freq_factor": 1,
"hsdp_replicate_num": 1,
"init": {
"coeff_std": null,
"depth_last": false,
"fixed_std": null,
"no_init": false,
"router_coeff_std": 0.1,
"truncate_std_mult": 2.0,
"use_depth": "current",
"use_gaussian": true
},
"layer_ckpt": "all",
"lc_rope_len": 0,
"lc_rope_prob": 0.0,
"less_layer_first_pp_stage": 0,
"less_layer_last_pp_stage": 0,
"local_attention_window_len": null,
"loss_parallel": true,
"max_length": 2048,
"metap": {
"base_width": 1024.0,
"coeff_std": 1.0,
"m_emb": 1.0,
"metap_mode": "ntp",
"tie_router_bulk_coeff_std": false,
"use_metap": false
},
"modalities": {
"freeze_llm": false,
"image": {
"enable_projection": true,
"encoder_name": "llama4_flash_encoder",
"encoder_params": null,
"freeze_vision_encoder": true,
"image_height": 336,
"image_width": 336,
"patch_height": 14,
"patch_width": 14,
"ps_ratio": 0.5,
"recompute_transformer": true,
"return_intermediate": null,
"use_cached_embeddings": false,
"use_dynamic_transform": true,
"vision_adapter_type": "pixel_shuffle_mlp",
"vision_encoder_ckpt_path": "/mnt/wsfuse/nextgen_mm/vision_encoders/llama4_flash_encoder_final_1023_ema",
"vision_encoding_batch_size": null
},
"speech": {
"append_quantization_output": false,
"data_format_args": {
"disallow_text_free_seg": true,
"emit_text_right_after_sys_start": true,
"enable_speech_text_hybrid": false,
"hybrid_generation_mode": "single_token_emit",
"hybrid_understanding_mode": "streaming",
"jitter_system_prompt": false,
"jitter_system_prompt_today_date": false,
"num_words_in_unit": 1,
"speech_delay": 1,
"system_text_lookahead": 0,
"tool_token_delay_ms_max": 100,
"tool_token_delay_ms_min": 0,
"transfer_dates_to_template": false,
"turn_start_with_white_space": false,
"user_text_delay": 6
},
"discrete_codebooks_size": 65536,
"enable_aux_user_output": false,
"enable_full_duplex": true,
"enable_output": true,
"encoder_device": "cpu",
"freeze_speech_encoder": true,
"is_tokenizer": true,
"load_tokenizer": true,
"share_speech_emb": false,
"speech_encoder": "v2_2411",
"speech_encoder_ckpt_dir": null,
"speech_extend_vocab_size": 256,
"speech_feature_dim": 640,
"speech_output_control_format": "",
"speech_projection_dim": 1536,
"speech_separate_softmax": true,
"speech_train_audio_end": false,
"speech_train_audio_start": false,
"target_speaker_table_size": 0,
"use_discrete_codes": true,
"use_embedding": true,
"use_fp32_for_speech_output": true,
"use_fp64": true,
"use_projection": false,
"user_embedding_by_concat": false,
"user_embedding_by_permutation": true,
"user_projection_use_mlp": false
},
"use_image": false,
"use_speech": false,
"use_video": false
},
"multiple_of": 2048,
"n_heads": 40,
"n_kv_heads": 8,
"n_layers": 3,
"non_linearity": "swiglu",
"nope_no_qk_norm": true,
"norm_affine": true,
"norm_eps": 1e-05,
"norm_type": "rmsnorm",
"num_unfrozen_layers": 0,
"output_size": 202048,
"parallel_decoding": {
"enable_fc_parallelism": true,
"fc_with_bias": false,
"first_and_last_norm_required": true,
"has_parallel_decoding": true,
"parallel_decoding_type": "EAGLE",
"share_input_output_embed_with_target": true
},
"parallel_output_norm": true,
"peft_args": null,
"pp_use_tensor_pool": false,
"pre_norm": true,
"prefetch_weight_latency": 1.0,
"qat_args": null,
"qk_norm_across_heads": false,
"qk_norm_affine": false,
"recompute_attn": false,
"recompute_fc1_out": true,
"recompute_fc3_out": true,
"recompute_q_norm": false,
"rope_attn_scale": false,
"rope_scale_factor": 16,
"rope_theta": 500000.0,
"rope_use_fp32_in_outer_product": true,
"sequence_parallel": true,
"share_emb": false,
"stochastic_depth_p_attn": 0,
"stochastic_depth_p_ffn": 0,
"te_use_fsdp_mixed_precision": true,
"use_flex_attn": false,
"use_fp8": false,
"use_qk_norm": true,
"use_rope": true,
"use_scaled_rope": true,
"use_te_layers": true,
"vocab_parallel": true,
"vocab_size": 202048
},
"model_parallel_size": 8,
"model_precheck": false,
"nan_detector_steps": 0,
"no_final_ckpt": false,
"num_layers_per_virtual_pipeline_stage": null,
"num_microbatches_with_partial_activation_checkpoints": 1,
"number_of_manifold_servers_per_host": 8,
"old_mp": -1,
"old_world_size": -1,
"optim": {
"annealing_step": 10000,
"beta1": 0.9,
"beta2": 0.95,
"clip": 1.0,
"cosine_theta": 1.0,
"cycle_length": 1.0,
"decay_length_fraction": 0.1,
"epsilon": 1e-08,
"exp_factor": 0.5,
"fused": null,
"grad_accumulate_steps": 1,
"independent_weight_decay": false,
"lr": 0.0002,
"lr_min_ratio": 0.1,
"modality_order": "text,vision,speech,speech_full_duplex",
"non_nope_lr_mult": null,
"nope_lr_mult": null,
"scheduler": "constant",
"start_annealing_step": -1,
"use_fp32_copy_optim": true,
"vision_encoder_lr": null,
"vision_projection_lr": null,
"warmup": 400,
"weight_decay": 0.1
},
"optimize_backward_concat": false,
"overlap_p2p_communication": true,
"paft": {
"all_reduce_timeout_grow_ms": 300000,
"all_reduce_timeout_ms": 60000,
"ctran_port_base": 18700,
"enable": false,
"ib_exchange_port_base": 18600,
"max_quorum_num_retries": 5,
"max_step_retries": 5,
"min_replicas_to_run": null,
"qp_connect_timeout_ms": null,
"replica_collective_timeout_s": 600,
"send_recv_timeout_ms": 5000,
"startup_sleep_ms": 10000,
"test_only_barrier_timeout_s": 180,
"test_only_skip_ftar": false
},
"periodic_gpu_check": false,
"pg_tuning_options_from_yaml": "",
"pipeline_parallel_microbatch_size": 1,
"pipeline_parallel_size": 1,
"pipeline_strategy": "dora",
"power_consumer": {
"enable": false,
"run_delay_steps": 0,
"run_duration_steps": 100,
"run_freq": 1000,
"run_mode": "periodic"
},
"pp_num_warm_up_microbatch_ratio": 1.0,
"profile_acc_events": false,
"profile_barrier_timeout_s": 0,
"profile_freq": -1,
"profile_num_steps_active": 1,
"profile_record_shapes": true,
"profile_with_stack": false,
"py_spy_args": {
"active_seconds": 600,
"format": "flamegraph",
"freq": -1,
"rank0_only": true,
"rate": 50,
"start_offset": 10
},
"recompute_all_mb": false,
"reshard_after_forward": true,
"restore_dataloader_position": false,
"root_dump_dir": "/mnt/wsfuse/outputs/xldumps",
"runtime_nccl_timeout_s": 600,
"sample_across_datasets": true,
"seq_len": 8192,
"skip_evals_during_training": true,
"slurm": {
"global_rank": 0,
"is_slurm_job": false,
"role_index": 0,
"role_rank": 0,
"role_replica_count": 1,
"role_world_size": 256,
"world_size": 256
},
"speech_loss": {
"aux_aligned_text_loss": false,
"aux_user_loss_weight": 0.9,
"dual_channel_aux_user_loss_weight": 0.9,
"enable": false,
"force_simulated_sys_loss": true,
"full_duplex_dual_loss_mode": "sample",
"kind": "separate_softmax",
"log_logits": false,
"maybe_tool_token_loss_weight": null,
"perfect_silence_id": null,
"speech_loss_weight": null,
"system_floors_weight": null,
"system_perfect_silence_weight": 0.0,
"system_text_escape_audio_weight": null,
"user_text_escape_audio_weight": null
},
"steps": 1050000,
"text_only_steps": null,
"tokenizer": {
"path": "/mnt/wsfuse/tokenizers/tiktoken/l4_200k_base",
"version": "llama4_tiktoken_v6"
},
"tokenizer_dir": "/mnt/wsfuse/tokenizers/tiktoken",
"torch_seed": 0,
"unlimited_steps": false,
"use_sum_loss": false,
"valid": {
"batch_size": 32,
"debug": false,
"majority_voting": 0,
"n_batches": 100,
"ppl_files_str": "",
"prompt_path": "",
"random_fewshots": false,
"seed": 42,
"seq_len": 2048,
"skip_sanity_check": false,
"tasks_root_dir": "",
"tasks_str": "",
"temperature": 0.0,
"top_k": 0,
"top_p": 0.0,
"use_sampling": false,
"write_eval": false
},
"z_loss_multiplier": 0.0
}