Junyi42 committed on
Commit 802680b · verified · Parent: 2f3bbdc

Upload checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test

Files changed (21)
  1. .gitattributes +2 -0
  2. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/debug-internal.log +8 -8
  3. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/debug.log +23 -23
  4. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/config.yaml +1 -0
  5. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log +174 -200
  6. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/config.yaml +437 -0
  7. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log +601 -0
  8. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/wandb-metadata.json +1 -0
  9. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/wandb-summary.json +1 -0
  10. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log +1 -0
  11. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb +2 -2
  12. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt +354 -0
  13. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log +7 -0
  14. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log +8 -0
  15. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log +24 -0
  16. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb +3 -0
  17. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt +354 -0
  18. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log +7 -0
  19. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log +8 -0
  20. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log +24 -0
  21. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb +3 -0
.gitattributes CHANGED
@@ -210,3 +210,5 @@ checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_ji
  checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/wandb/offline-run-20260104_091756-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed-run0/run-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed-run0.wandb filter=lfs diff=lfs merge=lfs -text
  checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb filter=lfs diff=lfs merge=lfs -text
  checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce/wandb/offline-run-20260104_093254-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema993_hashed-run0/run-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema993_hashed-run0.wandb filter=lfs diff=lfs merge=lfs -text
+ checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb filter=lfs diff=lfs merge=lfs -text
+ checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb filter=lfs diff=lfs merge=lfs -text
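Note: the two added rules route the new run-*.wandb binaries through Git LFS (filter=lfs diff=lfs merge=lfs) and mark them as binary (-text). A minimal sketch of how such rules can be generated for every large .wandb file under a tree; the 10 MB cutoff and the helper name are assumptions for illustration, not part of this repo:

from pathlib import Path

LFS_RULE = "{path} filter=lfs diff=lfs merge=lfs -text"
SIZE_CUTOFF = 10 * 1024 * 1024  # assumed 10 MB threshold, pick your own

def lfs_rules(root: str, pattern: str = "**/*.wandb") -> list[str]:
    """One .gitattributes line per matching file above the cutoff."""
    return [
        LFS_RULE.format(path=f.as_posix())
        for f in sorted(Path(root).glob(pattern))
        if f.is_file() and f.stat().st_size >= SIZE_CUTOFF
    ]

if __name__ == "__main__":
    print("\n".join(lfs_rules(".")))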
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/debug-internal.log CHANGED
@@ -1,8 +1,8 @@
- {"time":"2026-01-04T09:04:29.920066674Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
- {"time":"2026-01-04T09:04:30.241997449Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
- {"time":"2026-01-04T09:04:30.242081506Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.242332296Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.242363843Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.242420429Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.242455Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.243628157Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
+ {"time":"2026-01-04T09:41:58.817034119Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+ {"time":"2026-01-04T09:41:59.538864599Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+ {"time":"2026-01-04T09:41:59.538957051Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539033043Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539155721Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539184904Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539218617Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539987313Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/debug.log CHANGED
@@ -1,24 +1,24 @@
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Configure stats pid to 13230
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Loading settings from environment variables
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:init():841] calling init triggers
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Configure stats pid to 49730
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from environment variables
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():841] calling init triggers
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
  config: {'_wandb': {}}
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:init():889] starting backend
- 2026-01-04 09:04:29,745 INFO MainThread:13230 [wandb_init.py:init():892] sending inform_init request
- 2026-01-04 09:04:29,753 INFO MainThread:13230 [wandb_init.py:init():900] backend started and connected
- 2026-01-04 09:04:29,755 INFO MainThread:13230 [wandb_init.py:init():970] updated telemetry
- 2026-01-04 09:04:29,763 INFO MainThread:13230 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
- 2026-01-04 09:04:30,245 INFO MainThread:13230 [wandb_init.py:init():1041] starting run threads in backend
- 2026-01-04 09:04:30,693 INFO MainThread:13230 [wandb_run.py:_console_start():2521] atexit reg
- 2026-01-04 09:04:30,693 INFO MainThread:13230 [wandb_run.py:_redirect():2369] redirect: wrap_raw
- 2026-01-04 09:04:30,693 INFO MainThread:13230 [wandb_run.py:_redirect():2438] Wrapping output streams.
- 2026-01-04 09:04:30,693 INFO MainThread:13230 [wandb_run.py:_redirect():2461] Redirects installed.
- 2026-01-04 09:04:30,697 INFO MainThread:13230 [wandb_init.py:init():1081] run started, returning control to user process
- 2026-01-04 09:04:30,699 INFO MainThread:13230 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 10, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
- 2026-01-04 09:04:30,700 INFO MainThread:13230 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
- 2026-01-04 09:04:30,700 INFO MainThread:13230 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():889] starting backend
+ 2026-01-04 09:41:58,797 INFO MainThread:49730 [wandb_init.py:init():892] sending inform_init request
+ 2026-01-04 09:41:58,805 INFO MainThread:49730 [wandb_init.py:init():900] backend started and connected
+ 2026-01-04 09:41:58,807 INFO MainThread:49730 [wandb_init.py:init():970] updated telemetry
+ 2026-01-04 09:41:58,816 INFO MainThread:49730 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+ 2026-01-04 09:41:59,542 INFO MainThread:49730 [wandb_init.py:init():1041] starting run threads in backend
+ 2026-01-04 09:41:59,917 INFO MainThread:49730 [wandb_run.py:_console_start():2521] atexit reg
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2438] Wrapping output streams.
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2461] Redirects installed.
+ 2026-01-04 09:41:59,921 INFO MainThread:49730 [wandb_init.py:init():1081] run started, returning control to user process
+ 2026-01-04 09:41:59,922 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 10, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
+ 2026-01-04 09:41:59,923 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
+ 2026-01-04 09:41:59,924 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
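Note: the three config_cb records at the end hold the full run configuration as flat Python dict literals (training, model, and data arguments respectively). A sketch, assuming that log format, of recovering them for programmatic comparison between runs:

import ast

def config_from_debug_log(path: str) -> dict:
    """Merge the dict literals from the config_cb lines into one dict."""
    merged = {}
    with open(path) as fh:
        for line in fh:
            if "config_cb" in line and "{" in line:
                merged.update(ast.literal_eval(line[line.index("{"):].strip()))
    return merged

cfg = config_from_debug_log("wandb/debug.log")
print(cfg["lr"], cfg["total_steps"], cfg["sharding_strategy"])  # 2e-05 5000 HYBRID_SHARD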
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/config.yaml CHANGED
@@ -34,6 +34,7 @@ _wandb:
    - 4
    - 13
    - 14
+   - 37
    - 42
    - 61
    4: 3.11.10
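Note: this hunk adds one feature code (37) to a telemetry list inside the `_wandb` block; judging by the full config.yaml added below, the neighboring `4:` and `5:` keys mirror the Python and CLI versions. The rest of each config.yaml wraps every hyperparameter in a desc/value pair, so it is convenient to flatten it back into a plain dict. A minimal sketch, assuming PyYAML is installed:

import yaml

def flatten_wandb_config(path: str) -> dict:
    """Unwrap the {desc: ..., value: ...} envelopes of a wandb config.yaml."""
    with open(path) as fh:
        raw = yaml.safe_load(fh)
    return {
        key: entry["value"]
        for key, entry in raw.items()
        if isinstance(entry, dict) and "value" in entry and key != "_wandb"
    }

cfg = flatten_wandb_config("files/config.yaml")
print(cfg["lr"], cfg["ema"])  # 2e-05 0.993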
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log CHANGED
@@ -1,176 +1,3 @@
- FullyShardedDataParallel(
- (_fsdp_wrapped_module): Bagel(
- (language_model): Qwen2ForCausalLM(
- (model): Qwen2Model(
- (embed_tokens): Embedding(152064, 3584)
- (layers): ModuleList(
- (0-27): 28 x FullyShardedDataParallel(
- (_fsdp_wrapped_module): CheckpointWrapper(
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
- (self_attn): PackedAttentionMoT(
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
- )
- (mlp): Qwen2MLP(
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
- (act_fn): SiLU()
- )
- (mlp_moe_gen): Qwen2MLP(
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
- (act_fn): SiLU()
- )
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
- )
- )
- )
- )
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
- (rotary_emb): Qwen2RotaryEmbedding()
- )
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
- )
- (time_embedder): FullyShardedDataParallel(
- (_fsdp_wrapped_module): TimestepEmbedder(
- (mlp): Sequential(
- (0): Linear(in_features=256, out_features=3584, bias=True)
- (1): SiLU()
- (2): Linear(in_features=3584, out_features=3584, bias=True)
- )
- )
- )
- (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
- (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
- (latent_pos_embed): FullyShardedDataParallel(
- (_fsdp_wrapped_module): PositionEmbedding()
- )
- (vit_model): SiglipVisionModel(
- (vision_model): FullyShardedDataParallel(
- (_fsdp_wrapped_module): SiglipVisionTransformer(
- (embeddings): SiglipVisionEmbeddings(
- (position_embedding): Embedding(4900, 1152)
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
- )
- (encoder): SiglipEncoder(
- (layers): ModuleList(
- (0-25): 26 x FullyShardedDataParallel(
- (_fsdp_wrapped_module): CheckpointWrapper(
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
- (self_attn): SiglipFlashAttention2(
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
- )
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
- (mlp): SiglipMLP(
- (activation_fn): PytorchGELUTanh()
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
- )
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
- )
- )
- )
- )
- )
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
- )
- )
- )
- (connector): FullyShardedDataParallel(
- (_fsdp_wrapped_module): CheckpointWrapper(
- (_checkpoint_wrapped_module): MLPconnector(
- (activation_fn): PytorchGELUTanh()
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
- )
- )
- )
- (vit_pos_embed): FullyShardedDataParallel(
- (_fsdp_wrapped_module): PositionEmbedding()
- )
- )
- )
- _flat_param True
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- time_embedder._fsdp_wrapped_module._flat_param True
- latent_pos_embed._fsdp_wrapped_module._flat_param False
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_pos_embed._fsdp_wrapped_module._flat_param False
- Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
- Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
  wandb: Detected [huggingface_hub.inference] in use.
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -1020,6 +847,179 @@ ImportError: cannot import name 'NaiveCache' from 'modeling.bagel' (/home/cloudu
  [2026-01-03 11:25:55] (step=0000804) Train Loss mse: 0.0543, Train Loss ce: 0.0000, Train Steps/Sec: 0.11,
  [2026-01-03 11:26:08] (step=0000805) Train Loss mse: 0.0459, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
  [2026-01-03 11:26:20] (step=0000806) Train Loss mse: 0.0528, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
+ FullyShardedDataParallel(
+ (_fsdp_wrapped_module): Bagel(
+ (language_model): Qwen2ForCausalLM(
+ (model): Qwen2Model(
+ (embed_tokens): Embedding(152064, 3584)
+ (layers): ModuleList(
+ (0-27): 28 x FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
+ (self_attn): PackedAttentionMoT(
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
+ )
+ (mlp): Qwen2MLP(
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+ (act_fn): SiLU()
+ )
+ (mlp_moe_gen): Qwen2MLP(
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+ (act_fn): SiLU()
+ )
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+ )
+ )
+ )
+ )
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+ (rotary_emb): Qwen2RotaryEmbedding()
+ )
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
+ )
+ (time_embedder): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): TimestepEmbedder(
+ (mlp): Sequential(
+ (0): Linear(in_features=256, out_features=3584, bias=True)
+ (1): SiLU()
+ (2): Linear(in_features=3584, out_features=3584, bias=True)
+ )
+ )
+ )
+ (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
+ (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
+ (latent_pos_embed): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): PositionEmbedding()
+ )
+ (vit_model): SiglipVisionModel(
+ (vision_model): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
+ (embeddings): SiglipVisionEmbeddings(
+ (position_embedding): Embedding(4900, 1152)
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
+ )
+ (encoder): SiglipEncoder(
+ (layers): ModuleList(
+ (0-25): 26 x FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
+ (self_attn): SiglipFlashAttention2(
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+ )
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+ (mlp): SiglipMLP(
+ (activation_fn): PytorchGELUTanh()
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+ )
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+ )
+ )
+ )
+ )
+ )
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+ )
+ )
+ )
+ (connector): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): MLPconnector(
+ (activation_fn): PytorchGELUTanh()
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
+ )
+ )
+ )
+ (vit_pos_embed): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): PositionEmbedding()
+ )
+ )
+ )
+ _flat_param True
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ time_embedder._fsdp_wrapped_module._flat_param True
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
  [2026-01-03 11:26:33] (step=0000807) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
  [2026-01-03 11:26:50] (step=0000808) Train Loss mse: 0.0267, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
  [2026-01-03 11:27:03] (step=0000809) Train Loss mse: 0.0418, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
@@ -3097,33 +3097,7 @@ ImportError: cannot import name 'NaiveCache' from 'modeling.bagel' (/home/cloudu
  [2026-01-03 19:16:05] (step=0002814) Train Loss mse: 0.0356, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
  [2026-01-03 19:16:18] (step=0002815) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
  [2026-01-03 19:16:34] (step=0002816) Train Loss mse: 0.0326, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:16:48] (step=0002817) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:17:01] (step=0002818) Train Loss mse: 0.0298, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:17:14] (step=0002819) Train Loss mse: 0.0316, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:17:27] (step=0002820) Train Loss mse: 0.0282, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:17:40] (step=0002821) Train Loss mse: 0.0263, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:17:54] (step=0002822) Train Loss mse: 0.0310, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:18:05] (step=0002823) Train Loss mse: 0.0302, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
- [2026-01-03 19:18:17] (step=0002824) Train Loss mse: 0.0385, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:18:33] (step=0002825) Train Loss mse: 0.0330, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:18:46] (step=0002826) Train Loss mse: 0.0317, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:19:02] (step=0002827) Train Loss mse: 0.0244, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:19:15] (step=0002828) Train Loss mse: 0.0418, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:19:29] (step=0002829) Train Loss mse: 0.0240, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:19:40] (step=0002830) Train Loss mse: 0.0307, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
- [2026-01-03 19:19:56] (step=0002831) Train Loss mse: 0.0293, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:20:10] (step=0002832) Train Loss mse: 0.0304, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:20:26] (step=0002833) Train Loss mse: 0.0202, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:20:38] (step=0002834) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:20:51] (step=0002835) Train Loss mse: 0.0340, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:21:06] (step=0002836) Train Loss mse: 0.0237, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:21:19] (step=0002837) Train Loss mse: 0.0333, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:21:33] (step=0002838) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:21:47] (step=0002839) Train Loss mse: 0.0301, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:21:59] (step=0002840) Train Loss mse: 0.0299, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:22:14] (step=0002841) Train Loss mse: 0.0305, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:22:28] (step=0002842) Train Loss mse: 0.0285, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:22:44] (step=0002843) Train Loss mse: 0.0367, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-03 19:16:48
  [2026-01-03 19:22:57] (step=0002844) Train Loss mse: 0.0234, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
  [2026-01-03 19:23:13] (step=0002845) Train Loss mse: 0.0283, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
  [2026-01-03 19:23:29] (step=0002846) Train Loss mse: 0.0367, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
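Note: the block removed by the first hunk (and re-added further down in the file by the second) is a printout of the FSDP-wrapped Bagel model, followed by one line per FSDP flat parameter with its requires_grad flag; False marks frozen groups such as the two PositionEmbedding wrappers. A hypothetical reconstruction of how such a dump can be produced (not taken from the repo's code):

def dump_fsdp_model(model) -> None:
    """Print the wrapped module tree, then each flattened FSDP parameter
    with its trainability flag (the True/False column in the log)."""
    print(model)
    for name, param in model.named_parameters():
        if name.endswith("_flat_param"):
            print(name, param.requires_grad)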
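The per-step lines in this log follow a fixed format, so the loss curve can be recovered from output.log directly; a sketch assuming exactly that format:

import re

STEP_RE = re.compile(r"\(step=(\d+)\) Train Loss mse: ([\d.]+), Train Loss ce: ([\d.]+)")

def mse_by_step(path: str) -> dict[int, float]:
    """Map step number -> MSE loss, parsed from the training log."""
    losses = {}
    with open(path) as fh:
        for line in fh:
            m = STEP_RE.search(line)
            if m:
                losses[int(m.group(1))] = float(m.group(2))
    return losses

losses = mse_by_step("files/output.log")
last = [v for _, v in sorted(losses.items())[-100:]]
print(f"mean MSE over last {len(last)} logged steps: {sum(last) / len(last):.4f}")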
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/config.yaml ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ wandb_version: 1
+
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.11.10
+     cli_version: 0.23.1
+     framework: huggingface
+     huggingface_version: 4.49.0
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1767517469
+     t:
+       1:
+       - 1
+       - 5
+       - 11
+       - 41
+       - 49
+       - 53
+       - 71
+       - 105
+       2:
+       - 1
+       - 5
+       - 11
+       - 41
+       - 49
+       - 53
+       - 71
+       - 105
+       3:
+       - 4
+       - 13
+       - 14
+       - 37
+       - 42
+       - 61
+       4: 3.11.10
+       5: 0.23.1
+       6: 4.49.0
+       13: linux-x86_64
+     e:
+       c4f1w52emnh3bkfwabjlnv9ozcfaekz0:
+         os: Linux-6.6.93+-x86_64-with-glibc2.35
+         python: CPython 3.11.10
+         started_at: '2026-01-04T09:04:29.298919Z'
+         args:
+         - --dataset_config_file
+         - ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+         - --eval_dataset_config_file
+         - ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+         - --viz_dataset_config_file
+         - ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+         - --inference_hash_file
+         - /home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json
+         - --train_data_dir
+         - /home/clouduser/Code/data/gym/jigsaw-swap_v5/train/
+         - --train_jsonl_path
+         - /home/clouduser/Code/data/gym/jigsaw-swap_v5/train/
+         - --eval_data_dir
+         - /home/clouduser/Code/data/gym/jigsaw-swap_v5/val/
+         - --eval_jsonl_path
+         - /home/clouduser/Code/data/gym/jigsaw-swap_v5/val/
+         - --model_path
+         - /home/clouduser/Code/Models/BAGEL-7B-MoT
+         - --layer_module
+         - Qwen2MoTDecoderLayer
+         - --max_latent_size
+         - '64'
+         - --resume-from
+         - /home/clouduser/Code/Models/BAGEL-7B-MoT
+         - --finetune_from_hf
+         - 'True'
+         - --auto_resume
+         - 'False'
+         - --resume-model-only
+         - 'True'
+         - --finetune-from-ema
+         - 'True'
+         - --log_every
+         - '1'
+         - --lr
+         - 2e-5
+         - --warmup_steps
+         - '300'
+         - --lr_scheduler
+         - cosine
+         - --num_worker
+         - '1'
+         - --expected_num_tokens
+         - '20000'
+         - --max_num_tokens
+         - '20000'
+         - --max_num_tokens_per_sample
+         - '20000'
+         - --visual_und
+         - 'True'
+         - --save_every
+         - '2500'
+         - --total_steps
+         - '5000'
+         - --text_cond_dropout_prob
+         - '0.0'
+         - --vae_cond_dropout_prob
+         - '0.0'
+         - --vit_cond_dropout_prob
+         - '0.0'
+         - --checkpoint_dir
+         - /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+         - --wandb_project
+         - bagel
+         - --wandb_name
+         - vlm_gym_jigsaw_one_img_lr2e_5_mse_only
+         - --wandb_dir
+         - /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+         - --wandb_offline
+         - 'True'
+         - --viz_every
+         - '10'
+         program: /home/clouduser/Code/Github/unified_world_model/train/pretrain_unified_navit.py
+         code_path: train/pretrain_unified_navit.py
+         code_path_local: train/pretrain_unified_navit.py
+         git:
+           remote_url: https://github.com/para-lost/unified_world_model
+           commit: be2c19982b710041da81a85f55c2877ea0e2e2c6
+         root: /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+         host: junyizhang-launch-new-219635706-1-0
+         executable: /opt/conda/bin/python3.11
+         cpu_count: 48
+         cpu_count_logical: 96
+         gpu_type: NVIDIA A100-SXM4-80GB
+         gpu_count: 8
+         disk:
+           /:
+             total: '1052461830144'
+             used: '261623291904'
+         memory:
+           total: '1437332611072'
+         gpu_nvidia:
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-71d2c6f0-c9e9-2110-f69b-f7fd558363b2
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-a25620fe-6924-8936-d5a8-9dfb1c7177e8
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-2b69635d-5f31-ec83-bcde-b1df07b60307
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-4469aac3-d575-de3d-8715-1c34b68c640b
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-da768b0d-e500-f726-164e-2e2379616f19
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-ae036ce5-57c1-a8df-01b2-21cf23bc619b
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-d1ab738b-49ca-ed1f-6700-5336be458e1f
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-cb0ff8e1-e17f-38b9-bc14-4dcc9465b322
+         cuda_version: '12.2'
+         writer_id: c4f1w52emnh3bkfwabjlnv9ozcfaekz0
+ visual_gen:
+   desc: null
+   value: true
+ visual_und:
+   desc: null
+   value: true
+ results_dir:
+   desc: null
+   value: results
+ checkpoint_dir:
+   desc: null
+   value: /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+ wandb_project:
+   desc: null
+   value: bagel
+ wandb_name:
+   desc: null
+   value: vlm_gym_jigsaw_one_img_lr2e_5_mse_only
+ wandb_runid:
+   desc: null
+   value: '0'
+ wandb_resume:
+   desc: null
+   value: allow
+ wandb_offline:
+   desc: null
+   value: true
+ wandb_dir:
+   desc: null
+   value: /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+ global_seed:
+   desc: null
+   value: 4396
+ auto_resume:
+   desc: null
+   value: false
+ resume_from:
+   desc: null
+   value: /home/clouduser/Code/Models/BAGEL-7B-MoT
+ resume_model_only:
+   desc: null
+   value: true
+ finetune_from_ema:
+   desc: null
+   value: true
+ finetune_from_hf:
+   desc: null
+   value: true
+ log_every:
+   desc: null
+   value: 1
+ save_every:
+   desc: null
+   value: 2500
+ total_steps:
+   desc: null
+   value: 5000
+ warmup_steps:
+   desc: null
+   value: 300
+ lr_scheduler:
+   desc: null
+   value: cosine
+ lr:
+   desc: null
+   value: 2.0e-05
+ min_lr:
+   desc: null
+   value: 1.0e-07
+ beta1:
+   desc: null
+   value: 0.9
+ beta2:
+   desc: null
+   value: 0.95
+ eps:
+   desc: null
+   value: 1.0e-15
+ ema:
+   desc: null
+   value: 0.993
+ max_grad_norm:
+   desc: null
+   value: 1.0
+ timestep_shift:
+   desc: null
+   value: 1.0
+ mse_weight:
+   desc: null
+   value: 1.0
+ ce_weight:
+   desc: null
+   value: 1.0
+ ce_loss_reweighting:
+   desc: null
+   value: false
+ expected_num_tokens:
+   desc: null
+   value: 20000
+ num_replicate:
+   desc: null
+   value: 1
+ num_shard:
+   desc: null
+   value: 8
+ sharding_strategy:
+   desc: null
+   value: HYBRID_SHARD
+ backward_prefetch:
+   desc: null
+   value: BACKWARD_PRE
+ cpu_offload:
+   desc: null
+   value: false
+ freeze_llm:
+   desc: null
+   value: false
+ freeze_vit:
+   desc: null
+   value: false
+ freeze_vae:
+   desc: null
+   value: true
+ freeze_und:
+   desc: null
+   value: false
+ copy_init_moe:
+   desc: null
+   value: true
+ use_flex:
+   desc: null
+   value: false
+ eval_every:
+   desc: null
+   value: 500
+ num_eval_batches:
+   desc: null
+   value: 20
+ use_ema_for_eval:
+   desc: null
+   value: true
+ viz_every:
+   desc: null
+   value: 10
+ viz_n:
+   desc: null
+   value: 8
+ viz_outdir:
+   desc: null
+   value: results/viz
+ eval_dataset_config_file:
+   desc: null
+   value: ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+ viz_dataset_config_file:
+   desc: null
+   value: ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+ save_ema_only:
+   desc: null
+   value: true
+ save_optimizer:
+   desc: null
+   value: false
+ model_path:
+   desc: null
+   value: /home/clouduser/Code/Models/BAGEL-7B-MoT
+ llm_path:
+   desc: null
+   value: hf/Qwen2.5-0.5B-Instruct/
+ llm_qk_norm:
+   desc: null
+   value: true
+ tie_word_embeddings:
+   desc: null
+   value: false
+ layer_module:
+   desc: null
+   value: Qwen2MoTDecoderLayer
+ vae_path:
+   desc: null
+   value: flux/vae/ae.safetensors
+ vit_path:
+   desc: null
+   value: hf/siglip-so400m-14-980-flash-attn2-navit/
+ max_latent_size:
+   desc: null
+   value: 64
+ latent_patch_size:
+   desc: null
+   value: 2
+ vit_patch_size:
+   desc: null
+   value: 14
+ vit_max_num_patch_per_side:
+   desc: null
+   value: 70
+ connector_act:
+   desc: null
+   value: gelu_pytorch_tanh
+ interpolate_pos:
+   desc: null
+   value: false
+ vit_select_layer:
+   desc: null
+   value: -2
+ vit_rope:
+   desc: null
+   value: false
+ text_cond_dropout_prob:
+   desc: null
+   value: 0.0
+ vae_cond_dropout_prob:
+   desc: null
+   value: 0.0
+ vit_cond_dropout_prob:
+   desc: null
+   value: 0.0
+ dataset_config_file:
+   desc: null
+   value: ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+ train_data_dir:
+   desc: null
+   value: /home/clouduser/Code/data/gym/jigsaw-swap_v5/train/
+ train_jsonl_path:
+   desc: null
+   value: /home/clouduser/Code/data/gym/jigsaw-swap_v5/train/
+ eval_data_dir:
+   desc: null
+   value: /home/clouduser/Code/data/gym/jigsaw-swap_v5/val/
+ eval_jsonl_path:
+   desc: null
+   value: /home/clouduser/Code/data/gym/jigsaw-swap_v5/val/
+ inference_hash_file:
+   desc: null
+   value: /home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json
+ prefetch_factor:
+   desc: null
+   value: 2
+ num_workers:
+   desc: null
+   value: 1
+ max_num_tokens_per_sample:
+   desc: null
+   value: 20000
+ max_num_tokens:
+   desc: null
+   value: 20000
+ prefer_buffer_before:
+   desc: null
+   value: 16384
+ max_buffer_size:
+   desc: null
+   value: 50
+ data_seed:
+   desc: null
+   value: 42
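The run settings above can be recovered programmatically from this run's files/config.yaml. A minimal sketch, assuming only wandb's one-level {desc, value} layout shown above; the file path and the printed keys are illustrative, not part of the repo:

    # Flatten wandb's config.yaml (each key stored as {desc, value}) into a plain dict.
    import yaml

    def load_wandb_config(path="files/config.yaml"):
        with open(path) as f:
            raw = yaml.safe_load(f)
        # Keep only the {desc, value} entries; scalars like wandb_version are skipped.
        return {k: v["value"] for k, v in raw.items()
                if isinstance(v, dict) and "value" in v}

    cfg = load_wandb_config()
    print(cfg["lr"], cfg["total_steps"])  # expected: 2e-05 5000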
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log ADDED
@@ -0,0 +1,601 @@
+ FullyShardedDataParallel(
+   (_fsdp_wrapped_module): Bagel(
+     (language_model): Qwen2ForCausalLM(
+       (model): Qwen2Model(
+         (embed_tokens): Embedding(152064, 3584)
+         (layers): ModuleList(
+           (0-27): 28 x FullyShardedDataParallel(
+             (_fsdp_wrapped_module): CheckpointWrapper(
+               (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
+                 (self_attn): PackedAttentionMoT(
+                   (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+                   (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+                   (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+                   (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+                   (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
+                   (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
+                   (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
+                   (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
+                   (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
+                   (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
+                   (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
+                   (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
+                 )
+                 (mlp): Qwen2MLP(
+                   (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+                   (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+                   (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+                   (act_fn): SiLU()
+                 )
+                 (mlp_moe_gen): Qwen2MLP(
+                   (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+                   (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+                   (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+                   (act_fn): SiLU()
+                 )
+                 (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
+                 (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+                 (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
+                 (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+               )
+             )
+           )
+         )
+         (norm): Qwen2RMSNorm((3584,), eps=1e-06)
+         (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+         (rotary_emb): Qwen2RotaryEmbedding()
+       )
+       (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
+     )
+     (time_embedder): FullyShardedDataParallel(
+       (_fsdp_wrapped_module): TimestepEmbedder(
+         (mlp): Sequential(
+           (0): Linear(in_features=256, out_features=3584, bias=True)
+           (1): SiLU()
+           (2): Linear(in_features=3584, out_features=3584, bias=True)
+         )
+       )
+     )
+     (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
+     (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
+     (latent_pos_embed): FullyShardedDataParallel(
+       (_fsdp_wrapped_module): PositionEmbedding()
+     )
+     (vit_model): SiglipVisionModel(
+       (vision_model): FullyShardedDataParallel(
+         (_fsdp_wrapped_module): SiglipVisionTransformer(
+           (embeddings): SiglipVisionEmbeddings(
+             (position_embedding): Embedding(4900, 1152)
+             (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
+           )
+           (encoder): SiglipEncoder(
+             (layers): ModuleList(
+               (0-25): 26 x FullyShardedDataParallel(
+                 (_fsdp_wrapped_module): CheckpointWrapper(
+                   (_checkpoint_wrapped_module): SiglipEncoderLayer(
+                     (self_attn): SiglipFlashAttention2(
+                       (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                       (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                       (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                       (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                     )
+                     (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+                     (mlp): SiglipMLP(
+                       (activation_fn): PytorchGELUTanh()
+                       (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                       (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+                     )
+                     (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+                   )
+                 )
+               )
+             )
+           )
+           (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+         )
+       )
+     )
+     (connector): FullyShardedDataParallel(
+       (_fsdp_wrapped_module): CheckpointWrapper(
+         (_checkpoint_wrapped_module): MLPconnector(
+           (activation_fn): PytorchGELUTanh()
+           (fc1): Linear(in_features=1152, out_features=3584, bias=True)
+           (fc2): Linear(in_features=3584, out_features=3584, bias=True)
+         )
+       )
+     )
+     (vit_pos_embed): FullyShardedDataParallel(
+       (_fsdp_wrapped_module): PositionEmbedding()
+     )
+   )
+ )
+ _flat_param True
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ time_embedder._fsdp_wrapped_module._flat_param True
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
+ wandb: Detected [huggingface_hub.inference] in use.
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+ [2026-01-04 09:04:35] Training arguments TrainingArguments(visual_gen=True, visual_und=True, results_dir='results', checkpoint_dir='/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', wandb_project='bagel', wandb_name='vlm_gym_jigsaw_one_img_lr2e_5_mse_only', wandb_runid='0', wandb_resume='allow', wandb_offline=True, wandb_dir='/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', global_seed=4396, auto_resume=False, resume_from='/home/clouduser/Code/Models/BAGEL-7B-MoT', resume_model_only=True, finetune_from_ema=True, finetune_from_hf=True, log_every=1, save_every=2500, total_steps=5000, warmup_steps=300, lr_scheduler='cosine', lr=2e-05, min_lr=1e-07, beta1=0.9, beta2=0.95, eps=1e-15, ema=0.993, max_grad_norm=1.0, timestep_shift=1.0, mse_weight=1.0, ce_weight=1.0, ce_loss_reweighting=False, expected_num_tokens=20000, num_replicate=1, num_shard=8, sharding_strategy='HYBRID_SHARD', backward_prefetch='BACKWARD_PRE', cpu_offload=False, freeze_llm=False, freeze_vit=False, freeze_vae=True, freeze_und=False, copy_init_moe=True, use_flex=False, eval_every=500, num_eval_batches=20, use_ema_for_eval=True, viz_every=10, viz_n=8, viz_outdir='results/viz', eval_dataset_config_file='./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', viz_dataset_config_file='./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', save_ema_only=True, save_optimizer=False)
+ [2026-01-04 09:04:35] Model arguments ModelArguments(model_path='/home/clouduser/Code/Models/BAGEL-7B-MoT', llm_path='hf/Qwen2.5-0.5B-Instruct/', llm_qk_norm=True, tie_word_embeddings=False, layer_module='Qwen2MoTDecoderLayer', vae_path='flux/vae/ae.safetensors', vit_path='hf/siglip-so400m-14-980-flash-attn2-navit/', max_latent_size=64, latent_patch_size=2, vit_patch_size=14, vit_max_num_patch_per_side=70, connector_act='gelu_pytorch_tanh', interpolate_pos=False, vit_select_layer=-2, vit_rope=False, text_cond_dropout_prob=0.0, vae_cond_dropout_prob=0.0, vit_cond_dropout_prob=0.0)
+ [2026-01-04 09:04:35] Data arguments DataArguments(dataset_config_file='./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', train_data_dir='/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', train_jsonl_path='/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', eval_data_dir='/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', eval_jsonl_path='/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', inference_hash_file='/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', prefetch_factor=2, num_workers=1, max_num_tokens_per_sample=20000, max_num_tokens=20000, prefer_buffer_before=16384, max_buffer_size=50, data_seed=42)
+ [2026-01-04 09:09:01] Loading checkpoint from /home/clouduser/Code/Models/BAGEL-7B-MoT.
+ [2026-01-04 09:09:12] _IncompatibleKeys(missing_keys=['latent_pos_embed.pos_embed'], unexpected_keys=[])
+ [2026-01-04 09:09:29] _IncompatibleKeys(missing_keys=['latent_pos_embed.pos_embed'], unexpected_keys=[])
+ [2026-01-04 09:10:03] Training for 5000 steps, starting at 0...
+ [2026-01-04 09:10:44] (step=0000000) Train Loss mse: 0.0571, Train Loss ce: 0.0000, Train Steps/Sec: 0.02,
+ Traceback (most recent call last):
+   File "/home/clouduser/Code/Github/unified_world_model/train/pretrain_unified_navit.py", line 713, in dump_visual_viz
+     outputs = model(
+     ^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
+     return self._call_impl(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
+     return forward_call(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/home/clouduser/Code/Github/unified_world_model/modeling/bagel/bagel.py", line 151, in forward
+     packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
+     return self._call_impl(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
+     return forward_call(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/sparse.py", line 190, in forward
+     return F.embedding(
+     ^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/functional.py", line 2551, in embedding
+     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ RuntimeError: The tensor has a non-zero number of elements, but its data is not allocated yet.
+ If you're using torch.compile/export/fx, it is likely that we are erroneously tracing into a custom kernel. To fix this, please wrap the custom kernel into an opaque custom op. Please see the following for details: https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html
+ If you're using Caffe2, Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_mutable_data() to actually allocate memory.
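One plausible reading of this failure (an assumption, not confirmed by the log): the embed_tokens weight is a view into an FSDP flat parameter whose shard data has been deallocated when the viz forward reaches it. Notably, training continues after the traceback, which suggests the viz call is already wrapped in an exception guard; a minimal sketch of such a guard, with hypothetical names (safe_dump_visual_viz, dump_fn) that are not the repo's API:

    def safe_dump_visual_viz(dump_fn, *args, logger=None, **kwargs):
        # Run the viz pass but never let a viz-only failure kill the training loop,
        # matching the behavior visible in this log (traceback printed, steps resume).
        try:
            return dump_fn(*args, **kwargs)
        except Exception:
            if logger is not None:
                logger.exception("dump_visual_viz failed; skipping this viz step")
            return None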
+ [2026-01-04 09:12:39] (step=0000001) Train Loss mse: 0.0559, Train Loss ce: 0.0000, Train Steps/Sec: 0.01,
+ [2026-01-04 09:12:52] (step=0000002) Train Loss mse: 0.0621, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:13:06] (step=0000003) Train Loss mse: 0.0709, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:13:19] (step=0000004) Train Loss mse: 0.0585, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:13:33] (step=0000005) Train Loss mse: 0.0523, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:13:49] (step=0000006) Train Loss mse: 0.0602, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:14:03] (step=0000007) Train Loss mse: 0.0612, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:14:19] (step=0000008) Train Loss mse: 0.0432, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:14:35] (step=0000009) Train Loss mse: 0.0561, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:14:47] (step=0000010) Train Loss mse: 0.0673, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
+ Traceback (most recent call last):
+   File "/home/clouduser/Code/Github/unified_world_model/train/pretrain_unified_navit.py", line 713, in dump_visual_viz
+     outputs = model(
+     ^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
+     return self._call_impl(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
+     return forward_call(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/home/clouduser/Code/Github/unified_world_model/modeling/bagel/bagel.py", line 156, in forward
+     sparse_mask = create_sparse_mask(sample_lens, split_lens, attn_modes, packed_text_embedding.device)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/home/clouduser/Code/Github/unified_world_model/data/data_utils.py", line 29, in create_sparse_mask
+     for i, (length, model) in enumerate(zip(split_lens, attn_modes)):
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ TypeError: 'NoneType' object is not iterable
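The TypeError shows create_sparse_mask receiving None for split_lens and attn_modes, i.e. the viz batch carries no split metadata. A minimal defensive sketch; the helper name and the "causal" mode string are assumptions about the batch schema, not something the repo confirms:

    def ensure_split_metadata(sample_lens, split_lens, attn_modes):
        # Fall back to one full-length causal split per packed sample so that
        # zip(split_lens, attn_modes) inside create_sparse_mask never sees None.
        if split_lens is None or attn_modes is None:
            split_lens = list(sample_lens)
            attn_modes = ["causal"] * len(split_lens)
        return split_lens, attn_modes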
+ [2026-01-04 09:15:16] (step=0000011) Train Loss mse: 0.0475, Train Loss ce: 0.0000, Train Steps/Sec: 0.03,
+ [2026-01-04 09:15:29] (step=0000012) Train Loss mse: 0.0573, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:15:41] (step=0000013) Train Loss mse: 0.0592, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:15:57] (step=0000014) Train Loss mse: 0.0525, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:16:11] (step=0000015) Train Loss mse: 0.0574, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:16:27] (step=0000016) Train Loss mse: 0.0515, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:16:43] (step=0000017) Train Loss mse: 0.0759, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:16:56] (step=0000018) Train Loss mse: 0.0802, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:17:12] (step=0000019) Train Loss mse: 0.0643, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:17:28] (step=0000020) Train Loss mse: 0.0476, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:17:58] (step=0000021) Train Loss mse: 0.0642, Train Loss ce: 0.0000, Train Steps/Sec: 0.03,
+ [2026-01-04 09:18:11] (step=0000022) Train Loss mse: 0.0536, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:18:27] (step=0000023) Train Loss mse: 0.0590, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:18:40] (step=0000024) Train Loss mse: 0.0534, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:18:56] (step=0000025) Train Loss mse: 0.0469, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:19:09] (step=0000026) Train Loss mse: 0.0495, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:19:25] (step=0000027) Train Loss mse: 0.0638, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:19:38] (step=0000028) Train Loss mse: 0.0685, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:19:52] (step=0000029) Train Loss mse: 0.0469, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:20:08] (step=0000030) Train Loss mse: 0.0546, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:20:40] (step=0000031) Train Loss mse: 0.0437, Train Loss ce: 0.0000, Train Steps/Sec: 0.03,
+ [2026-01-04 09:20:53] (step=0000032) Train Loss mse: 0.0544, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:21:09] (step=0000033) Train Loss mse: 0.0477, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:21:26] (step=0000034) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:21:39] (step=0000035) Train Loss mse: 0.0571, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:21:55] (step=0000036) Train Loss mse: 0.0632, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:22:09] (step=0000037) Train Loss mse: 0.0479, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:22:25] (step=0000038) Train Loss mse: 0.0481, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:22:41] (step=0000039) Train Loss mse: 0.0573, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:22:53] (step=0000040) Train Loss mse: 0.0544, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/wandb-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"os": "Linux-6.6.93+-x86_64-with-glibc2.35", "python": "CPython 3.11.10", "started_at": "2026-01-04T09:04:29.298919Z", "args": ["--dataset_config_file", "./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml", "--eval_dataset_config_file", "./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml", "--viz_dataset_config_file", "./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml", "--inference_hash_file", "/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json", "--train_data_dir", "/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/", "--train_jsonl_path", "/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/", "--eval_data_dir", "/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/", "--eval_jsonl_path", "/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/", "--model_path", "/home/clouduser/Code/Models/BAGEL-7B-MoT", "--layer_module", "Qwen2MoTDecoderLayer", "--max_latent_size", "64", "--resume-from", "/home/clouduser/Code/Models/BAGEL-7B-MoT", "--finetune_from_hf", "True", "--auto_resume", "False", "--resume-model-only", "True", "--finetune-from-ema", "True", "--log_every", "1", "--lr", "2e-5", "--warmup_steps", "300", "--lr_scheduler", "cosine", "--num_worker", "1", "--expected_num_tokens", "20000", "--max_num_tokens", "20000", "--max_num_tokens_per_sample", "20000", "--visual_und", "True", "--save_every", "2500", "--total_steps", "5000", "--text_cond_dropout_prob", "0.0", "--vae_cond_dropout_prob", "0.0", "--vit_cond_dropout_prob", "0.0", "--checkpoint_dir", "/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test", "--wandb_project", "bagel", "--wandb_name", "vlm_gym_jigsaw_one_img_lr2e_5_mse_only", "--wandb_dir", "/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test", "--wandb_offline", "True", "--viz_every", "10"], "program": "/home/clouduser/Code/Github/unified_world_model/train/pretrain_unified_navit.py", "code_path": "train/pretrain_unified_navit.py", "code_path_local": "train/pretrain_unified_navit.py", "git": {"remote_url": "https://github.com/para-lost/unified_world_model", "commit": "be2c19982b710041da81a85f55c2877ea0e2e2c6"}, "root": "/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test", "host": "junyizhang-launch-new-219635706-1-0", "executable": "/opt/conda/bin/python3.11", "cpu_count": 48, "cpu_count_logical": 96, "gpu_type": "NVIDIA A100-SXM4-80GB", "gpu_count": 8, "disk": {"/": {"total": "1052461830144", "used": "261623291904"}}, "memory": {"total": "1437332611072"}, "gpu_nvidia": [{"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-71d2c6f0-c9e9-2110-f69b-f7fd558363b2"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-a25620fe-6924-8936-d5a8-9dfb1c7177e8"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-2b69635d-5f31-ec83-bcde-b1df07b60307"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-4469aac3-d575-de3d-8715-1c34b68c640b"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-da768b0d-e500-f726-164e-2e2379616f19"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-ae036ce5-57c1-a8df-01b2-21cf23bc619b"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": 
"85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-d1ab738b-49ca-ed1f-6700-5336be458e1f"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-cb0ff8e1-e17f-38b9-bc14-4dcc9465b322"}], "cuda_version": "12.2", "writer_id": "c4f1w52emnh3bkfwabjlnv9ozcfaekz0"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_runtime": 1103.04216952, "total_samples": 11, "mem_cache": 77354, "_timestamp": 1767518561.3092997, "eval/ce": 0, "mse": 0.05725831538438797, "ce": 0, "mem_allocated": 51190.20703125, "eval/mse": 0.06041467562317848, "_step": 39, "lr": 2.666666666666667e-06, "total_mse_tokens": 51200, "total_ce_tokens": 0, "total_norm": 0.22510729730129242}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log CHANGED
@@ -4,3 +4,4 @@
 {"time":"2026-01-04T09:04:29.745202354Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
 {"time":"2026-01-04T09:04:29.757743673Z","level":"INFO","msg":"handleInformInit: received","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
 {"time":"2026-01-04T09:04:30.242343786Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:28:49.06895492Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:c9db8970c306c984926cadb3d1739cc2ad4bd53712ed9b9744f2f84d7b4e35ca
- size 393216
+ oid sha256:9140cc820e3e9fffcb43c8f00e85dd36eaa4870b83cc86639f3359dbdfa6f7fe
+ size 491520
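
The .wandb transaction log is tracked with Git LFS, so this commit only rewrites the three-line pointer file: spec version, content sha256, and byte size (here growing from 393216 to 491520 as more run records were appended). A small parser for that pointer format, assuming only the "key value" layout visible above:

def parse_lfs_pointer(text):
    # Each pointer line is "key value": version, oid sha256:<hex>, size <bytes>.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo, "digest": digest,
            "size": int(fields["size"])}

ptr = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:9140cc820e3e9fffcb43c8f00e85dd36eaa4870b83cc86639f3359dbdfa6f7fe\n"
    "size 491520\n"
)
print(ptr["digest"], ptr["size"])  # 9140cc82... 491520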
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt ADDED
@@ -0,0 +1,354 @@
+ Brotli==1.1.0
+ MarkupSafe==3.0.2
+ PySocks==1.7.1
+ PyYAML==6.0.2
+ archspec==0.2.3
+ asttokens==2.4.1
+ astunparse==1.6.3
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ boltons==24.0.0
+ certifi==2024.8.30
+ chardet==5.2.0
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ conda==24.9.2
+ conda-build==24.9.0
+ conda_index==0.5.0
+ conda-libmamba-solver==24.9.0
+ conda-package-handling==2.4.0
+ conda_package_streaming==0.11.0
+ decorator==5.1.1
+ distro==1.9.0
+ dnspython==2.7.0
+ exceptiongroup==1.2.2
+ executing==2.1.0
+ expecttest==0.2.1
+ filelock==3.16.1
+ frozendict==2.4.6
+ fsspec==2024.10.0
+ h2==4.1.0
+ hpack==4.0.0
+ hyperframe==6.0.1
+ hypothesis==6.115.5
+ idna==3.10
+ importlib_resources==6.4.5
+ ipython==8.29.0
+ jedi==0.19.1
+ Jinja2==3.1.4
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ libarchive-c==5.1
+ libmambapy==1.5.10
+ lief==0.14.1
+ lintrunner==0.12.5
+ mamba==1.5.10
+ matplotlib-inline==0.1.7
+ menuinst==2.1.2
+ more-itertools==10.5.0
+ mpmath==1.3.0
+ networkx==3.4.2
+ ninja==1.11.1.1
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ optree==0.13.0
+ parso==0.8.4
+ pexpect==4.9.0
+ pickleshare==0.7.5
+ pillow==10.2.0
+ pkginfo==1.11.2
+ pkgutil_resolve_name==1.3.10
+ platformdirs==4.3.6
+ pluggy==1.5.0
+ prompt_toolkit==3.0.48
+ psutil==6.1.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pycosat==0.6.6
+ pycparser==2.22
+ Pygments==2.18.0
+ python-etcd==0.4.5
+ pytz==2024.2
+ referencing==0.35.1
+ requests==2.32.3
+ rpds-py==0.20.0
+ ruamel.yaml==0.18.6
+ ruamel.yaml.clib==0.2.8
+ six==1.16.0
+ sortedcontainers==2.4.0
+ soupsieve==2.5
+ stack-data==0.6.2
+ sympy==1.13.1
+ torchaudio==2.5.1+cu124
+ torchelastic==0.2.2
+ tqdm==4.66.5
+ traitlets==5.14.3
+ triton==3.1.0
+ truststore==0.9.2
+ types-dataclasses==0.6.6
+ urllib3==2.2.3
+ wcwidth==0.2.13
+ zipp==3.20.2
+ zstandard==0.23.0
+ numpy==1.24.4
+ imgcat==0.6.0
+ decord==0.6.0
+ flash_attn==2.5.8
+ contourpy==1.3.2
+ cycler==0.12.1
+ fonttools==4.61.1
+ huggingface-hub==0.29.1
+ kiwisolver==1.4.9
+ matplotlib==3.7.0
+ opencv-python==4.7.0.72
+ pyarrow==11.0.0
+ pyparsing==3.2.5
+ safetensors==0.4.5
+ scipy==1.10.1
+ sentencepiece==0.1.99
+ torch==2.5.1
+ torchvision==0.20.1
+ transformers==4.49.0
+ pip==25.3
+ setuptools==80.9.0
+ wheel==0.45.1
+ Pebble==5.1.3
+ accelerate==1.12.0
+ addftool==0.2.13
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.13.2
+ aiohttp-cors==0.8.1
+ aiosignal==1.4.0
+ airportsdata==20250909
+ annotated-doc==0.0.4
+ annotated-types==0.7.0
+ antlr4-python3-runtime==4.9.3
+ bcrypt==5.0.0
+ blobfile==3.0.0
+ cffi==2.0.0
+ cloudpickle==3.1.2
+ codetiming==1.4.0
+ colorful==0.5.8
+ compressed-tensors==0.12.2
+ cryptography==46.0.3
+ cuda-bindings==13.1.1
+ cuda-pathfinder==1.3.3
+ cuda-python==13.1.1
+ datasets==4.4.1
+ Deprecated==1.3.1
+ diskcache==5.6.3
+ distlib==0.4.0
+ docstring_parser==0.17.0
+ easydict==1.13
+ fabric==3.2.2
+ fastapi==0.124.4
+ fire==0.7.1
+ flashinfer-python==0.2.5
+ frozenlist==1.8.0
+ gevent==25.9.1
+ gitdb==4.0.12
+ GitPython==3.1.45
+ google-api-core==2.28.1
+ google-auth==2.43.0
+ google-cloud-aiplatform==1.130.0
+ google-cloud-bigquery==3.38.0
+ google-cloud-core==2.5.0
+ google-cloud-resource-manager==1.15.0
+ google-cloud-storage==3.7.0
+ google-crc32c==1.7.1
+ google-genai==1.55.0
+ google-resumable-media==2.8.0
+ googleapis-common-protos==1.72.0
+ greenlet==3.3.0
+ grpc-google-iam-v1==0.14.3
+ grpcio==1.76.0
+ grpcio-status==1.76.0
+ hf_transfer==0.1.9
+ hf-xet==1.2.0
+ hydra-core==1.3.2
+ importlib_metadata==8.7.0
+ interegular==0.3.3
+ invoke==2.2.1
+ jiter==0.12.0
+ joblib==1.5.2
+ jsonlines==4.0.0
+ lark==1.3.1
+ latex2sympy2==1.5.4
+ latex2sympy2_extended==1.10.2
+ libtmux==0.52.1
+ llguidance==0.7.30
+ loguru==0.7.3
+ lxml==6.0.2
+ math-verify==0.8.0
+ modelscope==1.33.0
+ msgpack==1.1.2
+ msgspec==0.20.0
+ multidict==6.7.0
+ multiprocess==0.70.18
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-ml-py==13.590.44
+ omegaconf==2.3.0
+ openai==2.11.0
+ opencensus==0.11.4
+ opencensus-context==0.1.3
+ opentelemetry-api==1.39.1
+ opentelemetry-exporter-prometheus==0.60b1
+ opentelemetry-proto==1.39.1
+ opentelemetry-sdk==1.39.1
+ opentelemetry-semantic-conventions==0.60b1
+ orjson==3.11.5
+ outlines==0.1.11
+ outlines_core==0.1.26
+ packaging==25.0
+ pandas==2.3.3
+ parallel-ssh==2.16.0.post1
+ paramiko==4.0.0
+ partial-json-parser==0.2.1.1.post7
+ peft==0.18.0
+ propcache==0.4.1
+ proto-plus==1.26.1
+ protobuf==6.33.2
+ py-spy==0.4.1
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.2
+ pybind11==3.0.1
+ pycountry==24.6.1
+ pycryptodomex==3.23.0
+ pydantic==2.12.5
+ pydantic_core==2.41.5
+ pylatexenc==2.10
+ PyNaCl==1.6.1
+ pynvml==13.0.1
+ python-multipart==0.0.20
+ ray==2.52.1
+ regex==2025.11.3
+ rsa==4.9.1
+ scikit-learn==1.8.0
+ sentence-transformers==5.2.0
+ sentry-sdk==2.47.0
+ setproctitle==1.3.7
+ sgl-kernel==0.1.4
+ sglang==0.4.6.post5
+ shapely==2.1.2
+ smart_open==7.5.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soundfile==0.13.1
+ ssh2-python==1.2.0.post1
+ ssh-python==1.2.0.post1
+ starlette==0.50.0
+ tabulate==0.9.0
+ tenacity==9.1.2
+ tensorboardX==2.6.4
+ tensordict==0.6.2
+ termcolor==3.2.0
+ threadpoolctl==3.6.0
+ tiktoken==0.12.0
+ timeout-decorator==0.5.0
+ tmuxp==1.61.0
+ tokenizers==0.21.4
+ torch_memory_saver==0.0.9
+ torchao==0.9.0
+ torchdata==0.11.0
+ typing-inspection==0.4.2
+ uvicorn==0.38.0
+ uvloop==0.22.1
+ virtualenv==20.35.4
+ wandb==0.23.1
+ websockets==15.0.1
+ word2number==1.1
+ wrapt==2.0.1
+ xgrammar==0.1.19
+ xxhash==3.6.0
+ yarl==1.22.0
+ zope.event==6.1
+ zope.interface==8.1.1
+ cachetools==6.2.3
+ dill==0.4.0
+ inflect==7.5.0
+ lazy_loader==0.4
+ rp==0.1.1333
+ stackprinter==0.2.12
+ typeguard==4.4.4
+ typing_extensions==4.15.0
+ asciinema==2.4.0
+ einops==0.8.1
+ Send2Trash==1.8.3
+ anyio==4.12.0
+ argon2-cffi==25.1.0
+ argon2-cffi-bindings==25.1.0
+ arrow==1.4.0
+ async-lru==2.0.5
+ babel==2.17.0
+ bleach==6.3.0
+ comm==0.2.3
+ debugpy==1.8.18
+ defusedxml==0.7.1
+ fastjsonschema==2.21.2
+ fqdn==1.5.1
+ h11==0.16.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ ipykernel==7.1.0
+ isoduration==20.11.0
+ json5==0.12.1
+ jupyter_client==8.7.0
+ jupyter_core==5.9.1
+ jupyter-events==0.12.0
+ jupyter-lsp==2.3.0
+ jupyter_server==2.17.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.5.0
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.28.0
+ mistune==3.1.4
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ notebook_shim==0.2.4
+ overrides==7.7.0
+ pandocfilters==1.5.1
+ prometheus_client==0.23.1
+ python-dateutil==2.9.0.post0
+ python-json-logger==4.0.0
+ pyzmq==27.1.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ terminado==0.18.1
+ tinycss2==1.4.0
+ tornado==6.5.3
+ tzdata==2025.3
+ uri-template==1.3.0
+ webcolors==25.10.0
+ webencodings==0.5.1
+ websocket-client==1.9.0
+ autocommand==2.2.2
+ backports.tarfile==1.2.0
+ importlib_metadata==8.0.0
+ inflect==7.3.1
+ jaraco.collections==5.1.0
+ jaraco.context==5.3.0
+ jaraco.functools==4.0.1
+ jaraco.text==3.12.1
+ more-itertools==10.3.0
+ packaging==24.2
+ platformdirs==4.2.2
+ tomli==2.0.1
+ typeguard==4.3.0
+ typing_extensions==4.12.2
+ wheel==0.45.1
+ zipp==3.19.2
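
Each offline run directory snapshots its own files/requirements.txt, so the same 354-line environment list recurs once per run in this commit (the 09:41:58 run further down carries an apparently identical copy). A quick way to confirm two snapshots match byte-for-byte; the paths are the run directories listed in this commit:

import filecmp

a = "wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt"
b = "wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt"
# shallow=False forces a content comparison instead of an os.stat() check.
print(filecmp.cmp(a, b, shallow=False))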
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
+ {"time":"2026-01-04T09:32:17.157035079Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmphkc9n1om/port-20932.txt","pid":20932,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+ {"time":"2026-01-04T09:32:17.158915538Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":20932}
+ {"time":"2026-01-04T09:32:17.158917928Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-20932-21162-3167463651/socket","Net":"unix"}}
+ {"time":"2026-01-04T09:32:17.342986695Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+ {"time":"2026-01-04T09:32:17.351333861Z","level":"INFO","msg":"handleInformInit: received","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:32:17.578454587Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:39:28.832808185Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2026-01-04T09:32:17.359914174Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+ {"time":"2026-01-04T09:32:17.578184558Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+ {"time":"2026-01-04T09:32:17.578260453Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.578444715Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.578428925Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.578480061Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.578493724Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.579316636Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log ADDED
@@ -0,0 +1,24 @@
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Configure stats pid to 20932
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Loading settings from environment variables
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_init.py:init():841] calling init triggers
+ 2026-01-04 09:32:16,950 INFO MainThread:20932 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+ config: {'_wandb': {}}
+ 2026-01-04 09:32:16,950 INFO MainThread:20932 [wandb_init.py:init():889] starting backend
+ 2026-01-04 09:32:17,342 INFO MainThread:20932 [wandb_init.py:init():892] sending inform_init request
+ 2026-01-04 09:32:17,349 INFO MainThread:20932 [wandb_init.py:init():900] backend started and connected
+ 2026-01-04 09:32:17,351 INFO MainThread:20932 [wandb_init.py:init():970] updated telemetry
+ 2026-01-04 09:32:17,359 INFO MainThread:20932 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+ 2026-01-04 09:32:17,581 INFO MainThread:20932 [wandb_init.py:init():1041] starting run threads in backend
+ 2026-01-04 09:32:17,942 INFO MainThread:20932 [wandb_run.py:_console_start():2521] atexit reg
+ 2026-01-04 09:32:17,942 INFO MainThread:20932 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+ 2026-01-04 09:32:17,942 INFO MainThread:20932 [wandb_run.py:_redirect():2438] Wrapping output streams.
+ 2026-01-04 09:32:17,942 INFO MainThread:20932 [wandb_run.py:_redirect():2461] Redirects installed.
+ 2026-01-04 09:32:17,945 INFO MainThread:20932 [wandb_init.py:init():1081] run started, returning control to user process
+ 2026-01-04 09:32:17,947 INFO MainThread:20932 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 10, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
+ 2026-01-04 09:32:17,948 INFO MainThread:20932 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
+ 2026-01-04 09:32:17,949 INFO MainThread:20932 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7cb17e00f6acbbd7e6d5a4de0ff7353f427c0b6a094cae28f28c4fdb6148f7e
+ size 131072
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt ADDED
@@ -0,0 +1,354 @@
+ Brotli==1.1.0
+ MarkupSafe==3.0.2
+ PySocks==1.7.1
+ PyYAML==6.0.2
+ archspec==0.2.3
+ asttokens==2.4.1
+ astunparse==1.6.3
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ boltons==24.0.0
+ certifi==2024.8.30
+ chardet==5.2.0
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ conda==24.9.2
+ conda-build==24.9.0
+ conda_index==0.5.0
+ conda-libmamba-solver==24.9.0
+ conda-package-handling==2.4.0
+ conda_package_streaming==0.11.0
+ decorator==5.1.1
+ distro==1.9.0
+ dnspython==2.7.0
+ exceptiongroup==1.2.2
+ executing==2.1.0
+ expecttest==0.2.1
+ filelock==3.16.1
+ frozendict==2.4.6
+ fsspec==2024.10.0
+ h2==4.1.0
+ hpack==4.0.0
+ hyperframe==6.0.1
+ hypothesis==6.115.5
+ idna==3.10
+ importlib_resources==6.4.5
+ ipython==8.29.0
+ jedi==0.19.1
+ Jinja2==3.1.4
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ libarchive-c==5.1
+ libmambapy==1.5.10
+ lief==0.14.1
+ lintrunner==0.12.5
+ mamba==1.5.10
+ matplotlib-inline==0.1.7
+ menuinst==2.1.2
+ more-itertools==10.5.0
+ mpmath==1.3.0
+ networkx==3.4.2
+ ninja==1.11.1.1
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ optree==0.13.0
+ parso==0.8.4
+ pexpect==4.9.0
+ pickleshare==0.7.5
+ pillow==10.2.0
+ pkginfo==1.11.2
+ pkgutil_resolve_name==1.3.10
+ platformdirs==4.3.6
+ pluggy==1.5.0
+ prompt_toolkit==3.0.48
+ psutil==6.1.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pycosat==0.6.6
+ pycparser==2.22
+ Pygments==2.18.0
+ python-etcd==0.4.5
+ pytz==2024.2
+ referencing==0.35.1
+ requests==2.32.3
+ rpds-py==0.20.0
+ ruamel.yaml==0.18.6
+ ruamel.yaml.clib==0.2.8
+ six==1.16.0
+ sortedcontainers==2.4.0
+ soupsieve==2.5
+ stack-data==0.6.2
+ sympy==1.13.1
+ torchaudio==2.5.1+cu124
+ torchelastic==0.2.2
+ tqdm==4.66.5
+ traitlets==5.14.3
+ triton==3.1.0
+ truststore==0.9.2
+ types-dataclasses==0.6.6
+ urllib3==2.2.3
+ wcwidth==0.2.13
+ zipp==3.20.2
+ zstandard==0.23.0
+ numpy==1.24.4
+ imgcat==0.6.0
+ decord==0.6.0
+ flash_attn==2.5.8
+ contourpy==1.3.2
+ cycler==0.12.1
+ fonttools==4.61.1
+ huggingface-hub==0.29.1
+ kiwisolver==1.4.9
+ matplotlib==3.7.0
+ opencv-python==4.7.0.72
+ pyarrow==11.0.0
+ pyparsing==3.2.5
+ safetensors==0.4.5
+ scipy==1.10.1
+ sentencepiece==0.1.99
+ torch==2.5.1
+ torchvision==0.20.1
+ transformers==4.49.0
+ pip==25.3
+ setuptools==80.9.0
+ wheel==0.45.1
+ Pebble==5.1.3
+ accelerate==1.12.0
+ addftool==0.2.13
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.13.2
+ aiohttp-cors==0.8.1
+ aiosignal==1.4.0
+ airportsdata==20250909
+ annotated-doc==0.0.4
+ annotated-types==0.7.0
+ antlr4-python3-runtime==4.9.3
+ bcrypt==5.0.0
+ blobfile==3.0.0
+ cffi==2.0.0
+ cloudpickle==3.1.2
+ codetiming==1.4.0
+ colorful==0.5.8
+ compressed-tensors==0.12.2
+ cryptography==46.0.3
+ cuda-bindings==13.1.1
+ cuda-pathfinder==1.3.3
+ cuda-python==13.1.1
+ datasets==4.4.1
+ Deprecated==1.3.1
+ diskcache==5.6.3
+ distlib==0.4.0
+ docstring_parser==0.17.0
+ easydict==1.13
+ fabric==3.2.2
+ fastapi==0.124.4
+ fire==0.7.1
+ flashinfer-python==0.2.5
+ frozenlist==1.8.0
+ gevent==25.9.1
+ gitdb==4.0.12
+ GitPython==3.1.45
+ google-api-core==2.28.1
+ google-auth==2.43.0
+ google-cloud-aiplatform==1.130.0
+ google-cloud-bigquery==3.38.0
+ google-cloud-core==2.5.0
+ google-cloud-resource-manager==1.15.0
+ google-cloud-storage==3.7.0
+ google-crc32c==1.7.1
+ google-genai==1.55.0
+ google-resumable-media==2.8.0
+ googleapis-common-protos==1.72.0
+ greenlet==3.3.0
+ grpc-google-iam-v1==0.14.3
+ grpcio==1.76.0
+ grpcio-status==1.76.0
+ hf_transfer==0.1.9
+ hf-xet==1.2.0
+ hydra-core==1.3.2
+ importlib_metadata==8.7.0
+ interegular==0.3.3
+ invoke==2.2.1
+ jiter==0.12.0
+ joblib==1.5.2
+ jsonlines==4.0.0
+ lark==1.3.1
+ latex2sympy2==1.5.4
+ latex2sympy2_extended==1.10.2
+ libtmux==0.52.1
+ llguidance==0.7.30
+ loguru==0.7.3
+ lxml==6.0.2
+ math-verify==0.8.0
+ modelscope==1.33.0
+ msgpack==1.1.2
+ msgspec==0.20.0
+ multidict==6.7.0
+ multiprocess==0.70.18
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-ml-py==13.590.44
+ omegaconf==2.3.0
+ openai==2.11.0
+ opencensus==0.11.4
+ opencensus-context==0.1.3
+ opentelemetry-api==1.39.1
+ opentelemetry-exporter-prometheus==0.60b1
+ opentelemetry-proto==1.39.1
+ opentelemetry-sdk==1.39.1
+ opentelemetry-semantic-conventions==0.60b1
+ orjson==3.11.5
+ outlines==0.1.11
+ outlines_core==0.1.26
+ packaging==25.0
+ pandas==2.3.3
+ parallel-ssh==2.16.0.post1
+ paramiko==4.0.0
+ partial-json-parser==0.2.1.1.post7
+ peft==0.18.0
+ propcache==0.4.1
+ proto-plus==1.26.1
+ protobuf==6.33.2
+ py-spy==0.4.1
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.2
+ pybind11==3.0.1
+ pycountry==24.6.1
+ pycryptodomex==3.23.0
+ pydantic==2.12.5
+ pydantic_core==2.41.5
+ pylatexenc==2.10
+ PyNaCl==1.6.1
+ pynvml==13.0.1
+ python-multipart==0.0.20
+ ray==2.52.1
+ regex==2025.11.3
+ rsa==4.9.1
+ scikit-learn==1.8.0
+ sentence-transformers==5.2.0
+ sentry-sdk==2.47.0
+ setproctitle==1.3.7
+ sgl-kernel==0.1.4
+ sglang==0.4.6.post5
+ shapely==2.1.2
+ smart_open==7.5.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soundfile==0.13.1
+ ssh2-python==1.2.0.post1
+ ssh-python==1.2.0.post1
+ starlette==0.50.0
+ tabulate==0.9.0
+ tenacity==9.1.2
+ tensorboardX==2.6.4
+ tensordict==0.6.2
+ termcolor==3.2.0
+ threadpoolctl==3.6.0
+ tiktoken==0.12.0
+ timeout-decorator==0.5.0
+ tmuxp==1.61.0
+ tokenizers==0.21.4
+ torch_memory_saver==0.0.9
+ torchao==0.9.0
+ torchdata==0.11.0
+ typing-inspection==0.4.2
+ uvicorn==0.38.0
+ uvloop==0.22.1
+ virtualenv==20.35.4
+ wandb==0.23.1
+ websockets==15.0.1
+ word2number==1.1
+ wrapt==2.0.1
+ xgrammar==0.1.19
+ xxhash==3.6.0
+ yarl==1.22.0
+ zope.event==6.1
+ zope.interface==8.1.1
+ cachetools==6.2.3
+ dill==0.4.0
+ inflect==7.5.0
+ lazy_loader==0.4
+ rp==0.1.1333
+ stackprinter==0.2.12
+ typeguard==4.4.4
+ typing_extensions==4.15.0
+ asciinema==2.4.0
+ einops==0.8.1
+ Send2Trash==1.8.3
+ anyio==4.12.0
+ argon2-cffi==25.1.0
+ argon2-cffi-bindings==25.1.0
+ arrow==1.4.0
+ async-lru==2.0.5
+ babel==2.17.0
+ bleach==6.3.0
+ comm==0.2.3
+ debugpy==1.8.18
+ defusedxml==0.7.1
+ fastjsonschema==2.21.2
+ fqdn==1.5.1
+ h11==0.16.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ ipykernel==7.1.0
+ isoduration==20.11.0
+ json5==0.12.1
+ jupyter_client==8.7.0
+ jupyter_core==5.9.1
+ jupyter-events==0.12.0
+ jupyter-lsp==2.3.0
+ jupyter_server==2.17.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.5.0
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.28.0
+ mistune==3.1.4
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ notebook_shim==0.2.4
+ overrides==7.7.0
+ pandocfilters==1.5.1
+ prometheus_client==0.23.1
+ python-dateutil==2.9.0.post0
+ python-json-logger==4.0.0
+ pyzmq==27.1.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ terminado==0.18.1
+ tinycss2==1.4.0
+ tornado==6.5.3
+ tzdata==2025.3
+ uri-template==1.3.0
+ webcolors==25.10.0
+ webencodings==0.5.1
+ websocket-client==1.9.0
+ autocommand==2.2.2
+ backports.tarfile==1.2.0
+ importlib_metadata==8.0.0
+ inflect==7.3.1
+ jaraco.collections==5.1.0
+ jaraco.context==5.3.0
+ jaraco.functools==4.0.1
+ jaraco.text==3.12.1
+ more-itertools==10.3.0
+ packaging==24.2
+ platformdirs==4.2.2
+ tomli==2.0.1
+ typeguard==4.3.0
+ typing_extensions==4.12.2
+ wheel==0.45.1
+ zipp==3.19.2
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
+ {"time":"2026-01-04T09:41:58.611687494Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpml8qx73j/port-49730.txt","pid":49730,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+ {"time":"2026-01-04T09:41:58.613969345Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":49730}
+ {"time":"2026-01-04T09:41:58.613955135Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-49730-49896-3226128572/socket","Net":"unix"}}
+ {"time":"2026-01-04T09:41:58.796729122Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+ {"time":"2026-01-04T09:41:58.807276343Z","level":"INFO","msg":"handleInformInit: received","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:41:59.539163963Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:48:17.195652917Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2026-01-04T09:41:58.817034119Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+ {"time":"2026-01-04T09:41:59.538864599Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+ {"time":"2026-01-04T09:41:59.538957051Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539033043Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539155721Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539184904Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539218617Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539987313Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log ADDED
@@ -0,0 +1,24 @@
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Configure stats pid to 49730
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from environment variables
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():841] calling init triggers
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+ config: {'_wandb': {}}
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():889] starting backend
+ 2026-01-04 09:41:58,797 INFO MainThread:49730 [wandb_init.py:init():892] sending inform_init request
+ 2026-01-04 09:41:58,805 INFO MainThread:49730 [wandb_init.py:init():900] backend started and connected
+ 2026-01-04 09:41:58,807 INFO MainThread:49730 [wandb_init.py:init():970] updated telemetry
+ 2026-01-04 09:41:58,816 INFO MainThread:49730 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+ 2026-01-04 09:41:59,542 INFO MainThread:49730 [wandb_init.py:init():1041] starting run threads in backend
+ 2026-01-04 09:41:59,917 INFO MainThread:49730 [wandb_run.py:_console_start():2521] atexit reg
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2438] Wrapping output streams.
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2461] Redirects installed.
+ 2026-01-04 09:41:59,921 INFO MainThread:49730 [wandb_init.py:init():1081] run started, returning control to user process
+ 2026-01-04 09:41:59,922 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 10, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
+ 2026-01-04 09:41:59,923 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
+ 2026-01-04 09:41:59,924 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fea8db2c722c608b691a06b3e692272a70eb2fb6d6fdaddefd3d9b5b178335ba
+ size 131072
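
The runs in this upload were created with wandb_offline True (see the config_cb entries above), so nothing reached the W&B server at train time; the .wandb files are purely local transaction logs. If these runs ever need to appear in the hosted UI, the standard route is the CLI's sync command; a sketch, using a run directory listed above and assuming a prior `wandb login`:

import subprocess

# `wandb sync <run-dir>` uploads an offline run's transaction log after the fact.
subprocess.run(
    ["wandb", "sync",
     "wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"],
    check=True,
)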