Junyi42 committed on
Commit 802680b · verified · Parent: 2f3bbdc

Upload checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test

Files changed (21)
  1. .gitattributes +2 -0
  2. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/debug-internal.log +8 -8
  3. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/debug.log +23 -23
  4. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/config.yaml +1 -0
  5. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log +174 -200
  6. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/config.yaml +437 -0
  7. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log +601 -0
  8. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/wandb-metadata.json +1 -0
  9. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/wandb-summary.json +1 -0
  10. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log +1 -0
  11. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb +2 -2
  12. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt +354 -0
  13. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log +7 -0
  14. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log +8 -0
  15. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log +24 -0
  16. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb +3 -0
  17. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt +354 -0
  18. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log +7 -0
  19. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log +8 -0
  20. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log +24 -0
  21. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb +3 -0
.gitattributes CHANGED
@@ -210,3 +210,5 @@ checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_ji
  checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse/wandb/offline-run-20260104_091756-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed-run0/run-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_no_mse_ema993_hashed-run0.wandb filter=lfs diff=lfs merge=lfs -text
  checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb filter=lfs diff=lfs merge=lfs -text
  checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce/wandb/offline-run-20260104_093254-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema993_hashed-run0/run-checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_ce_ema993_hashed-run0.wandb filter=lfs diff=lfs merge=lfs -text
+ checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb filter=lfs diff=lfs merge=lfs -text
+ checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb filter=lfs diff=lfs merge=lfs -text
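Note: the two added rules route the new run-*.wandb binaries through Git LFS (filter=lfs diff=lfs merge=lfs) and mark them as binary (-text). A minimal sketch of how such rules can be generated for every large .wandb file under a tree; the 10 MB cutoff and the helper name are assumptions for illustration, not part of this repo:

from pathlib import Path

LFS_RULE = "{path} filter=lfs diff=lfs merge=lfs -text"
SIZE_CUTOFF = 10 * 1024 * 1024  # assumed 10 MB threshold, pick your own

def lfs_rules(root: str, pattern: str = "**/*.wandb") -> list[str]:
    """One .gitattributes line per matching file above the cutoff."""
    return [
        LFS_RULE.format(path=f.as_posix())
        for f in sorted(Path(root).glob(pattern))
        if f.is_file() and f.stat().st_size >= SIZE_CUTOFF
    ]

if __name__ == "__main__":
    print("\n".join(lfs_rules(".")))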
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/debug-internal.log CHANGED
@@ -1,8 +1,8 @@
- {"time":"2026-01-04T09:04:29.920066674Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
- {"time":"2026-01-04T09:04:30.241997449Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
- {"time":"2026-01-04T09:04:30.242081506Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.242332296Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.242363843Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.242420429Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.242455Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
- {"time":"2026-01-04T09:04:30.243628157Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
+ {"time":"2026-01-04T09:41:58.817034119Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+ {"time":"2026-01-04T09:41:59.538864599Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+ {"time":"2026-01-04T09:41:59.538957051Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539033043Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539155721Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539184904Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539218617Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539987313Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/debug.log CHANGED
@@ -1,24 +1,24 @@
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Configure stats pid to 13230
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_setup.py:_flush():80] Loading settings from environment variables
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:init():841] calling init triggers
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Configure stats pid to 49730
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from environment variables
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():841] calling init triggers
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
  config: {'_wandb': {}}
- 2026-01-04 09:04:29,300 INFO MainThread:13230 [wandb_init.py:init():889] starting backend
- 2026-01-04 09:04:29,745 INFO MainThread:13230 [wandb_init.py:init():892] sending inform_init request
- 2026-01-04 09:04:29,753 INFO MainThread:13230 [wandb_init.py:init():900] backend started and connected
- 2026-01-04 09:04:29,755 INFO MainThread:13230 [wandb_init.py:init():970] updated telemetry
- 2026-01-04 09:04:29,763 INFO MainThread:13230 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
- 2026-01-04 09:04:30,245 INFO MainThread:13230 [wandb_init.py:init():1041] starting run threads in backend
- 2026-01-04 09:04:30,693 INFO MainThread:13230 [wandb_run.py:_console_start():2521] atexit reg
- 2026-01-04 09:04:30,693 INFO MainThread:13230 [wandb_run.py:_redirect():2369] redirect: wrap_raw
- 2026-01-04 09:04:30,693 INFO MainThread:13230 [wandb_run.py:_redirect():2438] Wrapping output streams.
- 2026-01-04 09:04:30,693 INFO MainThread:13230 [wandb_run.py:_redirect():2461] Redirects installed.
- 2026-01-04 09:04:30,697 INFO MainThread:13230 [wandb_init.py:init():1081] run started, returning control to user process
- 2026-01-04 09:04:30,699 INFO MainThread:13230 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 10, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
- 2026-01-04 09:04:30,700 INFO MainThread:13230 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
- 2026-01-04 09:04:30,700 INFO MainThread:13230 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():889] starting backend
+ 2026-01-04 09:41:58,797 INFO MainThread:49730 [wandb_init.py:init():892] sending inform_init request
+ 2026-01-04 09:41:58,805 INFO MainThread:49730 [wandb_init.py:init():900] backend started and connected
+ 2026-01-04 09:41:58,807 INFO MainThread:49730 [wandb_init.py:init():970] updated telemetry
+ 2026-01-04 09:41:58,816 INFO MainThread:49730 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+ 2026-01-04 09:41:59,542 INFO MainThread:49730 [wandb_init.py:init():1041] starting run threads in backend
+ 2026-01-04 09:41:59,917 INFO MainThread:49730 [wandb_run.py:_console_start():2521] atexit reg
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2438] Wrapping output streams.
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2461] Redirects installed.
+ 2026-01-04 09:41:59,921 INFO MainThread:49730 [wandb_init.py:init():1081] run started, returning control to user process
+ 2026-01-04 09:41:59,922 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 10, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
+ 2026-01-04 09:41:59,923 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
+ 2026-01-04 09:41:59,924 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
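Note: the three config_cb records at the end hold the full run configuration as flat Python dict literals (training, model, and data arguments respectively). A sketch, assuming that log format, of recovering them for programmatic comparison between runs:

import ast

def config_from_debug_log(path: str) -> dict:
    """Merge the dict literals from the config_cb lines into one dict."""
    merged = {}
    with open(path) as fh:
        for line in fh:
            if "config_cb" in line and "{" in line:
                merged.update(ast.literal_eval(line[line.index("{"):].strip()))
    return merged

cfg = config_from_debug_log("wandb/debug.log")
print(cfg["lr"], cfg["total_steps"], cfg["sharding_strategy"])  # 2e-05 5000 HYBRID_SHARD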
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/config.yaml CHANGED
@@ -34,6 +34,7 @@ _wandb:
    - 4
    - 13
    - 14
+   - 37
    - 42
    - 61
    4: 3.11.10
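Note: this hunk adds one feature code (37) to a telemetry list inside the `_wandb` block; judging by the full config.yaml added below, the neighboring `4:` and `5:` keys mirror the Python and CLI versions. The rest of each config.yaml wraps every hyperparameter in a desc/value pair, so it is convenient to flatten it back into a plain dict. A minimal sketch, assuming PyYAML is installed:

import yaml

def flatten_wandb_config(path: str) -> dict:
    """Unwrap the {desc: ..., value: ...} envelopes of a wandb config.yaml."""
    with open(path) as fh:
        raw = yaml.safe_load(fh)
    return {
        key: entry["value"]
        for key, entry in raw.items()
        if isinstance(entry, dict) and "value" in entry and key != "_wandb"
    }

cfg = flatten_wandb_config("files/config.yaml")
print(cfg["lr"], cfg["ema"])  # 2e-05 0.993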
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260103_081257-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log CHANGED
@@ -1,176 +1,3 @@
- FullyShardedDataParallel(
- (_fsdp_wrapped_module): Bagel(
- (language_model): Qwen2ForCausalLM(
- (model): Qwen2Model(
- (embed_tokens): Embedding(152064, 3584)
- (layers): ModuleList(
- (0-27): 28 x FullyShardedDataParallel(
- (_fsdp_wrapped_module): CheckpointWrapper(
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
- (self_attn): PackedAttentionMoT(
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
- )
- (mlp): Qwen2MLP(
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
- (act_fn): SiLU()
- )
- (mlp_moe_gen): Qwen2MLP(
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
- (act_fn): SiLU()
- )
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
- )
- )
- )
- )
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
- (rotary_emb): Qwen2RotaryEmbedding()
- )
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
- )
- (time_embedder): FullyShardedDataParallel(
- (_fsdp_wrapped_module): TimestepEmbedder(
- (mlp): Sequential(
- (0): Linear(in_features=256, out_features=3584, bias=True)
- (1): SiLU()
- (2): Linear(in_features=3584, out_features=3584, bias=True)
- )
- )
- )
- (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
- (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
- (latent_pos_embed): FullyShardedDataParallel(
- (_fsdp_wrapped_module): PositionEmbedding()
- )
- (vit_model): SiglipVisionModel(
- (vision_model): FullyShardedDataParallel(
- (_fsdp_wrapped_module): SiglipVisionTransformer(
- (embeddings): SiglipVisionEmbeddings(
- (position_embedding): Embedding(4900, 1152)
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
- )
- (encoder): SiglipEncoder(
- (layers): ModuleList(
- (0-25): 26 x FullyShardedDataParallel(
- (_fsdp_wrapped_module): CheckpointWrapper(
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
- (self_attn): SiglipFlashAttention2(
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
- )
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
- (mlp): SiglipMLP(
- (activation_fn): PytorchGELUTanh()
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
- )
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
- )
- )
- )
- )
- )
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
- )
- )
- )
- (connector): FullyShardedDataParallel(
- (_fsdp_wrapped_module): CheckpointWrapper(
- (_checkpoint_wrapped_module): MLPconnector(
- (activation_fn): PytorchGELUTanh()
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
- )
- )
- )
- (vit_pos_embed): FullyShardedDataParallel(
- (_fsdp_wrapped_module): PositionEmbedding()
- )
- )
- )
- _flat_param True
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- time_embedder._fsdp_wrapped_module._flat_param True
- latent_pos_embed._fsdp_wrapped_module._flat_param False
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
- vit_pos_embed._fsdp_wrapped_module._flat_param False
- Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
- Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
  wandb: Detected [huggingface_hub.inference] in use.
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -1020,6 +847,179 @@ ImportError: cannot import name 'NaiveCache' from 'modeling.bagel' (/home/cloudu
  [2026-01-03 11:25:55] (step=0000804) Train Loss mse: 0.0543, Train Loss ce: 0.0000, Train Steps/Sec: 0.11,
  [2026-01-03 11:26:08] (step=0000805) Train Loss mse: 0.0459, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
  [2026-01-03 11:26:20] (step=0000806) Train Loss mse: 0.0528, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
+ FullyShardedDataParallel(
+ (_fsdp_wrapped_module): Bagel(
+ (language_model): Qwen2ForCausalLM(
+ (model): Qwen2Model(
+ (embed_tokens): Embedding(152064, 3584)
+ (layers): ModuleList(
+ (0-27): 28 x FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
+ (self_attn): PackedAttentionMoT(
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
+ )
+ (mlp): Qwen2MLP(
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+ (act_fn): SiLU()
+ )
+ (mlp_moe_gen): Qwen2MLP(
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+ (act_fn): SiLU()
+ )
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+ )
+ )
+ )
+ )
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+ (rotary_emb): Qwen2RotaryEmbedding()
+ )
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
+ )
+ (time_embedder): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): TimestepEmbedder(
+ (mlp): Sequential(
+ (0): Linear(in_features=256, out_features=3584, bias=True)
+ (1): SiLU()
+ (2): Linear(in_features=3584, out_features=3584, bias=True)
+ )
+ )
+ )
+ (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
+ (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
+ (latent_pos_embed): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): PositionEmbedding()
+ )
+ (vit_model): SiglipVisionModel(
+ (vision_model): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
+ (embeddings): SiglipVisionEmbeddings(
+ (position_embedding): Embedding(4900, 1152)
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
+ )
+ (encoder): SiglipEncoder(
+ (layers): ModuleList(
+ (0-25): 26 x FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
+ (self_attn): SiglipFlashAttention2(
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+ )
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+ (mlp): SiglipMLP(
+ (activation_fn): PytorchGELUTanh()
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+ )
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+ )
+ )
+ )
+ )
+ )
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+ )
+ )
+ )
+ (connector): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): CheckpointWrapper(
+ (_checkpoint_wrapped_module): MLPconnector(
+ (activation_fn): PytorchGELUTanh()
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
+ )
+ )
+ )
+ (vit_pos_embed): FullyShardedDataParallel(
+ (_fsdp_wrapped_module): PositionEmbedding()
+ )
+ )
+ )
+ _flat_param True
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ time_embedder._fsdp_wrapped_module._flat_param True
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
  [2026-01-03 11:26:33] (step=0000807) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
  [2026-01-03 11:26:50] (step=0000808) Train Loss mse: 0.0267, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
  [2026-01-03 11:27:03] (step=0000809) Train Loss mse: 0.0418, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
@@ -3097,33 +3097,7 @@ ImportError: cannot import name 'NaiveCache' from 'modeling.bagel' (/home/cloudu
  [2026-01-03 19:16:05] (step=0002814) Train Loss mse: 0.0356, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
  [2026-01-03 19:16:18] (step=0002815) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
  [2026-01-03 19:16:34] (step=0002816) Train Loss mse: 0.0326, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:16:48] (step=0002817) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:17:01] (step=0002818) Train Loss mse: 0.0298, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:17:14] (step=0002819) Train Loss mse: 0.0316, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:17:27] (step=0002820) Train Loss mse: 0.0282, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:17:40] (step=0002821) Train Loss mse: 0.0263, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:17:54] (step=0002822) Train Loss mse: 0.0310, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:18:05] (step=0002823) Train Loss mse: 0.0302, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
- [2026-01-03 19:18:17] (step=0002824) Train Loss mse: 0.0385, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:18:33] (step=0002825) Train Loss mse: 0.0330, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:18:46] (step=0002826) Train Loss mse: 0.0317, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:19:02] (step=0002827) Train Loss mse: 0.0244, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:19:15] (step=0002828) Train Loss mse: 0.0418, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:19:29] (step=0002829) Train Loss mse: 0.0240, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:19:40] (step=0002830) Train Loss mse: 0.0307, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
- [2026-01-03 19:19:56] (step=0002831) Train Loss mse: 0.0293, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:20:10] (step=0002832) Train Loss mse: 0.0304, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:20:26] (step=0002833) Train Loss mse: 0.0202, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
- [2026-01-03 19:20:38] (step=0002834) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:20:51] (step=0002835) Train Loss mse: 0.0340, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:21:06] (step=0002836) Train Loss mse: 0.0237, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:21:19] (step=0002837) Train Loss mse: 0.0333, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:21:33] (step=0002838) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:21:47] (step=0002839) Train Loss mse: 0.0301, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:21:59] (step=0002840) Train Loss mse: 0.0299, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
- [2026-01-03 19:22:14] (step=0002841) Train Loss mse: 0.0305, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:22:28] (step=0002842) Train Loss mse: 0.0285, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
- [2026-01-03 19:22:44] (step=0002843) Train Loss mse: 0.0367, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-03 19:16:48
  [2026-01-03 19:22:57] (step=0002844) Train Loss mse: 0.0234, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
  [2026-01-03 19:23:13] (step=0002845) Train Loss mse: 0.0283, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
  [2026-01-03 19:23:29] (step=0002846) Train Loss mse: 0.0367, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
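Note: the block removed by the first hunk (and re-added further down in the file by the second) is a printout of the FSDP-wrapped Bagel model, followed by one line per FSDP flat parameter with its requires_grad flag; False marks frozen groups such as the two PositionEmbedding wrappers. A hypothetical reconstruction of how such a dump can be produced (not taken from the repo's code):

def dump_fsdp_model(model) -> None:
    """Print the wrapped module tree, then each flattened FSDP parameter
    with its trainability flag (the True/False column in the log)."""
    print(model)
    for name, param in model.named_parameters():
        if name.endswith("_flat_param"):
            print(name, param.requires_grad)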
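The per-step lines in this log follow a fixed format, so the loss curve can be recovered from output.log directly; a sketch assuming exactly that format:

import re

STEP_RE = re.compile(r"\(step=(\d+)\) Train Loss mse: ([\d.]+), Train Loss ce: ([\d.]+)")

def mse_by_step(path: str) -> dict[int, float]:
    """Map step number -> MSE loss, parsed from the training log."""
    losses = {}
    with open(path) as fh:
        for line in fh:
            m = STEP_RE.search(line)
            if m:
                losses[int(m.group(1))] = float(m.group(2))
    return losses

losses = mse_by_step("files/output.log")
last = [v for _, v in sorted(losses.items())[-100:]]
print(f"mean MSE over last {len(last)} logged steps: {sum(last) / len(last):.4f}")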
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/config.yaml ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ wandb_version: 1
+
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.11.10
+     cli_version: 0.23.1
+     framework: huggingface
+     huggingface_version: 4.49.0
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1767517469
+     t:
+       1:
+       - 1
+       - 5
+       - 11
+       - 41
+       - 49
+       - 53
+       - 71
+       - 105
+       2:
+       - 1
+       - 5
+       - 11
+       - 41
+       - 49
+       - 53
+       - 71
+       - 105
+       3:
+       - 4
+       - 13
+       - 14
+       - 37
+       - 42
+       - 61
+       4: 3.11.10
+       5: 0.23.1
+       6: 4.49.0
+       13: linux-x86_64
+     e:
+       c4f1w52emnh3bkfwabjlnv9ozcfaekz0:
+         os: Linux-6.6.93+-x86_64-with-glibc2.35
+         python: CPython 3.11.10
+         started_at: '2026-01-04T09:04:29.298919Z'
+         args:
+         - --dataset_config_file
+         - ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+         - --eval_dataset_config_file
+         - ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+         - --viz_dataset_config_file
+         - ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+         - --inference_hash_file
+         - /home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json
+         - --train_data_dir
+         - /home/clouduser/Code/data/gym/jigsaw-swap_v5/train/
+         - --train_jsonl_path
+         - /home/clouduser/Code/data/gym/jigsaw-swap_v5/train/
+         - --eval_data_dir
+         - /home/clouduser/Code/data/gym/jigsaw-swap_v5/val/
+         - --eval_jsonl_path
+         - /home/clouduser/Code/data/gym/jigsaw-swap_v5/val/
+         - --model_path
+         - /home/clouduser/Code/Models/BAGEL-7B-MoT
+         - --layer_module
+         - Qwen2MoTDecoderLayer
+         - --max_latent_size
+         - '64'
+         - --resume-from
+         - /home/clouduser/Code/Models/BAGEL-7B-MoT
+         - --finetune_from_hf
+         - 'True'
+         - --auto_resume
+         - 'False'
+         - --resume-model-only
+         - 'True'
+         - --finetune-from-ema
+         - 'True'
+         - --log_every
+         - '1'
+         - --lr
+         - 2e-5
+         - --warmup_steps
+         - '300'
+         - --lr_scheduler
+         - cosine
+         - --num_worker
+         - '1'
+         - --expected_num_tokens
+         - '20000'
+         - --max_num_tokens
+         - '20000'
+         - --max_num_tokens_per_sample
+         - '20000'
+         - --visual_und
+         - 'True'
+         - --save_every
+         - '2500'
+         - --total_steps
+         - '5000'
+         - --text_cond_dropout_prob
+         - '0.0'
+         - --vae_cond_dropout_prob
+         - '0.0'
+         - --vit_cond_dropout_prob
+         - '0.0'
+         - --checkpoint_dir
+         - /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+         - --wandb_project
+         - bagel
+         - --wandb_name
+         - vlm_gym_jigsaw_one_img_lr2e_5_mse_only
+         - --wandb_dir
+         - /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+         - --wandb_offline
+         - 'True'
+         - --viz_every
+         - '10'
+         program: /home/clouduser/Code/Github/unified_world_model/train/pretrain_unified_navit.py
+         code_path: train/pretrain_unified_navit.py
+         code_path_local: train/pretrain_unified_navit.py
+         git:
+           remote_url: https://github.com/para-lost/unified_world_model
+           commit: be2c19982b710041da81a85f55c2877ea0e2e2c6
+         root: /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+         host: junyizhang-launch-new-219635706-1-0
+         executable: /opt/conda/bin/python3.11
+         cpu_count: 48
+         cpu_count_logical: 96
+         gpu_type: NVIDIA A100-SXM4-80GB
+         gpu_count: 8
+         disk:
+           /:
+             total: '1052461830144'
+             used: '261623291904'
+         memory:
+           total: '1437332611072'
+         gpu_nvidia:
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-71d2c6f0-c9e9-2110-f69b-f7fd558363b2
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-a25620fe-6924-8936-d5a8-9dfb1c7177e8
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-2b69635d-5f31-ec83-bcde-b1df07b60307
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-4469aac3-d575-de3d-8715-1c34b68c640b
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-da768b0d-e500-f726-164e-2e2379616f19
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-ae036ce5-57c1-a8df-01b2-21cf23bc619b
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-d1ab738b-49ca-ed1f-6700-5336be458e1f
+         - name: NVIDIA A100-SXM4-80GB
+           memory_total: '85899345920'
+           cuda_cores: 6912
+           architecture: Ampere
+           uuid: GPU-cb0ff8e1-e17f-38b9-bc14-4dcc9465b322
+         cuda_version: '12.2'
+         writer_id: c4f1w52emnh3bkfwabjlnv9ozcfaekz0
+ visual_gen:
+   desc: null
+   value: true
+ visual_und:
+   desc: null
+   value: true
+ results_dir:
+   desc: null
+   value: results
+ checkpoint_dir:
+   desc: null
+   value: /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+ wandb_project:
+   desc: null
+   value: bagel
+ wandb_name:
+   desc: null
+   value: vlm_gym_jigsaw_one_img_lr2e_5_mse_only
+ wandb_runid:
+   desc: null
+   value: '0'
+ wandb_resume:
+   desc: null
+   value: allow
+ wandb_offline:
+   desc: null
+   value: true
+ wandb_dir:
+   desc: null
+   value: /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test
+ global_seed:
+   desc: null
+   value: 4396
+ auto_resume:
+   desc: null
+   value: false
+ resume_from:
+   desc: null
+   value: /home/clouduser/Code/Models/BAGEL-7B-MoT
+ resume_model_only:
+   desc: null
+   value: true
+ finetune_from_ema:
+   desc: null
+   value: true
+ finetune_from_hf:
+   desc: null
+   value: true
+ log_every:
+   desc: null
+   value: 1
+ save_every:
+   desc: null
+   value: 2500
+ total_steps:
+   desc: null
+   value: 5000
+ warmup_steps:
+   desc: null
+   value: 300
+ lr_scheduler:
+   desc: null
+   value: cosine
+ lr:
+   desc: null
+   value: 2.0e-05
+ min_lr:
+   desc: null
+   value: 1.0e-07
+ beta1:
+   desc: null
+   value: 0.9
+ beta2:
+   desc: null
+   value: 0.95
+ eps:
+   desc: null
+   value: 1.0e-15
+ ema:
+   desc: null
+   value: 0.993
+ max_grad_norm:
+   desc: null
+   value: 1.0
+ timestep_shift:
+   desc: null
+   value: 1.0
+ mse_weight:
+   desc: null
+   value: 1.0
+ ce_weight:
+   desc: null
+   value: 1.0
+ ce_loss_reweighting:
+   desc: null
+   value: false
+ expected_num_tokens:
+   desc: null
+   value: 20000
+ num_replicate:
+   desc: null
+   value: 1
+ num_shard:
+   desc: null
+   value: 8
+ sharding_strategy:
+   desc: null
+   value: HYBRID_SHARD
+ backward_prefetch:
+   desc: null
+   value: BACKWARD_PRE
+ cpu_offload:
+   desc: null
+   value: false
+ freeze_llm:
+   desc: null
+   value: false
+ freeze_vit:
+   desc: null
+   value: false
+ freeze_vae:
+   desc: null
+   value: true
+ freeze_und:
+   desc: null
+   value: false
+ copy_init_moe:
+   desc: null
+   value: true
+ use_flex:
+   desc: null
+   value: false
+ eval_every:
+   desc: null
+   value: 500
+ num_eval_batches:
+   desc: null
+   value: 20
+ use_ema_for_eval:
+   desc: null
+   value: true
+ viz_every:
+   desc: null
+   value: 10
+ viz_n:
+   desc: null
+   value: 8
+ viz_outdir:
+   desc: null
+   value: results/viz
+ eval_dataset_config_file:
+   desc: null
+   value: ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+ viz_dataset_config_file:
+   desc: null
+   value: ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+ save_ema_only:
+   desc: null
+   value: true
+ save_optimizer:
+   desc: null
+   value: false
+ model_path:
+   desc: null
+   value: /home/clouduser/Code/Models/BAGEL-7B-MoT
+ llm_path:
+   desc: null
+   value: hf/Qwen2.5-0.5B-Instruct/
+ llm_qk_norm:
+   desc: null
+   value: true
+ tie_word_embeddings:
+   desc: null
+   value: false
+ layer_module:
+   desc: null
+   value: Qwen2MoTDecoderLayer
+ vae_path:
+   desc: null
+   value: flux/vae/ae.safetensors
+ vit_path:
+   desc: null
+   value: hf/siglip-so400m-14-980-flash-attn2-navit/
+ max_latent_size:
+   desc: null
+   value: 64
+ latent_patch_size:
+   desc: null
+   value: 2
+ vit_patch_size:
+   desc: null
+   value: 14
+ vit_max_num_patch_per_side:
+   desc: null
+   value: 70
+ connector_act:
+   desc: null
+   value: gelu_pytorch_tanh
+ interpolate_pos:
+   desc: null
+   value: false
+ vit_select_layer:
+   desc: null
+   value: -2
+ vit_rope:
+   desc: null
+   value: false
+ text_cond_dropout_prob:
+   desc: null
+   value: 0.0
+ vae_cond_dropout_prob:
+   desc: null
+   value: 0.0
+ vit_cond_dropout_prob:
+   desc: null
+   value: 0.0
+ dataset_config_file:
+   desc: null
+   value: ./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml
+ train_data_dir:
+   desc: null
+   value: /home/clouduser/Code/data/gym/jigsaw-swap_v5/train/
+ train_jsonl_path:
+   desc: null
+   value: /home/clouduser/Code/data/gym/jigsaw-swap_v5/train/
+ eval_data_dir:
+   desc: null
+   value: /home/clouduser/Code/data/gym/jigsaw-swap_v5/val/
+ eval_jsonl_path:
+   desc: null
+   value: /home/clouduser/Code/data/gym/jigsaw-swap_v5/val/
+ inference_hash_file:
+   desc: null
+   value: /home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json
+ prefetch_factor:
+   desc: null
+   value: 2
+ num_workers:
+   desc: null
+   value: 1
+ max_num_tokens_per_sample:
+   desc: null
+   value: 20000
+ max_num_tokens:
+   desc: null
+   value: 20000
+ prefer_buffer_before:
+   desc: null
+   value: 16384
+ max_buffer_size:
+   desc: null
+   value: 50
+ data_seed:
+   desc: null
+   value: 42
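The run settings above can be recovered programmatically from this run's files/config.yaml. A minimal sketch, assuming only wandb's one-level {desc, value} layout shown above; the file path and the printed keys are illustrative, not part of the repo:

    # Flatten wandb's config.yaml (each key stored as {desc, value}) into a plain dict.
    import yaml

    def load_wandb_config(path="files/config.yaml"):
        with open(path) as f:
            raw = yaml.safe_load(f)
        # Keep only the {desc, value} entries; scalars like wandb_version are skipped.
        return {k: v["value"] for k, v in raw.items()
                if isinstance(v, dict) and "value" in v}

    cfg = load_wandb_config()
    print(cfg["lr"], cfg["total_steps"])  # expected: 2e-05 5000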
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log ADDED
@@ -0,0 +1,601 @@
+ FullyShardedDataParallel(
+   (_fsdp_wrapped_module): Bagel(
+     (language_model): Qwen2ForCausalLM(
+       (model): Qwen2Model(
+         (embed_tokens): Embedding(152064, 3584)
+         (layers): ModuleList(
+           (0-27): 28 x FullyShardedDataParallel(
+             (_fsdp_wrapped_module): CheckpointWrapper(
+               (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
+                 (self_attn): PackedAttentionMoT(
+                   (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
+                   (k_proj): Linear(in_features=3584, out_features=512, bias=True)
+                   (v_proj): Linear(in_features=3584, out_features=512, bias=True)
+                   (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
+                   (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
+                   (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
+                   (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
+                   (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
+                   (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
+                   (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
+                   (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
+                   (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
+                 )
+                 (mlp): Qwen2MLP(
+                   (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+                   (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+                   (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+                   (act_fn): SiLU()
+                 )
+                 (mlp_moe_gen): Qwen2MLP(
+                   (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
+                   (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
+                   (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
+                   (act_fn): SiLU()
+                 )
+                 (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
+                 (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+                 (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
+                 (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+               )
+             )
+           )
+         )
+         (norm): Qwen2RMSNorm((3584,), eps=1e-06)
+         (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
+         (rotary_emb): Qwen2RotaryEmbedding()
+       )
+       (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
+     )
+     (time_embedder): FullyShardedDataParallel(
+       (_fsdp_wrapped_module): TimestepEmbedder(
+         (mlp): Sequential(
+           (0): Linear(in_features=256, out_features=3584, bias=True)
+           (1): SiLU()
+           (2): Linear(in_features=3584, out_features=3584, bias=True)
+         )
+       )
+     )
+     (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
+     (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
+     (latent_pos_embed): FullyShardedDataParallel(
+       (_fsdp_wrapped_module): PositionEmbedding()
+     )
+     (vit_model): SiglipVisionModel(
+       (vision_model): FullyShardedDataParallel(
+         (_fsdp_wrapped_module): SiglipVisionTransformer(
+           (embeddings): SiglipVisionEmbeddings(
+             (position_embedding): Embedding(4900, 1152)
+             (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
+           )
+           (encoder): SiglipEncoder(
+             (layers): ModuleList(
+               (0-25): 26 x FullyShardedDataParallel(
+                 (_fsdp_wrapped_module): CheckpointWrapper(
+                   (_checkpoint_wrapped_module): SiglipEncoderLayer(
+                     (self_attn): SiglipFlashAttention2(
+                       (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                       (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                       (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                       (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
+                     )
+                     (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+                     (mlp): SiglipMLP(
+                       (activation_fn): PytorchGELUTanh()
+                       (fc1): Linear(in_features=1152, out_features=4304, bias=True)
+                       (fc2): Linear(in_features=4304, out_features=1152, bias=True)
+                     )
+                     (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+                   )
+                 )
+               )
+             )
+           )
+           (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
+         )
+       )
+     )
+     (connector): FullyShardedDataParallel(
+       (_fsdp_wrapped_module): CheckpointWrapper(
+         (_checkpoint_wrapped_module): MLPconnector(
+           (activation_fn): PytorchGELUTanh()
+           (fc1): Linear(in_features=1152, out_features=3584, bias=True)
+           (fc2): Linear(in_features=3584, out_features=3584, bias=True)
+         )
+       )
+     )
+     (vit_pos_embed): FullyShardedDataParallel(
+       (_fsdp_wrapped_module): PositionEmbedding()
+     )
+   )
+ )
+ _flat_param True
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ time_embedder._fsdp_wrapped_module._flat_param True
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
+ wandb: Detected [huggingface_hub.inference] in use.
+ wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
+ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
+ [2026-01-04 09:04:35] Training arguments TrainingArguments(visual_gen=True, visual_und=True, results_dir='results', checkpoint_dir='/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', wandb_project='bagel', wandb_name='vlm_gym_jigsaw_one_img_lr2e_5_mse_only', wandb_runid='0', wandb_resume='allow', wandb_offline=True, wandb_dir='/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', global_seed=4396, auto_resume=False, resume_from='/home/clouduser/Code/Models/BAGEL-7B-MoT', resume_model_only=True, finetune_from_ema=True, finetune_from_hf=True, log_every=1, save_every=2500, total_steps=5000, warmup_steps=300, lr_scheduler='cosine', lr=2e-05, min_lr=1e-07, beta1=0.9, beta2=0.95, eps=1e-15, ema=0.993, max_grad_norm=1.0, timestep_shift=1.0, mse_weight=1.0, ce_weight=1.0, ce_loss_reweighting=False, expected_num_tokens=20000, num_replicate=1, num_shard=8, sharding_strategy='HYBRID_SHARD', backward_prefetch='BACKWARD_PRE', cpu_offload=False, freeze_llm=False, freeze_vit=False, freeze_vae=True, freeze_und=False, copy_init_moe=True, use_flex=False, eval_every=500, num_eval_batches=20, use_ema_for_eval=True, viz_every=10, viz_n=8, viz_outdir='results/viz', eval_dataset_config_file='./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', viz_dataset_config_file='./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', save_ema_only=True, save_optimizer=False)
+ [2026-01-04 09:04:35] Model arguments ModelArguments(model_path='/home/clouduser/Code/Models/BAGEL-7B-MoT', llm_path='hf/Qwen2.5-0.5B-Instruct/', llm_qk_norm=True, tie_word_embeddings=False, layer_module='Qwen2MoTDecoderLayer', vae_path='flux/vae/ae.safetensors', vit_path='hf/siglip-so400m-14-980-flash-attn2-navit/', max_latent_size=64, latent_patch_size=2, vit_patch_size=14, vit_max_num_patch_per_side=70, connector_act='gelu_pytorch_tanh', interpolate_pos=False, vit_select_layer=-2, vit_rope=False, text_cond_dropout_prob=0.0, vae_cond_dropout_prob=0.0, vit_cond_dropout_prob=0.0)
+ [2026-01-04 09:04:35] Data arguments DataArguments(dataset_config_file='./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', train_data_dir='/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', train_jsonl_path='/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', eval_data_dir='/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', eval_jsonl_path='/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', inference_hash_file='/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', prefetch_factor=2, num_workers=1, max_num_tokens_per_sample=20000, max_num_tokens=20000, prefer_buffer_before=16384, max_buffer_size=50, data_seed=42)
+ [2026-01-04 09:09:01] Loading checkpoint from /home/clouduser/Code/Models/BAGEL-7B-MoT.
+ [2026-01-04 09:09:12] _IncompatibleKeys(missing_keys=['latent_pos_embed.pos_embed'], unexpected_keys=[])
+ [2026-01-04 09:09:29] _IncompatibleKeys(missing_keys=['latent_pos_embed.pos_embed'], unexpected_keys=[])
+ [2026-01-04 09:10:03] Training for 5000 steps, starting at 0...
+ [2026-01-04 09:10:44] (step=0000000) Train Loss mse: 0.0571, Train Loss ce: 0.0000, Train Steps/Sec: 0.02,
+ Traceback (most recent call last):
+   File "/home/clouduser/Code/Github/unified_world_model/train/pretrain_unified_navit.py", line 713, in dump_visual_viz
+     outputs = model(
+     ^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
+     return self._call_impl(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
+     return forward_call(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/home/clouduser/Code/Github/unified_world_model/modeling/bagel/bagel.py", line 151, in forward
+     packed_text_embedding = self.language_model.model.embed_tokens(packed_text_ids)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
+     return self._call_impl(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
+     return forward_call(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/sparse.py", line 190, in forward
+     return F.embedding(
+     ^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/functional.py", line 2551, in embedding
+     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ RuntimeError: The tensor has a non-zero number of elements, but its data is not allocated yet.
+ If you're using torch.compile/export/fx, it is likely that we are erroneously tracing into a custom kernel. To fix this, please wrap the custom kernel into an opaque custom op. Please see the following for details: https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html
+ If you're using Caffe2, Caffe2 uses a lazy allocation, so you will need to call mutable_data() or raw_mutable_data() to actually allocate memory.
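One plausible reading of this failure (an assumption, not confirmed by the log): the embed_tokens weight is a view into an FSDP flat parameter whose shard data has been deallocated when the viz forward reaches it. Notably, training continues after the traceback, which suggests the viz call is already wrapped in an exception guard; a minimal sketch of such a guard, with hypothetical names (safe_dump_visual_viz, dump_fn) that are not the repo's API:

    def safe_dump_visual_viz(dump_fn, *args, logger=None, **kwargs):
        # Run the viz pass but never let a viz-only failure kill the training loop,
        # matching the behavior visible in this log (traceback printed, steps resume).
        try:
            return dump_fn(*args, **kwargs)
        except Exception:
            if logger is not None:
                logger.exception("dump_visual_viz failed; skipping this viz step")
            return None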
+ [2026-01-04 09:12:39] (step=0000001) Train Loss mse: 0.0559, Train Loss ce: 0.0000, Train Steps/Sec: 0.01,
+ [2026-01-04 09:12:52] (step=0000002) Train Loss mse: 0.0621, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:13:06] (step=0000003) Train Loss mse: 0.0709, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:13:19] (step=0000004) Train Loss mse: 0.0585, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:13:33] (step=0000005) Train Loss mse: 0.0523, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:13:49] (step=0000006) Train Loss mse: 0.0602, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:14:03] (step=0000007) Train Loss mse: 0.0612, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:14:19] (step=0000008) Train Loss mse: 0.0432, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:14:35] (step=0000009) Train Loss mse: 0.0561, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:14:47] (step=0000010) Train Loss mse: 0.0673, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
+ Traceback (most recent call last):
+   File "/home/clouduser/Code/Github/unified_world_model/train/pretrain_unified_navit.py", line 713, in dump_visual_viz
+     outputs = model(
+     ^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
+     return self._call_impl(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
+     return forward_call(*args, **kwargs)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/home/clouduser/Code/Github/unified_world_model/modeling/bagel/bagel.py", line 156, in forward
+     sparse_mask = create_sparse_mask(sample_lens, split_lens, attn_modes, packed_text_embedding.device)
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+   File "/home/clouduser/Code/Github/unified_world_model/data/data_utils.py", line 29, in create_sparse_mask
+     for i, (length, model) in enumerate(zip(split_lens, attn_modes)):
+     ^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ TypeError: 'NoneType' object is not iterable
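The TypeError shows create_sparse_mask receiving None for split_lens and attn_modes, i.e. the viz batch carries no split metadata. A minimal defensive sketch; the helper name and the "causal" mode string are assumptions about the batch schema, not something the repo confirms:

    def ensure_split_metadata(sample_lens, split_lens, attn_modes):
        # Fall back to one full-length causal split per packed sample so that
        # zip(split_lens, attn_modes) inside create_sparse_mask never sees None.
        if split_lens is None or attn_modes is None:
            split_lens = list(sample_lens)
            attn_modes = ["causal"] * len(split_lens)
        return split_lens, attn_modes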
+ [2026-01-04 09:15:16] (step=0000011) Train Loss mse: 0.0475, Train Loss ce: 0.0000, Train Steps/Sec: 0.03,
+ [2026-01-04 09:15:29] (step=0000012) Train Loss mse: 0.0573, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:15:41] (step=0000013) Train Loss mse: 0.0592, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:15:57] (step=0000014) Train Loss mse: 0.0525, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:16:11] (step=0000015) Train Loss mse: 0.0574, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:16:27] (step=0000016) Train Loss mse: 0.0515, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:16:43] (step=0000017) Train Loss mse: 0.0759, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:16:56] (step=0000018) Train Loss mse: 0.0802, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:17:12] (step=0000019) Train Loss mse: 0.0643, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:17:28] (step=0000020) Train Loss mse: 0.0476, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:17:58] (step=0000021) Train Loss mse: 0.0642, Train Loss ce: 0.0000, Train Steps/Sec: 0.03,
+ [2026-01-04 09:18:11] (step=0000022) Train Loss mse: 0.0536, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:18:27] (step=0000023) Train Loss mse: 0.0590, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:18:40] (step=0000024) Train Loss mse: 0.0534, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:18:56] (step=0000025) Train Loss mse: 0.0469, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:19:09] (step=0000026) Train Loss mse: 0.0495, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:19:25] (step=0000027) Train Loss mse: 0.0638, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:19:38] (step=0000028) Train Loss mse: 0.0685, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:19:52] (step=0000029) Train Loss mse: 0.0469, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:20:08] (step=0000030) Train Loss mse: 0.0546, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:20:40] (step=0000031) Train Loss mse: 0.0437, Train Loss ce: 0.0000, Train Steps/Sec: 0.03,
+ [2026-01-04 09:20:53] (step=0000032) Train Loss mse: 0.0544, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:21:09] (step=0000033) Train Loss mse: 0.0477, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:21:26] (step=0000034) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:21:39] (step=0000035) Train Loss mse: 0.0571, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
+ [2026-01-04 09:21:55] (step=0000036) Train Loss mse: 0.0632, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:22:09] (step=0000037) Train Loss mse: 0.0479, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
+ [2026-01-04 09:22:25] (step=0000038) Train Loss mse: 0.0481, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:22:41] (step=0000039) Train Loss mse: 0.0573, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
+ [2026-01-04 09:22:53] (step=0000040) Train Loss mse: 0.0544, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/wandb-metadata.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"os": "Linux-6.6.93+-x86_64-with-glibc2.35", "python": "CPython 3.11.10", "started_at": "2026-01-04T09:04:29.298919Z", "args": ["--dataset_config_file", "./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml", "--eval_dataset_config_file", "./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml", "--viz_dataset_config_file", "./data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml", "--inference_hash_file", "/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json", "--train_data_dir", "/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/", "--train_jsonl_path", "/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/", "--eval_data_dir", "/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/", "--eval_jsonl_path", "/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/", "--model_path", "/home/clouduser/Code/Models/BAGEL-7B-MoT", "--layer_module", "Qwen2MoTDecoderLayer", "--max_latent_size", "64", "--resume-from", "/home/clouduser/Code/Models/BAGEL-7B-MoT", "--finetune_from_hf", "True", "--auto_resume", "False", "--resume-model-only", "True", "--finetune-from-ema", "True", "--log_every", "1", "--lr", "2e-5", "--warmup_steps", "300", "--lr_scheduler", "cosine", "--num_worker", "1", "--expected_num_tokens", "20000", "--max_num_tokens", "20000", "--max_num_tokens_per_sample", "20000", "--visual_und", "True", "--save_every", "2500", "--total_steps", "5000", "--text_cond_dropout_prob", "0.0", "--vae_cond_dropout_prob", "0.0", "--vit_cond_dropout_prob", "0.0", "--checkpoint_dir", "/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test", "--wandb_project", "bagel", "--wandb_name", "vlm_gym_jigsaw_one_img_lr2e_5_mse_only", "--wandb_dir", "/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test", "--wandb_offline", "True", "--viz_every", "10"], "program": "/home/clouduser/Code/Github/unified_world_model/train/pretrain_unified_navit.py", "code_path": "train/pretrain_unified_navit.py", "code_path_local": "train/pretrain_unified_navit.py", "git": {"remote_url": "https://github.com/para-lost/unified_world_model", "commit": "be2c19982b710041da81a85f55c2877ea0e2e2c6"}, "root": "/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test", "host": "junyizhang-launch-new-219635706-1-0", "executable": "/opt/conda/bin/python3.11", "cpu_count": 48, "cpu_count_logical": 96, "gpu_type": "NVIDIA A100-SXM4-80GB", "gpu_count": 8, "disk": {"/": {"total": "1052461830144", "used": "261623291904"}}, "memory": {"total": "1437332611072"}, "gpu_nvidia": [{"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-71d2c6f0-c9e9-2110-f69b-f7fd558363b2"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-a25620fe-6924-8936-d5a8-9dfb1c7177e8"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-2b69635d-5f31-ec83-bcde-b1df07b60307"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-4469aac3-d575-de3d-8715-1c34b68c640b"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-da768b0d-e500-f726-164e-2e2379616f19"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-ae036ce5-57c1-a8df-01b2-21cf23bc619b"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": 
"85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-d1ab738b-49ca-ed1f-6700-5336be458e1f"}, {"name": "NVIDIA A100-SXM4-80GB", "memory_total": "85899345920", "cuda_cores": 6912, "architecture": "Ampere", "uuid": "GPU-cb0ff8e1-e17f-38b9-bc14-4dcc9465b322"}], "cuda_version": "12.2", "writer_id": "c4f1w52emnh3bkfwabjlnv9ozcfaekz0"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_runtime": 1103.04216952, "total_samples": 11, "mem_cache": 77354, "_timestamp": 1767518561.3092997, "eval/ce": 0, "mse": 0.05725831538438797, "ce": 0, "mem_allocated": 51190.20703125, "eval/mse": 0.06041467562317848, "_step": 39, "lr": 2.666666666666667e-06, "total_mse_tokens": 51200, "total_ce_tokens": 0, "total_norm": 0.22510729730129242}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log CHANGED
@@ -4,3 +4,4 @@
 {"time":"2026-01-04T09:04:29.745202354Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
 {"time":"2026-01-04T09:04:29.757743673Z","level":"INFO","msg":"handleInformInit: received","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
 {"time":"2026-01-04T09:04:30.242343786Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:28:49.06895492Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_090429-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:c9db8970c306c984926cadb3d1739cc2ad4bd53712ed9b9744f2f84d7b4e35ca
- size 393216
+ oid sha256:9140cc820e3e9fffcb43c8f00e85dd36eaa4870b83cc86639f3359dbdfa6f7fe
+ size 491520
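
The .wandb transaction log is tracked with Git LFS, so this commit only rewrites the three-line pointer file: spec version, content sha256, and byte size (here growing from 393216 to 491520 as more run records were appended). A small parser for that pointer format, assuming only the "key value" layout visible above:

def parse_lfs_pointer(text):
    # Each pointer line is "key value": version, oid sha256:<hex>, size <bytes>.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo, "digest": digest,
            "size": int(fields["size"])}

ptr = parse_lfs_pointer(
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:9140cc820e3e9fffcb43c8f00e85dd36eaa4870b83cc86639f3359dbdfa6f7fe\n"
    "size 491520\n"
)
print(ptr["digest"], ptr["size"])  # 9140cc82... 491520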
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt ADDED
@@ -0,0 +1,354 @@
+ Brotli==1.1.0
+ MarkupSafe==3.0.2
+ PySocks==1.7.1
+ PyYAML==6.0.2
+ archspec==0.2.3
+ asttokens==2.4.1
+ astunparse==1.6.3
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ boltons==24.0.0
+ certifi==2024.8.30
+ chardet==5.2.0
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ conda==24.9.2
+ conda-build==24.9.0
+ conda_index==0.5.0
+ conda-libmamba-solver==24.9.0
+ conda-package-handling==2.4.0
+ conda_package_streaming==0.11.0
+ decorator==5.1.1
+ distro==1.9.0
+ dnspython==2.7.0
+ exceptiongroup==1.2.2
+ executing==2.1.0
+ expecttest==0.2.1
+ filelock==3.16.1
+ frozendict==2.4.6
+ fsspec==2024.10.0
+ h2==4.1.0
+ hpack==4.0.0
+ hyperframe==6.0.1
+ hypothesis==6.115.5
+ idna==3.10
+ importlib_resources==6.4.5
+ ipython==8.29.0
+ jedi==0.19.1
+ Jinja2==3.1.4
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ libarchive-c==5.1
+ libmambapy==1.5.10
+ lief==0.14.1
+ lintrunner==0.12.5
+ mamba==1.5.10
+ matplotlib-inline==0.1.7
+ menuinst==2.1.2
+ more-itertools==10.5.0
+ mpmath==1.3.0
+ networkx==3.4.2
+ ninja==1.11.1.1
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ optree==0.13.0
+ parso==0.8.4
+ pexpect==4.9.0
+ pickleshare==0.7.5
+ pillow==10.2.0
+ pkginfo==1.11.2
+ pkgutil_resolve_name==1.3.10
+ platformdirs==4.3.6
+ pluggy==1.5.0
+ prompt_toolkit==3.0.48
+ psutil==6.1.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pycosat==0.6.6
+ pycparser==2.22
+ Pygments==2.18.0
+ python-etcd==0.4.5
+ pytz==2024.2
+ referencing==0.35.1
+ requests==2.32.3
+ rpds-py==0.20.0
+ ruamel.yaml==0.18.6
+ ruamel.yaml.clib==0.2.8
+ six==1.16.0
+ sortedcontainers==2.4.0
+ soupsieve==2.5
+ stack-data==0.6.2
+ sympy==1.13.1
+ torchaudio==2.5.1+cu124
+ torchelastic==0.2.2
+ tqdm==4.66.5
+ traitlets==5.14.3
+ triton==3.1.0
+ truststore==0.9.2
+ types-dataclasses==0.6.6
+ urllib3==2.2.3
+ wcwidth==0.2.13
+ zipp==3.20.2
+ zstandard==0.23.0
+ numpy==1.24.4
+ imgcat==0.6.0
+ decord==0.6.0
+ flash_attn==2.5.8
+ contourpy==1.3.2
+ cycler==0.12.1
+ fonttools==4.61.1
+ huggingface-hub==0.29.1
+ kiwisolver==1.4.9
+ matplotlib==3.7.0
+ opencv-python==4.7.0.72
+ pyarrow==11.0.0
+ pyparsing==3.2.5
+ safetensors==0.4.5
+ scipy==1.10.1
+ sentencepiece==0.1.99
+ torch==2.5.1
+ torchvision==0.20.1
+ transformers==4.49.0
+ pip==25.3
+ setuptools==80.9.0
+ wheel==0.45.1
+ Pebble==5.1.3
+ accelerate==1.12.0
+ addftool==0.2.13
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.13.2
+ aiohttp-cors==0.8.1
+ aiosignal==1.4.0
+ airportsdata==20250909
+ annotated-doc==0.0.4
+ annotated-types==0.7.0
+ antlr4-python3-runtime==4.9.3
+ bcrypt==5.0.0
+ blobfile==3.0.0
+ cffi==2.0.0
+ cloudpickle==3.1.2
+ codetiming==1.4.0
+ colorful==0.5.8
+ compressed-tensors==0.12.2
+ cryptography==46.0.3
+ cuda-bindings==13.1.1
+ cuda-pathfinder==1.3.3
+ cuda-python==13.1.1
+ datasets==4.4.1
+ Deprecated==1.3.1
+ diskcache==5.6.3
+ distlib==0.4.0
+ docstring_parser==0.17.0
+ easydict==1.13
+ fabric==3.2.2
+ fastapi==0.124.4
+ fire==0.7.1
+ flashinfer-python==0.2.5
+ frozenlist==1.8.0
+ gevent==25.9.1
+ gitdb==4.0.12
+ GitPython==3.1.45
+ google-api-core==2.28.1
+ google-auth==2.43.0
+ google-cloud-aiplatform==1.130.0
+ google-cloud-bigquery==3.38.0
+ google-cloud-core==2.5.0
+ google-cloud-resource-manager==1.15.0
+ google-cloud-storage==3.7.0
+ google-crc32c==1.7.1
+ google-genai==1.55.0
+ google-resumable-media==2.8.0
+ googleapis-common-protos==1.72.0
+ greenlet==3.3.0
+ grpc-google-iam-v1==0.14.3
+ grpcio==1.76.0
+ grpcio-status==1.76.0
+ hf_transfer==0.1.9
+ hf-xet==1.2.0
+ hydra-core==1.3.2
+ importlib_metadata==8.7.0
+ interegular==0.3.3
+ invoke==2.2.1
+ jiter==0.12.0
+ joblib==1.5.2
+ jsonlines==4.0.0
+ lark==1.3.1
+ latex2sympy2==1.5.4
+ latex2sympy2_extended==1.10.2
+ libtmux==0.52.1
+ llguidance==0.7.30
+ loguru==0.7.3
+ lxml==6.0.2
+ math-verify==0.8.0
+ modelscope==1.33.0
+ msgpack==1.1.2
+ msgspec==0.20.0
+ multidict==6.7.0
+ multiprocess==0.70.18
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-ml-py==13.590.44
+ omegaconf==2.3.0
+ openai==2.11.0
+ opencensus==0.11.4
+ opencensus-context==0.1.3
+ opentelemetry-api==1.39.1
+ opentelemetry-exporter-prometheus==0.60b1
+ opentelemetry-proto==1.39.1
+ opentelemetry-sdk==1.39.1
+ opentelemetry-semantic-conventions==0.60b1
+ orjson==3.11.5
+ outlines==0.1.11
+ outlines_core==0.1.26
+ packaging==25.0
+ pandas==2.3.3
+ parallel-ssh==2.16.0.post1
+ paramiko==4.0.0
+ partial-json-parser==0.2.1.1.post7
+ peft==0.18.0
+ propcache==0.4.1
+ proto-plus==1.26.1
+ protobuf==6.33.2
+ py-spy==0.4.1
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.2
+ pybind11==3.0.1
+ pycountry==24.6.1
+ pycryptodomex==3.23.0
+ pydantic==2.12.5
+ pydantic_core==2.41.5
+ pylatexenc==2.10
+ PyNaCl==1.6.1
+ pynvml==13.0.1
+ python-multipart==0.0.20
+ ray==2.52.1
+ regex==2025.11.3
+ rsa==4.9.1
+ scikit-learn==1.8.0
+ sentence-transformers==5.2.0
+ sentry-sdk==2.47.0
+ setproctitle==1.3.7
+ sgl-kernel==0.1.4
+ sglang==0.4.6.post5
+ shapely==2.1.2
+ smart_open==7.5.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soundfile==0.13.1
+ ssh2-python==1.2.0.post1
+ ssh-python==1.2.0.post1
+ starlette==0.50.0
+ tabulate==0.9.0
+ tenacity==9.1.2
+ tensorboardX==2.6.4
+ tensordict==0.6.2
+ termcolor==3.2.0
+ threadpoolctl==3.6.0
+ tiktoken==0.12.0
+ timeout-decorator==0.5.0
+ tmuxp==1.61.0
+ tokenizers==0.21.4
+ torch_memory_saver==0.0.9
+ torchao==0.9.0
+ torchdata==0.11.0
+ typing-inspection==0.4.2
+ uvicorn==0.38.0
+ uvloop==0.22.1
+ virtualenv==20.35.4
+ wandb==0.23.1
+ websockets==15.0.1
+ word2number==1.1
+ wrapt==2.0.1
+ xgrammar==0.1.19
+ xxhash==3.6.0
+ yarl==1.22.0
+ zope.event==6.1
+ zope.interface==8.1.1
+ cachetools==6.2.3
+ dill==0.4.0
+ inflect==7.5.0
+ lazy_loader==0.4
+ rp==0.1.1333
+ stackprinter==0.2.12
+ typeguard==4.4.4
+ typing_extensions==4.15.0
+ asciinema==2.4.0
+ einops==0.8.1
+ Send2Trash==1.8.3
+ anyio==4.12.0
+ argon2-cffi==25.1.0
+ argon2-cffi-bindings==25.1.0
+ arrow==1.4.0
+ async-lru==2.0.5
+ babel==2.17.0
+ bleach==6.3.0
+ comm==0.2.3
+ debugpy==1.8.18
+ defusedxml==0.7.1
+ fastjsonschema==2.21.2
+ fqdn==1.5.1
+ h11==0.16.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ ipykernel==7.1.0
+ isoduration==20.11.0
+ json5==0.12.1
+ jupyter_client==8.7.0
+ jupyter_core==5.9.1
+ jupyter-events==0.12.0
+ jupyter-lsp==2.3.0
+ jupyter_server==2.17.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.5.0
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.28.0
+ mistune==3.1.4
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ notebook_shim==0.2.4
+ overrides==7.7.0
+ pandocfilters==1.5.1
+ prometheus_client==0.23.1
+ python-dateutil==2.9.0.post0
+ python-json-logger==4.0.0
+ pyzmq==27.1.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ terminado==0.18.1
+ tinycss2==1.4.0
+ tornado==6.5.3
+ tzdata==2025.3
+ uri-template==1.3.0
+ webcolors==25.10.0
+ webencodings==0.5.1
+ websocket-client==1.9.0
+ autocommand==2.2.2
+ backports.tarfile==1.2.0
+ importlib_metadata==8.0.0
+ inflect==7.3.1
+ jaraco.collections==5.1.0
+ jaraco.context==5.3.0
+ jaraco.functools==4.0.1
+ jaraco.text==3.12.1
+ more-itertools==10.3.0
+ packaging==24.2
+ platformdirs==4.2.2
+ tomli==2.0.1
+ typeguard==4.3.0
+ typing_extensions==4.12.2
+ wheel==0.45.1
+ zipp==3.19.2
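
Each offline run directory snapshots its own files/requirements.txt, so the same 354-line environment list recurs once per run in this commit (the 09:41:58 run further down carries an apparently identical copy). A quick way to confirm two snapshots match byte-for-byte; the paths are the run directories listed in this commit:

import filecmp

a = "wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt"
b = "wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt"
# shallow=False forces a content comparison instead of an os.stat() check.
print(filecmp.cmp(a, b, shallow=False))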
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
+ {"time":"2026-01-04T09:32:17.157035079Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmphkc9n1om/port-20932.txt","pid":20932,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+ {"time":"2026-01-04T09:32:17.158915538Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":20932}
+ {"time":"2026-01-04T09:32:17.158917928Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-20932-21162-3167463651/socket","Net":"unix"}}
+ {"time":"2026-01-04T09:32:17.342986695Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+ {"time":"2026-01-04T09:32:17.351333861Z","level":"INFO","msg":"handleInformInit: received","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:32:17.578454587Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:39:28.832808185Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2026-01-04T09:32:17.359914174Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+ {"time":"2026-01-04T09:32:17.578184558Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+ {"time":"2026-01-04T09:32:17.578260453Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.578444715Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.578428925Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.578480061Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.578493724Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:32:17.579316636Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log ADDED
@@ -0,0 +1,24 @@
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Configure stats pid to 20932
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_setup.py:_flush():80] Loading settings from environment variables
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
+ 2026-01-04 09:32:16,949 INFO MainThread:20932 [wandb_init.py:init():841] calling init triggers
+ 2026-01-04 09:32:16,950 INFO MainThread:20932 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+ config: {'_wandb': {}}
+ 2026-01-04 09:32:16,950 INFO MainThread:20932 [wandb_init.py:init():889] starting backend
+ 2026-01-04 09:32:17,342 INFO MainThread:20932 [wandb_init.py:init():892] sending inform_init request
+ 2026-01-04 09:32:17,349 INFO MainThread:20932 [wandb_init.py:init():900] backend started and connected
+ 2026-01-04 09:32:17,351 INFO MainThread:20932 [wandb_init.py:init():970] updated telemetry
+ 2026-01-04 09:32:17,359 INFO MainThread:20932 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+ 2026-01-04 09:32:17,581 INFO MainThread:20932 [wandb_init.py:init():1041] starting run threads in backend
+ 2026-01-04 09:32:17,942 INFO MainThread:20932 [wandb_run.py:_console_start():2521] atexit reg
+ 2026-01-04 09:32:17,942 INFO MainThread:20932 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+ 2026-01-04 09:32:17,942 INFO MainThread:20932 [wandb_run.py:_redirect():2438] Wrapping output streams.
+ 2026-01-04 09:32:17,942 INFO MainThread:20932 [wandb_run.py:_redirect():2461] Redirects installed.
+ 2026-01-04 09:32:17,945 INFO MainThread:20932 [wandb_init.py:init():1081] run started, returning control to user process
+ 2026-01-04 09:32:17,947 INFO MainThread:20932 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 10, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
+ 2026-01-04 09:32:17,948 INFO MainThread:20932 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
+ 2026-01-04 09:32:17,949 INFO MainThread:20932 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_093216-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7cb17e00f6acbbd7e6d5a4de0ff7353f427c0b6a094cae28f28c4fdb6148f7e
+ size 131072
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt ADDED
@@ -0,0 +1,354 @@
+ Brotli==1.1.0
+ MarkupSafe==3.0.2
+ PySocks==1.7.1
+ PyYAML==6.0.2
+ archspec==0.2.3
+ asttokens==2.4.1
+ astunparse==1.6.3
+ attrs==24.2.0
+ beautifulsoup4==4.12.3
+ boltons==24.0.0
+ certifi==2024.8.30
+ chardet==5.2.0
+ charset-normalizer==3.4.0
+ click==8.1.7
+ colorama==0.4.6
+ conda==24.9.2
+ conda-build==24.9.0
+ conda_index==0.5.0
+ conda-libmamba-solver==24.9.0
+ conda-package-handling==2.4.0
+ conda_package_streaming==0.11.0
+ decorator==5.1.1
+ distro==1.9.0
+ dnspython==2.7.0
+ exceptiongroup==1.2.2
+ executing==2.1.0
+ expecttest==0.2.1
+ filelock==3.16.1
+ frozendict==2.4.6
+ fsspec==2024.10.0
+ h2==4.1.0
+ hpack==4.0.0
+ hyperframe==6.0.1
+ hypothesis==6.115.5
+ idna==3.10
+ importlib_resources==6.4.5
+ ipython==8.29.0
+ jedi==0.19.1
+ Jinja2==3.1.4
+ jsonpatch==1.33
+ jsonpointer==3.0.0
+ jsonschema==4.23.0
+ jsonschema-specifications==2024.10.1
+ libarchive-c==5.1
+ libmambapy==1.5.10
+ lief==0.14.1
+ lintrunner==0.12.5
+ mamba==1.5.10
+ matplotlib-inline==0.1.7
+ menuinst==2.1.2
+ more-itertools==10.5.0
+ mpmath==1.3.0
+ networkx==3.4.2
+ ninja==1.11.1.1
+ nvidia-cublas-cu12==12.4.5.8
+ nvidia-cuda-cupti-cu12==12.4.127
+ nvidia-cuda-nvrtc-cu12==12.4.127
+ nvidia-cuda-runtime-cu12==12.4.127
+ nvidia-cudnn-cu12==9.1.0.70
+ nvidia-cufft-cu12==11.2.1.3
+ nvidia-curand-cu12==10.3.5.147
+ nvidia-cusolver-cu12==11.6.1.9
+ nvidia-cusparse-cu12==12.3.1.170
+ nvidia-nccl-cu12==2.21.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.4.127
+ optree==0.13.0
+ parso==0.8.4
+ pexpect==4.9.0
+ pickleshare==0.7.5
+ pillow==10.2.0
+ pkginfo==1.11.2
+ pkgutil_resolve_name==1.3.10
+ platformdirs==4.3.6
+ pluggy==1.5.0
+ prompt_toolkit==3.0.48
+ psutil==6.1.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pycosat==0.6.6
+ pycparser==2.22
+ Pygments==2.18.0
+ python-etcd==0.4.5
+ pytz==2024.2
+ referencing==0.35.1
+ requests==2.32.3
+ rpds-py==0.20.0
+ ruamel.yaml==0.18.6
+ ruamel.yaml.clib==0.2.8
+ six==1.16.0
+ sortedcontainers==2.4.0
+ soupsieve==2.5
+ stack-data==0.6.2
+ sympy==1.13.1
+ torchaudio==2.5.1+cu124
+ torchelastic==0.2.2
+ tqdm==4.66.5
+ traitlets==5.14.3
+ triton==3.1.0
+ truststore==0.9.2
+ types-dataclasses==0.6.6
+ urllib3==2.2.3
+ wcwidth==0.2.13
+ zipp==3.20.2
+ zstandard==0.23.0
+ numpy==1.24.4
+ imgcat==0.6.0
+ decord==0.6.0
+ flash_attn==2.5.8
+ contourpy==1.3.2
+ cycler==0.12.1
+ fonttools==4.61.1
+ huggingface-hub==0.29.1
+ kiwisolver==1.4.9
+ matplotlib==3.7.0
+ opencv-python==4.7.0.72
+ pyarrow==11.0.0
+ pyparsing==3.2.5
+ safetensors==0.4.5
+ scipy==1.10.1
+ sentencepiece==0.1.99
+ torch==2.5.1
+ torchvision==0.20.1
+ transformers==4.49.0
+ pip==25.3
+ setuptools==80.9.0
+ wheel==0.45.1
+ Pebble==5.1.3
+ accelerate==1.12.0
+ addftool==0.2.13
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.13.2
+ aiohttp-cors==0.8.1
+ aiosignal==1.4.0
+ airportsdata==20250909
+ annotated-doc==0.0.4
+ annotated-types==0.7.0
+ antlr4-python3-runtime==4.9.3
+ bcrypt==5.0.0
+ blobfile==3.0.0
+ cffi==2.0.0
+ cloudpickle==3.1.2
+ codetiming==1.4.0
+ colorful==0.5.8
+ compressed-tensors==0.12.2
+ cryptography==46.0.3
+ cuda-bindings==13.1.1
+ cuda-pathfinder==1.3.3
+ cuda-python==13.1.1
+ datasets==4.4.1
+ Deprecated==1.3.1
+ diskcache==5.6.3
+ distlib==0.4.0
+ docstring_parser==0.17.0
+ easydict==1.13
+ fabric==3.2.2
+ fastapi==0.124.4
+ fire==0.7.1
+ flashinfer-python==0.2.5
+ frozenlist==1.8.0
+ gevent==25.9.1
+ gitdb==4.0.12
+ GitPython==3.1.45
+ google-api-core==2.28.1
+ google-auth==2.43.0
+ google-cloud-aiplatform==1.130.0
+ google-cloud-bigquery==3.38.0
+ google-cloud-core==2.5.0
+ google-cloud-resource-manager==1.15.0
+ google-cloud-storage==3.7.0
+ google-crc32c==1.7.1
+ google-genai==1.55.0
+ google-resumable-media==2.8.0
+ googleapis-common-protos==1.72.0
+ greenlet==3.3.0
+ grpc-google-iam-v1==0.14.3
+ grpcio==1.76.0
+ grpcio-status==1.76.0
+ hf_transfer==0.1.9
+ hf-xet==1.2.0
+ hydra-core==1.3.2
+ importlib_metadata==8.7.0
+ interegular==0.3.3
+ invoke==2.2.1
+ jiter==0.12.0
+ joblib==1.5.2
+ jsonlines==4.0.0
+ lark==1.3.1
+ latex2sympy2==1.5.4
+ latex2sympy2_extended==1.10.2
+ libtmux==0.52.1
+ llguidance==0.7.30
+ loguru==0.7.3
+ lxml==6.0.2
+ math-verify==0.8.0
+ modelscope==1.33.0
+ msgpack==1.1.2
+ msgspec==0.20.0
+ multidict==6.7.0
+ multiprocess==0.70.18
+ nvidia-cusparselt-cu12==0.6.2
+ nvidia-ml-py==13.590.44
+ omegaconf==2.3.0
+ openai==2.11.0
+ opencensus==0.11.4
+ opencensus-context==0.1.3
+ opentelemetry-api==1.39.1
+ opentelemetry-exporter-prometheus==0.60b1
+ opentelemetry-proto==1.39.1
+ opentelemetry-sdk==1.39.1
+ opentelemetry-semantic-conventions==0.60b1
+ orjson==3.11.5
+ outlines==0.1.11
+ outlines_core==0.1.26
+ packaging==25.0
+ pandas==2.3.3
+ parallel-ssh==2.16.0.post1
+ paramiko==4.0.0
+ partial-json-parser==0.2.1.1.post7
+ peft==0.18.0
+ propcache==0.4.1
+ proto-plus==1.26.1
+ protobuf==6.33.2
+ py-spy==0.4.1
+ pyasn1==0.6.1
+ pyasn1_modules==0.4.2
+ pybind11==3.0.1
+ pycountry==24.6.1
+ pycryptodomex==3.23.0
+ pydantic==2.12.5
+ pydantic_core==2.41.5
+ pylatexenc==2.10
+ PyNaCl==1.6.1
+ pynvml==13.0.1
+ python-multipart==0.0.20
+ ray==2.52.1
+ regex==2025.11.3
+ rsa==4.9.1
+ scikit-learn==1.8.0
+ sentence-transformers==5.2.0
+ sentry-sdk==2.47.0
+ setproctitle==1.3.7
+ sgl-kernel==0.1.4
+ sglang==0.4.6.post5
+ shapely==2.1.2
+ smart_open==7.5.0
+ smmap==5.0.2
+ sniffio==1.3.1
+ soundfile==0.13.1
+ ssh2-python==1.2.0.post1
+ ssh-python==1.2.0.post1
+ starlette==0.50.0
+ tabulate==0.9.0
+ tenacity==9.1.2
+ tensorboardX==2.6.4
+ tensordict==0.6.2
+ termcolor==3.2.0
+ threadpoolctl==3.6.0
+ tiktoken==0.12.0
+ timeout-decorator==0.5.0
+ tmuxp==1.61.0
+ tokenizers==0.21.4
+ torch_memory_saver==0.0.9
+ torchao==0.9.0
+ torchdata==0.11.0
+ typing-inspection==0.4.2
+ uvicorn==0.38.0
+ uvloop==0.22.1
+ virtualenv==20.35.4
+ wandb==0.23.1
+ websockets==15.0.1
+ word2number==1.1
+ wrapt==2.0.1
+ xgrammar==0.1.19
+ xxhash==3.6.0
+ yarl==1.22.0
+ zope.event==6.1
+ zope.interface==8.1.1
+ cachetools==6.2.3
+ dill==0.4.0
+ inflect==7.5.0
+ lazy_loader==0.4
+ rp==0.1.1333
+ stackprinter==0.2.12
+ typeguard==4.4.4
+ typing_extensions==4.15.0
+ asciinema==2.4.0
+ einops==0.8.1
+ Send2Trash==1.8.3
+ anyio==4.12.0
+ argon2-cffi==25.1.0
+ argon2-cffi-bindings==25.1.0
+ arrow==1.4.0
+ async-lru==2.0.5
+ babel==2.17.0
+ bleach==6.3.0
+ comm==0.2.3
+ debugpy==1.8.18
+ defusedxml==0.7.1
+ fastjsonschema==2.21.2
+ fqdn==1.5.1
+ h11==0.16.0
+ httpcore==1.0.9
+ httpx==0.28.1
+ ipykernel==7.1.0
+ isoduration==20.11.0
+ json5==0.12.1
+ jupyter_client==8.7.0
+ jupyter_core==5.9.1
+ jupyter-events==0.12.0
+ jupyter-lsp==2.3.0
+ jupyter_server==2.17.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.5.0
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.28.0
+ mistune==3.1.4
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ notebook_shim==0.2.4
+ overrides==7.7.0
+ pandocfilters==1.5.1
+ prometheus_client==0.23.1
+ python-dateutil==2.9.0.post0
+ python-json-logger==4.0.0
+ pyzmq==27.1.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ terminado==0.18.1
+ tinycss2==1.4.0
+ tornado==6.5.3
+ tzdata==2025.3
+ uri-template==1.3.0
+ webcolors==25.10.0
+ webencodings==0.5.1
+ websocket-client==1.9.0
+ autocommand==2.2.2
+ backports.tarfile==1.2.0
+ importlib_metadata==8.0.0
+ inflect==7.3.1
+ jaraco.collections==5.1.0
+ jaraco.context==5.3.0
+ jaraco.functools==4.0.1
+ jaraco.text==3.12.1
+ more-itertools==10.3.0
+ packaging==24.2
+ platformdirs==4.2.2
+ tomli==2.0.1
+ typeguard==4.3.0
+ typing_extensions==4.12.2
+ wheel==0.45.1
+ zipp==3.19.2
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
+ {"time":"2026-01-04T09:41:58.611687494Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpml8qx73j/port-49730.txt","pid":49730,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+ {"time":"2026-01-04T09:41:58.613969345Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":49730}
+ {"time":"2026-01-04T09:41:58.613955135Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-49730-49896-3226128572/socket","Net":"unix"}}
+ {"time":"2026-01-04T09:41:58.796729122Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
+ {"time":"2026-01-04T09:41:58.807276343Z","level":"INFO","msg":"handleInformInit: received","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:41:59.539163963Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
+ {"time":"2026-01-04T09:48:17.195652917Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
+ {"time":"2026-01-04T09:41:58.817034119Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
+ {"time":"2026-01-04T09:41:59.538864599Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
+ {"time":"2026-01-04T09:41:59.538957051Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539033043Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539155721Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539184904Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539218617Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
+ {"time":"2026-01-04T09:41:59.539987313Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log ADDED
@@ -0,0 +1,24 @@
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Configure stats pid to 49730
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_setup.py:_flush():80] Loading settings from environment variables
+ 2026-01-04 09:41:58,406 INFO MainThread:49730 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():841] calling init triggers
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
+ config: {'_wandb': {}}
+ 2026-01-04 09:41:58,407 INFO MainThread:49730 [wandb_init.py:init():889] starting backend
+ 2026-01-04 09:41:58,797 INFO MainThread:49730 [wandb_init.py:init():892] sending inform_init request
+ 2026-01-04 09:41:58,805 INFO MainThread:49730 [wandb_init.py:init():900] backend started and connected
+ 2026-01-04 09:41:58,807 INFO MainThread:49730 [wandb_init.py:init():970] updated telemetry
+ 2026-01-04 09:41:58,816 INFO MainThread:49730 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
+ 2026-01-04 09:41:59,542 INFO MainThread:49730 [wandb_init.py:init():1041] starting run threads in backend
+ 2026-01-04 09:41:59,917 INFO MainThread:49730 [wandb_run.py:_console_start():2521] atexit reg
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2369] redirect: wrap_raw
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2438] Wrapping output streams.
+ 2026-01-04 09:41:59,918 INFO MainThread:49730 [wandb_run.py:_redirect():2461] Redirects installed.
+ 2026-01-04 09:41:59,921 INFO MainThread:49730 [wandb_init.py:init():1081] run started, returning control to user process
+ 2026-01-04 09:41:59,922 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 10, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
+ 2026-01-04 09:41:59,923 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
+ 2026-01-04 09:41:59,924 INFO MainThread:49730 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only_test/wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fea8db2c722c608b691a06b3e692272a70eb2fb6d6fdaddefd3d9b5b178335ba
+ size 131072
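
The runs in this upload were created with wandb_offline True (see the config_cb entries above), so nothing reached the W&B server at train time; the .wandb files are purely local transaction logs. If these runs ever need to appear in the hosted UI, the standard route is the CLI's sync command; a sketch, using a run directory listed above and assuming a prior `wandb login`:

import subprocess

# `wandb sync <run-dir>` uploads an offline run's transaction log after the fact.
subprocess.run(
    ["wandb", "sync",
     "wandb/offline-run-20260104_094158-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"],
    check=True,
)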