Junyi42 commited on
Commit
2f3bbdc
·
verified ·
1 Parent(s): ea4653e

Upload checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only

Browse files
Files changed (9) hide show
  1. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/debug-internal.log +8 -13
  2. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/debug.log +23 -27
  3. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20251230_194213-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log +175 -197
  4. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260102_214304-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log +266 -179
  5. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt +354 -0
  6. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log +7 -0
  7. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log +8 -0
  8. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log +24 -0
  9. checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb +0 -0
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/debug-internal.log CHANGED
@@ -1,13 +1,8 @@
1
- {"time":"2026-01-02T21:43:05.63786614Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
- {"time":"2026-01-02T21:43:06.061238769Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
3
- {"time":"2026-01-02T21:43:06.061320164Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
4
- {"time":"2026-01-02T21:43:06.061627231Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
5
- {"time":"2026-01-02T21:43:06.061722049Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
6
- {"time":"2026-01-02T21:43:06.061752682Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
7
- {"time":"2026-01-02T21:43:06.061765788Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
8
- {"time":"2026-01-02T21:43:06.062871122Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
9
- {"time":"2026-01-03T17:12:43.962447603Z","level":"INFO","msg":"handler: operation stats","stats":{}}
10
- {"time":"2026-01-03T17:12:43.96605726Z","level":"INFO","msg":"stream: closing","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
11
- {"time":"2026-01-03T17:12:43.966072728Z","level":"INFO","msg":"handler: closed","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
12
- {"time":"2026-01-03T17:12:43.966105326Z","level":"INFO","msg":"sender: closed","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
13
- {"time":"2026-01-03T17:12:43.966121948Z","level":"INFO","msg":"stream: closed","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
 
1
+ {"time":"2026-01-04T09:32:01.115692117Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2026-01-04T09:32:01.44318528Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
3
+ {"time":"2026-01-04T09:32:01.443259623Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
4
+ {"time":"2026-01-04T09:32:01.443726951Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
5
+ {"time":"2026-01-04T09:32:01.44372382Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
6
+ {"time":"2026-01-04T09:32:01.443767451Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
7
+ {"time":"2026-01-04T09:32:01.443797476Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
8
+ {"time":"2026-01-04T09:32:01.444535545Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
 
 
 
 
 
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/debug.log CHANGED
@@ -1,28 +1,24 @@
1
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_setup.py:_flush():80] Configure stats pid to 13227
3
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
4
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
5
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260102_214304-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
7
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260102_214304-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
8
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_init.py:init():841] calling init triggers
9
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
  config: {'_wandb': {}}
11
- 2026-01-02 21:43:04,973 INFO MainThread:13227 [wandb_init.py:init():889] starting backend
12
- 2026-01-02 21:43:05,456 INFO MainThread:13227 [wandb_init.py:init():892] sending inform_init request
13
- 2026-01-02 21:43:05,464 INFO MainThread:13227 [wandb_init.py:init():900] backend started and connected
14
- 2026-01-02 21:43:05,466 INFO MainThread:13227 [wandb_init.py:init():970] updated telemetry
15
- 2026-01-02 21:43:05,474 INFO MainThread:13227 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
- 2026-01-02 21:43:06,064 INFO MainThread:13227 [wandb_init.py:init():1041] starting run threads in backend
17
- 2026-01-02 21:43:06,498 INFO MainThread:13227 [wandb_run.py:_console_start():2521] atexit reg
18
- 2026-01-02 21:43:06,498 INFO MainThread:13227 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
- 2026-01-02 21:43:06,498 INFO MainThread:13227 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
- 2026-01-02 21:43:06,498 INFO MainThread:13227 [wandb_run.py:_redirect():2461] Redirects installed.
21
- 2026-01-02 21:43:06,502 INFO MainThread:13227 [wandb_init.py:init():1081] run started, returning control to user process
22
- 2026-01-02 21:43:06,504 INFO MainThread:13227 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'viz_every': 500, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'save_ema_only': True, 'save_optimizer': False}
23
- 2026-01-02 21:43:06,505 INFO MainThread:13227 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
24
- 2026-01-02 21:43:06,505 INFO MainThread:13227 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
25
- 2026-01-03 17:12:43,959 INFO MainThread:13227 [wandb_run.py:_finish():2287] finishing run bagel/vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0
26
- 2026-01-03 17:12:43,960 INFO MainThread:13227 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
27
- 2026-01-03 17:12:43,960 INFO MainThread:13227 [wandb_run.py:_restore():2468] restore
28
- 2026-01-03 17:12:43,960 INFO MainThread:13227 [wandb_run.py:_restore():2474] restore done
 
1
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Configure stats pid to 13221
3
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
4
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
5
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
7
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
8
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_init.py:init():841] calling init triggers
9
+ 2026-01-04 09:32:00,480 INFO MainThread:13221 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
  config: {'_wandb': {}}
11
+ 2026-01-04 09:32:00,480 INFO MainThread:13221 [wandb_init.py:init():889] starting backend
12
+ 2026-01-04 09:32:00,939 INFO MainThread:13221 [wandb_init.py:init():892] sending inform_init request
13
+ 2026-01-04 09:32:00,947 INFO MainThread:13221 [wandb_init.py:init():900] backend started and connected
14
+ 2026-01-04 09:32:00,949 INFO MainThread:13221 [wandb_init.py:init():970] updated telemetry
15
+ 2026-01-04 09:32:00,956 INFO MainThread:13221 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2026-01-04 09:32:01,446 INFO MainThread:13221 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2026-01-04 09:32:01,964 INFO MainThread:13221 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2026-01-04 09:32:01,964 INFO MainThread:13221 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2026-01-04 09:32:01,964 INFO MainThread:13221 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2026-01-04 09:32:01,964 INFO MainThread:13221 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2026-01-04 09:32:01,968 INFO MainThread:13221 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2026-01-04 09:32:01,969 INFO MainThread:13221 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'eval_log_dir': None, 'eval_run_tag': '', 'viz_every': 500, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'eval_print_n': 3, 'save_ema_only': True, 'save_optimizer': False}
23
+ 2026-01-04 09:32:01,970 INFO MainThread:13221 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
24
+ 2026-01-04 09:32:01,971 INFO MainThread:13221 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
 
 
 
 
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20251230_194213-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log CHANGED
@@ -1,176 +1,3 @@
1
- [2025-12-30 23:03:51] (step=0000844) Train Loss mse: 0.0270, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
2
- FullyShardedDataParallel(
3
- (_fsdp_wrapped_module): Bagel(
4
- (language_model): Qwen2ForCausalLM(
5
- (model): Qwen2Model(
6
- (embed_tokens): Embedding(152064, 3584)
7
- (layers): ModuleList(
8
- (0-27): 28 x FullyShardedDataParallel(
9
- (_fsdp_wrapped_module): CheckpointWrapper(
10
- (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
11
- (self_attn): PackedAttentionMoT(
12
- (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
13
- (k_proj): Linear(in_features=3584, out_features=512, bias=True)
14
- (v_proj): Linear(in_features=3584, out_features=512, bias=True)
15
- (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
16
- (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
17
- (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
18
- (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
19
- (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
20
- (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
21
- (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
22
- (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
23
- (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
24
- )
25
- (mlp): Qwen2MLP(
26
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
27
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
28
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
29
- (act_fn): SiLU()
30
- )
31
- (mlp_moe_gen): Qwen2MLP(
32
- (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
33
- (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
34
- (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
35
- (act_fn): SiLU()
36
- )
37
- (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
38
- (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
39
- (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
40
- (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
41
- )
42
- )
43
- )
44
- )
45
- (norm): Qwen2RMSNorm((3584,), eps=1e-06)
46
- (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
47
- (rotary_emb): Qwen2RotaryEmbedding()
48
- )
49
- (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
50
- )
51
- (time_embedder): FullyShardedDataParallel(
52
- (_fsdp_wrapped_module): TimestepEmbedder(
53
- (mlp): Sequential(
54
- (0): Linear(in_features=256, out_features=3584, bias=True)
55
- (1): SiLU()
56
- (2): Linear(in_features=3584, out_features=3584, bias=True)
57
- )
58
- )
59
- )
60
- (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
61
- (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
62
- (latent_pos_embed): FullyShardedDataParallel(
63
- (_fsdp_wrapped_module): PositionEmbedding()
64
- )
65
- (vit_model): SiglipVisionModel(
66
- (vision_model): FullyShardedDataParallel(
67
- (_fsdp_wrapped_module): SiglipVisionTransformer(
68
- (embeddings): SiglipVisionEmbeddings(
69
- (position_embedding): Embedding(4900, 1152)
70
- (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
71
- )
72
- (encoder): SiglipEncoder(
73
- (layers): ModuleList(
74
- (0-25): 26 x FullyShardedDataParallel(
75
- (_fsdp_wrapped_module): CheckpointWrapper(
76
- (_checkpoint_wrapped_module): SiglipEncoderLayer(
77
- (self_attn): SiglipFlashAttention2(
78
- (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
79
- (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
80
- (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
81
- (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
82
- )
83
- (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
84
- (mlp): SiglipMLP(
85
- (activation_fn): PytorchGELUTanh()
86
- (fc1): Linear(in_features=1152, out_features=4304, bias=True)
87
- (fc2): Linear(in_features=4304, out_features=1152, bias=True)
88
- )
89
- (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
90
- )
91
- )
92
- )
93
- )
94
- )
95
- (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
96
- )
97
- )
98
- )
99
- (connector): FullyShardedDataParallel(
100
- (_fsdp_wrapped_module): CheckpointWrapper(
101
- (_checkpoint_wrapped_module): MLPconnector(
102
- (activation_fn): PytorchGELUTanh()
103
- (fc1): Linear(in_features=1152, out_features=3584, bias=True)
104
- (fc2): Linear(in_features=3584, out_features=3584, bias=True)
105
- )
106
- )
107
- )
108
- (vit_pos_embed): FullyShardedDataParallel(
109
- (_fsdp_wrapped_module): PositionEmbedding()
110
- )
111
- )
112
- )
113
- _flat_param True
114
- language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
115
- language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
116
- language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
117
- language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
118
- language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
119
- language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
120
- language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
121
- language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
122
- language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
123
- language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
124
- language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
125
- language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
126
- language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
127
- language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
128
- language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
129
- language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
130
- language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
131
- language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
132
- language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
133
- language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
134
- language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
135
- language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
136
- language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
137
- language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
138
- language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
139
- language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
140
- language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
141
- language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
142
- time_embedder._fsdp_wrapped_module._flat_param True
143
- latent_pos_embed._fsdp_wrapped_module._flat_param False
144
- vit_model.vision_model._fsdp_wrapped_module._flat_param True
145
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
146
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
147
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
148
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
149
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
150
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
151
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
152
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
153
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
154
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
155
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
156
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
157
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
158
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
159
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
160
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
161
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
162
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
163
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
164
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
165
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
166
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
167
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
168
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
169
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
170
- vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
171
- connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
172
- vit_pos_embed._fsdp_wrapped_module._flat_param False
173
- Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
174
  wandb: Detected [huggingface_hub.inference] in use.
175
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
176
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
@@ -1007,6 +834,179 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1007
  [2025-12-30 22:59:02] (step=0000823) Train Loss mse: 0.0429, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1008
  [2025-12-30 22:59:18] (step=0000824) Train Loss mse: 0.0383, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1009
  [2025-12-30 22:59:30] (step=0000825) Train Loss mse: 0.0483, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1010
  [2025-12-30 22:59:42] (step=0000826) Train Loss mse: 0.0364, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1011
  [2025-12-30 22:59:55] (step=0000827) Train Loss mse: 0.0403, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1012
  [2025-12-30 23:00:08] (step=0000828) Train Loss mse: 0.0478, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
@@ -1025,7 +1025,7 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
1025
  [2025-12-30 23:03:06] (step=0000841) Train Loss mse: 0.0491, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1026
  [2025-12-30 23:03:19] (step=0000842) Train Loss mse: 0.0546, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1027
  [2025-12-30 23:03:35] (step=0000843) Train Loss mse: 0.0530, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1028
- Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
1029
  [2025-12-30 23:04:04] (step=0000845) Train Loss mse: 0.0425, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1030
  [2025-12-30 23:04:17] (step=0000846) Train Loss mse: 0.0424, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1031
  [2025-12-30 23:04:33] (step=0000847) Train Loss mse: 0.0337, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
@@ -3132,29 +3132,7 @@ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
3132
  [2025-12-31 07:13:08] (step=0002945) Train Loss mse: 0.0254, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3133
  [2025-12-31 07:13:21] (step=0002946) Train Loss mse: 0.0386, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3134
  [2025-12-31 07:13:34] (step=0002947) Train Loss mse: 0.0290, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3135
- [2025-12-31 07:13:48] (step=0002948) Train Loss mse: 0.0196, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3136
- [2025-12-31 07:14:03] (step=0002949) Train Loss mse: 0.0272, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3137
- [2025-12-31 07:14:19] (step=0002950) Train Loss mse: 0.0222, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3138
- [2025-12-31 07:14:29] (step=0002951) Train Loss mse: 0.0206, Train Loss ce: 0.0000, Train Steps/Sec: 0.11,
3139
- [2025-12-31 07:14:41] (step=0002952) Train Loss mse: 0.0373, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3140
- [2025-12-31 07:14:54] (step=0002953) Train Loss mse: 0.0368, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3141
- [2025-12-31 07:15:08] (step=0002954) Train Loss mse: 0.0234, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3142
- [2025-12-31 07:15:24] (step=0002955) Train Loss mse: 0.0337, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3143
- [2025-12-31 07:15:37] (step=0002956) Train Loss mse: 0.0295, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3144
- [2025-12-31 07:15:53] (step=0002957) Train Loss mse: 0.0307, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3145
- [2025-12-31 07:16:06] (step=0002958) Train Loss mse: 0.0301, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3146
- [2025-12-31 07:16:17] (step=0002959) Train Loss mse: 0.0331, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3147
- [2025-12-31 07:16:33] (step=0002960) Train Loss mse: 0.0245, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3148
- [2025-12-31 07:16:47] (step=0002961) Train Loss mse: 0.0196, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3149
- [2025-12-31 07:17:00] (step=0002962) Train Loss mse: 0.0269, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3150
- [2025-12-31 07:17:14] (step=0002963) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3151
- [2025-12-31 07:17:30] (step=0002964) Train Loss mse: 0.0233, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3152
- [2025-12-31 07:17:46] (step=0002965) Train Loss mse: 0.0461, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3153
- [2025-12-31 07:17:57] (step=0002966) Train Loss mse: 0.0241, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3154
- [2025-12-31 07:18:10] (step=0002967) Train Loss mse: 0.0296, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3155
- [2025-12-31 07:18:24] (step=0002968) Train Loss mse: 0.0290, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3156
- [2025-12-31 07:18:40] (step=0002969) Train Loss mse: 0.0319, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3157
- [2025-12-31 07:18:56] (step=0002970) Train Loss mse: 0.0254, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3158
  [2025-12-31 07:19:10] (step=0002971) Train Loss mse: 0.0283, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3159
  [2025-12-31 07:19:22] (step=0002972) Train Loss mse: 0.0399, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3160
  [2025-12-31 07:19:38] (step=0002973) Train Loss mse: 0.0229, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  wandb: Detected [huggingface_hub.inference] in use.
2
  wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
3
  wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
 
834
  [2025-12-30 22:59:02] (step=0000823) Train Loss mse: 0.0429, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
835
  [2025-12-30 22:59:18] (step=0000824) Train Loss mse: 0.0383, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
836
  [2025-12-30 22:59:30] (step=0000825) Train Loss mse: 0.0483, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
837
+ FullyShardedDataParallel(
838
+ (_fsdp_wrapped_module): Bagel(
839
+ (language_model): Qwen2ForCausalLM(
840
+ (model): Qwen2Model(
841
+ (embed_tokens): Embedding(152064, 3584)
842
+ (layers): ModuleList(
843
+ (0-27): 28 x FullyShardedDataParallel(
844
+ (_fsdp_wrapped_module): CheckpointWrapper(
845
+ (_checkpoint_wrapped_module): Qwen2MoTDecoderLayer(
846
+ (self_attn): PackedAttentionMoT(
847
+ (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
848
+ (k_proj): Linear(in_features=3584, out_features=512, bias=True)
849
+ (v_proj): Linear(in_features=3584, out_features=512, bias=True)
850
+ (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
851
+ (q_norm): Qwen2RMSNorm((128,), eps=1e-06)
852
+ (k_norm): Qwen2RMSNorm((128,), eps=1e-06)
853
+ (q_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
854
+ (k_norm_moe_gen): Qwen2RMSNorm((128,), eps=1e-06)
855
+ (q_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=True)
856
+ (k_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
857
+ (v_proj_moe_gen): Linear(in_features=3584, out_features=512, bias=True)
858
+ (o_proj_moe_gen): Linear(in_features=3584, out_features=3584, bias=False)
859
+ )
860
+ (mlp): Qwen2MLP(
861
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
862
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
863
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
864
+ (act_fn): SiLU()
865
+ )
866
+ (mlp_moe_gen): Qwen2MLP(
867
+ (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
868
+ (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
869
+ (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
870
+ (act_fn): SiLU()
871
+ )
872
+ (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
873
+ (input_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
874
+ (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
875
+ (post_attention_layernorm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
876
+ )
877
+ )
878
+ )
879
+ )
880
+ (norm): Qwen2RMSNorm((3584,), eps=1e-06)
881
+ (norm_moe_gen): Qwen2RMSNorm((3584,), eps=1e-06)
882
+ (rotary_emb): Qwen2RotaryEmbedding()
883
+ )
884
+ (lm_head): Linear(in_features=3584, out_features=152064, bias=False)
885
+ )
886
+ (time_embedder): FullyShardedDataParallel(
887
+ (_fsdp_wrapped_module): TimestepEmbedder(
888
+ (mlp): Sequential(
889
+ (0): Linear(in_features=256, out_features=3584, bias=True)
890
+ (1): SiLU()
891
+ (2): Linear(in_features=3584, out_features=3584, bias=True)
892
+ )
893
+ )
894
+ )
895
+ (vae2llm): Linear(in_features=64, out_features=3584, bias=True)
896
+ (llm2vae): Linear(in_features=3584, out_features=64, bias=True)
897
+ (latent_pos_embed): FullyShardedDataParallel(
898
+ (_fsdp_wrapped_module): PositionEmbedding()
899
+ )
900
+ (vit_model): SiglipVisionModel(
901
+ (vision_model): FullyShardedDataParallel(
902
+ (_fsdp_wrapped_module): SiglipVisionTransformer(
903
+ (embeddings): SiglipVisionEmbeddings(
904
+ (position_embedding): Embedding(4900, 1152)
905
+ (patch_embedding): Linear(in_features=588, out_features=1152, bias=True)
906
+ )
907
+ (encoder): SiglipEncoder(
908
+ (layers): ModuleList(
909
+ (0-25): 26 x FullyShardedDataParallel(
910
+ (_fsdp_wrapped_module): CheckpointWrapper(
911
+ (_checkpoint_wrapped_module): SiglipEncoderLayer(
912
+ (self_attn): SiglipFlashAttention2(
913
+ (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
914
+ (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
915
+ (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
916
+ (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
917
+ )
918
+ (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
919
+ (mlp): SiglipMLP(
920
+ (activation_fn): PytorchGELUTanh()
921
+ (fc1): Linear(in_features=1152, out_features=4304, bias=True)
922
+ (fc2): Linear(in_features=4304, out_features=1152, bias=True)
923
+ )
924
+ (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
925
+ )
926
+ )
927
+ )
928
+ )
929
+ )
930
+ (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
931
+ )
932
+ )
933
+ )
934
+ (connector): FullyShardedDataParallel(
935
+ (_fsdp_wrapped_module): CheckpointWrapper(
936
+ (_checkpoint_wrapped_module): MLPconnector(
937
+ (activation_fn): PytorchGELUTanh()
938
+ (fc1): Linear(in_features=1152, out_features=3584, bias=True)
939
+ (fc2): Linear(in_features=3584, out_features=3584, bias=True)
940
+ )
941
+ )
942
+ )
943
+ (vit_pos_embed): FullyShardedDataParallel(
944
+ (_fsdp_wrapped_module): PositionEmbedding()
945
+ )
946
+ )
947
+ )
948
+ _flat_param True
949
+ language_model.model.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
950
+ language_model.model.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
951
+ language_model.model.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
952
+ language_model.model.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
953
+ language_model.model.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
954
+ language_model.model.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
955
+ language_model.model.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
956
+ language_model.model.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
957
+ language_model.model.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
958
+ language_model.model.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
959
+ language_model.model.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
960
+ language_model.model.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
961
+ language_model.model.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
962
+ language_model.model.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
963
+ language_model.model.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
964
+ language_model.model.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
965
+ language_model.model.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
966
+ language_model.model.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
967
+ language_model.model.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
968
+ language_model.model.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
969
+ language_model.model.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
970
+ language_model.model.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
971
+ language_model.model.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
972
+ language_model.model.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
973
+ language_model.model.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
974
+ language_model.model.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
975
+ language_model.model.layers.26._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
976
+ language_model.model.layers.27._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
977
+ time_embedder._fsdp_wrapped_module._flat_param True
978
+ latent_pos_embed._fsdp_wrapped_module._flat_param False
979
+ vit_model.vision_model._fsdp_wrapped_module._flat_param True
980
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.0._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
981
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.1._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
982
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.2._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
983
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.3._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
984
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.4._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
985
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.5._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
986
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.6._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
987
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.7._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
988
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.8._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
989
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.9._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
990
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.10._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
991
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.11._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
992
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.12._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
993
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.13._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
994
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.14._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
995
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.15._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
996
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.16._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
997
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.17._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
998
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.18._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
999
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.19._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1000
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.20._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1001
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.21._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1002
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.22._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1003
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.23._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1004
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.24._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1005
+ vit_model.vision_model._fsdp_wrapped_module.encoder.layers.25._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1006
+ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
1007
+ vit_pos_embed._fsdp_wrapped_module._flat_param False
1008
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
1009
+ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
1010
  [2025-12-30 22:59:42] (step=0000826) Train Loss mse: 0.0364, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1011
  [2025-12-30 22:59:55] (step=0000827) Train Loss mse: 0.0403, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1012
  [2025-12-30 23:00:08] (step=0000828) Train Loss mse: 0.0478, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
 
1025
  [2025-12-30 23:03:06] (step=0000841) Train Loss mse: 0.0491, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1026
  [2025-12-30 23:03:19] (step=0000842) Train Loss mse: 0.0546, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1027
  [2025-12-30 23:03:35] (step=0000843) Train Loss mse: 0.0530, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1028
+ [2025-12-30 23:03:51] (step=0000844) Train Loss mse: 0.0270, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1029
  [2025-12-30 23:04:04] (step=0000845) Train Loss mse: 0.0425, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1030
  [2025-12-30 23:04:17] (step=0000846) Train Loss mse: 0.0424, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1031
  [2025-12-30 23:04:33] (step=0000847) Train Loss mse: 0.0337, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
 
3132
  [2025-12-31 07:13:08] (step=0002945) Train Loss mse: 0.0254, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3133
  [2025-12-31 07:13:21] (step=0002946) Train Loss mse: 0.0386, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3134
  [2025-12-31 07:13:34] (step=0002947) Train Loss mse: 0.0290, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3135
+ [2025-12-31 07:13:48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3136
  [2025-12-31 07:19:10] (step=0002971) Train Loss mse: 0.0283, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3137
  [2025-12-31 07:19:22] (step=0002972) Train Loss mse: 0.0399, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3138
  [2025-12-31 07:19:38] (step=0002973) Train Loss mse: 0.0229, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260102_214304-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/output.log CHANGED
@@ -747,6 +747,121 @@ wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/
747
  [2026-01-03 00:39:54] (step=0000736) Train Loss mse: 0.0265, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
748
  [2026-01-03 00:40:11] (step=0000737) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
749
  [2026-01-03 00:40:24] (step=0000738) Train Loss mse: 0.0433, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
750
  FullyShardedDataParallel(
751
  (_fsdp_wrapped_module): Bagel(
752
  (language_model): Qwen2ForCausalLM(
@@ -920,121 +1035,6 @@ connector._fsdp_wrapped_module._checkpoint_wrapped_module._flat_param True
920
  vit_pos_embed._fsdp_wrapped_module._flat_param False
921
  Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
922
  Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
923
- [2026-01-03 00:40:37] (step=0000739) Train Loss mse: 0.0488, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
924
- [2026-01-03 00:40:50] (step=0000740) Train Loss mse: 0.0456, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
925
- [2026-01-03 00:41:04] (step=0000741) Train Loss mse: 0.0518, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
926
- [2026-01-03 00:41:16] (step=0000742) Train Loss mse: 0.0441, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
927
- [2026-01-03 00:41:27] (step=0000743) Train Loss mse: 0.0542, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
928
- [2026-01-03 00:41:41] (step=0000744) Train Loss mse: 0.0310, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
929
- [2026-01-03 00:41:53] (step=0000745) Train Loss mse: 0.0629, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
930
- [2026-01-03 00:42:09] (step=0000746) Train Loss mse: 0.0451, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
931
- [2026-01-03 00:42:23] (step=0000747) Train Loss mse: 0.0331, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
932
- [2026-01-03 00:42:36] (step=0000748) Train Loss mse: 0.0387, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
933
- [2026-01-03 00:42:47] (step=0000749) Train Loss mse: 0.0353, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
934
- [2026-01-03 00:42:59] (step=0000750) Train Loss mse: 0.0455, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
935
- [2026-01-03 00:43:12] (step=0000751) Train Loss mse: 0.0436, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
936
- [2026-01-03 00:43:25] (step=0000752) Train Loss mse: 0.0441, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
937
- [2026-01-03 00:43:41] (step=0000753) Train Loss mse: 0.0404, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
938
- [2026-01-03 00:43:51] (step=0000754) Train Loss mse: 0.0379, Train Loss ce: 0.0000, Train Steps/Sec: 0.10,
939
- [2026-01-03 00:44:04] (step=0000755) Train Loss mse: 0.0340, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
940
- [2026-01-03 00:44:20] (step=0000756) Train Loss mse: 0.0289, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
941
- [2026-01-03 00:44:36] (step=0000757) Train Loss mse: 0.0436, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
942
- [2026-01-03 00:44:52] (step=0000758) Train Loss mse: 0.0460, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
943
- [2026-01-03 00:45:05] (step=0000759) Train Loss mse: 0.0317, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
944
- [2026-01-03 00:45:22] (step=0000760) Train Loss mse: 0.0320, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
945
- [2026-01-03 00:45:35] (step=0000761) Train Loss mse: 0.0383, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
946
- [2026-01-03 00:45:46] (step=0000762) Train Loss mse: 0.0434, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
947
- [2026-01-03 00:46:03] (step=0000763) Train Loss mse: 0.0465, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
948
- [2026-01-03 00:46:14] (step=0000764) Train Loss mse: 0.0320, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
949
- [2026-01-03 00:46:27] (step=0000765) Train Loss mse: 0.0423, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
950
- [2026-01-03 00:46:40] (step=0000766) Train Loss mse: 0.0428, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
951
- [2026-01-03 00:46:52] (step=0000767) Train Loss mse: 0.0392, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
952
- [2026-01-03 00:47:06] (step=0000768) Train Loss mse: 0.0506, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
953
- [2026-01-03 00:47:22] (step=0000769) Train Loss mse: 0.0424, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
954
- [2026-01-03 00:47:34] (step=0000770) Train Loss mse: 0.0405, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
955
- [2026-01-03 00:47:49] (step=0000771) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
956
- [2026-01-03 00:48:02] (step=0000772) Train Loss mse: 0.0469, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
957
- [2026-01-03 00:48:15] (step=0000773) Train Loss mse: 0.0456, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
958
- [2026-01-03 00:48:28] (step=0000774) Train Loss mse: 0.0431, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
959
- [2026-01-03 00:48:44] (step=0000775) Train Loss mse: 0.0369, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
960
- [2026-01-03 00:49:00] (step=0000776) Train Loss mse: 0.0357, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
961
- [2026-01-03 00:49:14] (step=0000777) Train Loss mse: 0.0466, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
962
- [2026-01-03 00:49:30] (step=0000778) Train Loss mse: 0.0367, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
963
- [2026-01-03 00:49:46] (step=0000779) Train Loss mse: 0.0452, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
964
- [2026-01-03 00:49:56] (step=0000780) Train Loss mse: 0.0635, Train Loss ce: 0.0000, Train Steps/Sec: 0.10,
965
- [2026-01-03 00:50:08] (step=0000781) Train Loss mse: 0.0540, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
966
- [2026-01-03 00:50:24] (step=0000782) Train Loss mse: 0.0468, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
967
- [2026-01-03 00:50:36] (step=0000783) Train Loss mse: 0.0439, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
968
- [2026-01-03 00:50:48] (step=0000784) Train Loss mse: 0.0506, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
969
- [2026-01-03 00:51:00] (step=0000785) Train Loss mse: 0.0376, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
970
- [2026-01-03 00:51:16] (step=0000786) Train Loss mse: 0.0326, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
971
- [2026-01-03 00:51:29] (step=0000787) Train Loss mse: 0.0364, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
972
- [2026-01-03 00:51:41] (step=0000788) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
973
- [2026-01-03 00:51:57] (step=0000789) Train Loss mse: 0.0470, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
974
- [2026-01-03 00:52:13] (step=0000790) Train Loss mse: 0.0311, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
975
- [2026-01-03 00:52:27] (step=0000791) Train Loss mse: 0.0399, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
976
- [2026-01-03 00:52:43] (step=0000792) Train Loss mse: 0.0250, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
977
- [2026-01-03 00:52:55] (step=0000793) Train Loss mse: 0.0513, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
978
- [2026-01-03 00:53:06] (step=0000794) Train Loss mse: 0.0426, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
979
- [2026-01-03 00:53:20] (step=0000795) Train Loss mse: 0.0637, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
980
- [2026-01-03 00:53:33] (step=0000796) Train Loss mse: 0.0504, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
981
- [2026-01-03 00:53:48] (step=0000797) Train Loss mse: 0.0467, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
982
- [2026-01-03 00:54:00] (step=0000798) Train Loss mse: 0.0493, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
983
- [2026-01-03 00:54:16] (step=0000799) Train Loss mse: 0.0400, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
984
- [2026-01-03 00:54:32] (step=0000800) Train Loss mse: 0.0519, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
985
- [2026-01-03 00:54:44] (step=0000801) Train Loss mse: 0.0510, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
986
- [2026-01-03 00:54:58] (step=0000802) Train Loss mse: 0.0425, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
987
- [2026-01-03 00:55:09] (step=0000803) Train Loss mse: 0.0396, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
988
- [2026-01-03 00:55:19] (step=0000804) Train Loss mse: 0.0541, Train Loss ce: 0.0000, Train Steps/Sec: 0.11,
989
- [2026-01-03 00:55:32] (step=0000805) Train Loss mse: 0.0452, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
990
- [2026-01-03 00:55:43] (step=0000806) Train Loss mse: 0.0527, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
991
- [2026-01-03 00:55:56] (step=0000807) Train Loss mse: 0.0441, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
992
- [2026-01-03 00:56:13] (step=0000808) Train Loss mse: 0.0269, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
993
- [2026-01-03 00:56:26] (step=0000809) Train Loss mse: 0.0419, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
994
- [2026-01-03 00:56:39] (step=0000810) Train Loss mse: 0.0485, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
995
- [2026-01-03 00:56:53] (step=0000811) Train Loss mse: 0.0351, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
996
- [2026-01-03 00:57:09] (step=0000812) Train Loss mse: 0.0273, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
997
- [2026-01-03 00:57:25] (step=0000813) Train Loss mse: 0.0412, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
998
- [2026-01-03 00:57:37] (step=0000814) Train Loss mse: 0.0527, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
999
- [2026-01-03 00:57:53] (step=0000815) Train Loss mse: 0.0316, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1000
- [2026-01-03 00:58:06] (step=0000816) Train Loss mse: 0.0294, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1001
- [2026-01-03 00:58:23] (step=0000817) Train Loss mse: 0.0337, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1002
- [2026-01-03 00:58:39] (step=0000818) Train Loss mse: 0.0415, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1003
- [2026-01-03 00:58:52] (step=0000819) Train Loss mse: 0.0356, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1004
- [2026-01-03 00:59:05] (step=0000820) Train Loss mse: 0.0428, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1005
- [2026-01-03 00:59:19] (step=0000821) Train Loss mse: 0.0456, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1006
- [2026-01-03 00:59:33] (step=0000822) Train Loss mse: 0.0497, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1007
- [2026-01-03 00:59:46] (step=0000823) Train Loss mse: 0.0428, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1008
- [2026-01-03 01:00:02] (step=0000824) Train Loss mse: 0.0385, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1009
- [2026-01-03 01:00:13] (step=0000825) Train Loss mse: 0.0499, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1010
- [2026-01-03 01:00:25] (step=0000826) Train Loss mse: 0.0360, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1011
- [2026-01-03 01:00:39] (step=0000827) Train Loss mse: 0.0415, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1012
- [2026-01-03 01:00:52] (step=0000828) Train Loss mse: 0.0479, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1013
- [2026-01-03 01:01:05] (step=0000829) Train Loss mse: 0.0361, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1014
- [2026-01-03 01:01:17] (step=0000830) Train Loss mse: 0.0509, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1015
- [2026-01-03 01:01:30] (step=0000831) Train Loss mse: 0.0425, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1016
- [2026-01-03 01:01:43] (step=0000832) Train Loss mse: 0.0365, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1017
- [2026-01-03 01:01:59] (step=0000833) Train Loss mse: 0.0427, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1018
- [2026-01-03 01:02:11] (step=0000834) Train Loss mse: 0.0505, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1019
- [2026-01-03 01:02:24] (step=0000835) Train Loss mse: 0.0533, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1020
- [2026-01-03 01:02:40] (step=0000836) Train Loss mse: 0.0256, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1021
- [2026-01-03 01:02:53] (step=0000837) Train Loss mse: 0.0375, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1022
- [2026-01-03 01:03:09] (step=0000838) Train Loss mse: 0.0386, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1023
- [2026-01-03 01:03:22] (step=0000839) Train Loss mse: 0.0483, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1024
- [2026-01-03 01:03:34] (step=0000840) Train Loss mse: 0.0459, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1025
- [2026-01-03 01:03:50] (step=0000841) Train Loss mse: 0.0493, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1026
- [2026-01-03 01:04:04] (step=0000842) Train Loss mse: 0.0548, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1027
- [2026-01-03 01:04:20] (step=0000843) Train Loss mse: 0.0532, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1028
- [2026-01-03 01:04:36] (step=0000844) Train Loss mse: 0.0271, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1029
- [2026-01-03 01:04:48] (step=0000845) Train Loss mse: 0.0427, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1030
- [2026-01-03 01:05:02] (step=0000846) Train Loss mse: 0.0425, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
1031
- [2026-01-03 01:05:18] (step=0000847) Train Loss mse: 0.0339, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1032
- [2026-01-03 01:05:34] (step=0000848) Train Loss mse: 0.0311, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1033
- [2026-01-03 01:05:50] (step=0000849) Train Loss mse: 0.0434, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
1034
- [2026-01-03 01:06:01] (step=0000850) Train Loss mse: 0.0379, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1035
- [2026-01-03 01:06:14] (step=0000851) Train Loss mse: 0.0447, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1036
- [2026-01-03 01:06:26] (step=0000852) Train Loss mse: 0.0516, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1037
- [2026-01-03 01:06:38] (step=0000853) Train Loss mse: 0.0351, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1038
  [2026-01-03 01:06:51] (step=0000854) Train Loss mse: 0.0607, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1039
  [2026-01-03 01:07:02] (step=0000855) Train Loss mse: 0.0458, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1040
  [2026-01-03 01:07:15] (step=0000856) Train Loss mse: 0.0375, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
@@ -3098,7 +3098,86 @@ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
3098
  [2026-01-03 09:06:11] (step=0002911) Train Loss mse: 0.0372, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3099
  [2026-01-03 09:06:24] (step=0002912) Train Loss mse: 0.0293, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3100
  [2026-01-03 09:06:37] (step=0002913) Train Loss mse: 0.0295, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3101
- [2026-01-03 09:06:54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3102
  [2026-01-03 09:25:26] (step=0002994) Train Loss mse: 0.0343, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3103
  [2026-01-03 09:25:38] (step=0002995) Train Loss mse: 0.0318, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3104
  [2026-01-03 09:25:51] (step=0002996) Train Loss mse: 0.0319, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
@@ -4109,67 +4188,76 @@ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
4109
  [2026-01-03 13:19:24] (step=0004001) Train Loss mse: 0.0321, Train Loss ce: 0.0000, Train Steps/Sec: 0.01,
4110
  [2026-01-03 13:19:40] (step=0004002) Train Loss mse: 0.0269, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4111
  [2026-01-03 13:19:53] (step=0004003) Train Loss mse: 0.0293, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4112
- [2026-01-03 09:10:47] (step=0002932) Train Loss mse: 0.0272, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4113
- [2026-01-03 09:11:00] (step=0002933) Train Loss mse: 0.0340, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4114
- [2026-01-03 09:11:13] (step=0002934) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4115
- [2026-01-03 09:11:30] (step=0002935) Train Loss mse: 0.0287, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4116
- [2026-01-03 09:11:46] (step=0002936) Train Loss mse: 0.0377, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4117
- [2026-01-03 09:12:03] (step=0002937) Train Loss mse: 0.0310, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4118
- [2026-01-03 09:12:16] (step=0002938) Train Loss mse: 0.0239, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4119
- [2026-01-03 09:12:28] (step=0002939) Train Loss mse: 0.0395, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4120
- [2026-01-03 09:12:44] (step=0002940) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4121
- [2026-01-03 09:12:57] (step=0002941) Train Loss mse: 0.0327, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4122
- [2026-01-03 09:13:10] (step=0002942) Train Loss mse: 0.0221, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4123
- [2026-01-03 09:13:26] (step=0002943) Train Loss mse: 0.0456, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4124
- [2026-01-03 09:13:41] (step=0002944) Train Loss mse: 0.0258, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4125
- [2026-01-03 09:13:55] (step=0002945) Train Loss mse: 0.0255, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4126
- [2026-01-03 09:14:07] (step=0002946) Train Loss mse: 0.0385, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4127
- [2026-01-03 09:14:21] (step=0002947) Train Loss mse: 0.0289, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4128
- [2026-01-03 09:14:35] (step=0002948) Train Loss mse: 0.0198, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4129
- [2026-01-03 09:14:50] (step=0002949) Train Loss mse: 0.0257, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4130
- [2026-01-03 09:15:06] (step=0002950) Train Loss mse: 0.0224, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4131
- [2026-01-03 09:15:16] (step=0002951) Train Loss mse: 0.0208, Train Loss ce: 0.0000, Train Steps/Sec: 0.11,
4132
- [2026-01-03 09:15:29] (step=0002952) Train Loss mse: 0.0373, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4133
- [2026-01-03 09:15:41] (step=0002953) Train Loss mse: 0.0369, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4134
- [2026-01-03 09:15:55] (step=0002954) Train Loss mse: 0.0235, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4135
- [2026-01-03 09:16:11] (step=0002955) Train Loss mse: 0.0317, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4136
- [2026-01-03 09:16:24] (step=0002956) Train Loss mse: 0.0285, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4137
- [2026-01-03 09:16:40] (step=0002957) Train Loss mse: 0.0308, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4138
- [2026-01-03 09:16:54] (step=0002958) Train Loss mse: 0.0306, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4139
- [2026-01-03 09:17:05] (step=0002959) Train Loss mse: 0.0333, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4140
- [2026-01-03 09:17:22] (step=0002960) Train Loss mse: 0.0246, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4141
- [2026-01-03 09:17:35] (step=0002961) Train Loss mse: 0.0196, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4142
- [2026-01-03 09:17:49] (step=0002962) Train Loss mse: 0.0269, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4143
- [2026-01-03 09:18:02] (step=0002963) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4144
- [2026-01-03 09:18:18] (step=0002964) Train Loss mse: 0.0235, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4145
- [2026-01-03 09:18:34] (step=0002965) Train Loss mse: 0.0461, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4146
- [2026-01-03 09:18:45] (step=0002966) Train Loss mse: 0.0278, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4147
- [2026-01-03 09:18:58] (step=0002967) Train Loss mse: 0.0296, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4148
- [2026-01-03 09:19:12] (step=0002968) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4149
- [2026-01-03 09:19:28] (step=0002969) Train Loss mse: 0.0320, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4150
- [2026-01-03 09:19:44] (step=0002970) Train Loss mse: 0.0255, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4151
- [2026-01-03 09:19:57] (step=0002971) Train Loss mse: 0.0284, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4152
- [2026-01-03 09:20:10] (step=0002972) Train Loss mse: 0.0411, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4153
- [2026-01-03 09:20:26] (step=0002973) Train Loss mse: 0.0230, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4154
- [2026-01-03 09:20:43] (step=0002974) Train Loss mse: 0.0307, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4155
- [2026-01-03 09:20:56] (step=0002975) Train Loss mse: 0.0432, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4156
- [2026-01-03 09:21:12] (step=0002976) Train Loss mse: 0.0236, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4157
- [2026-01-03 09:21:23] (step=0002977) Train Loss mse: 0.0258, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4158
- [2026-01-03 09:21:40] (step=0002978) Train Loss mse: 0.0384, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4159
- [2026-01-03 09:21:53] (step=0002979) Train Loss mse: 0.0200, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4160
- [2026-01-03 09:22:06] (step=0002980) Train Loss mse: 0.0263, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4161
- [2026-01-03 09:22:18] (step=0002981) Train Loss mse: 0.0279, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4162
- [2026-01-03 09:22:34] (step=0002982) Train Loss mse: 0.0299, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4163
- [2026-01-03 09:22:50] (step=0002983) Train Loss mse: 0.0211, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4164
- [2026-01-03 09:23:03] (step=0002984) Train Loss mse: 0.0263, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4165
- [2026-01-03 09:23:17] (step=0002985) Train Loss mse: 0.0286, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4166
- [2026-01-03 09:23:30] (step=0002986) Train Loss mse: 0.0498, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4167
- [2026-01-03 09:23:43] (step=0002987) Train Loss mse: 0.0259, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4168
- [2026-01-03 09:23:56] (step=0002988) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4169
- [2026-01-03 09:24:09] (step=0002989) Train Loss mse: 0.0224, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4170
- [2026-01-03 09:24:25] (step=0002990) Train Loss mse: 0.0231, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4171
- [2026-01-03 09:24:41] (step=0002991) Train Loss mse: 0.0408, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4172
- [2026-01-03 09:24:57] (step=0002992) Train Loss mse: 0.0458, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
 
 
 
 
 
 
 
 
 
4173
  [2026-01-03 13:36:13] (step=0004074) Train Loss mse: 0.0289, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4174
  [2026-01-03 13:36:25] (step=0004075) Train Loss mse: 0.0329, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4175
  [2026-01-03 13:36:38] (step=0004076) Train Loss mse: 0.0261, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
@@ -5098,5 +5186,4 @@ Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
5098
  [2026-01-03 17:08:32] (step=0004999) Train Loss mse: 0.0250, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
5099
  [2026-01-03 17:08:48] (step=0005000) Train Loss mse: 0.0230, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
5100
  [2026-01-03 17:10:07] Saving checkpoint to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/0005000.
5101
- [2026-01-03 17:12:43] Done!
5102
- [2026-01-03 13:36:13] (step=0004074) Train Loss mse: 0.0289, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
 
747
  [2026-01-03 00:39:54] (step=0000736) Train Loss mse: 0.0265, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
748
  [2026-01-03 00:40:11] (step=0000737) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
749
  [2026-01-03 00:40:24] (step=0000738) Train Loss mse: 0.0433, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
750
+ [2026-01-03 00:40:37] (step=0000739) Train Loss mse: 0.0488, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
751
+ [2026-01-03 00:40:50] (step=0000740) Train Loss mse: 0.0456, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
752
+ [2026-01-03 00:41:04] (step=0000741) Train Loss mse: 0.0518, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
753
+ [2026-01-03 00:41:16] (step=0000742) Train Loss mse: 0.0441, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
754
+ [2026-01-03 00:41:27] (step=0000743) Train Loss mse: 0.0542, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
755
+ [2026-01-03 00:41:41] (step=0000744) Train Loss mse: 0.0310, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
756
+ [2026-01-03 00:41:53] (step=0000745) Train Loss mse: 0.0629, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
757
+ [2026-01-03 00:42:09] (step=0000746) Train Loss mse: 0.0451, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
758
+ [2026-01-03 00:42:23] (step=0000747) Train Loss mse: 0.0331, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
759
+ [2026-01-03 00:42:36] (step=0000748) Train Loss mse: 0.0387, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
760
+ [2026-01-03 00:42:47] (step=0000749) Train Loss mse: 0.0353, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
761
+ [2026-01-03 00:42:59] (step=0000750) Train Loss mse: 0.0455, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
762
+ [2026-01-03 00:43:12] (step=0000751) Train Loss mse: 0.0436, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
763
+ [2026-01-03 00:43:25] (step=0000752) Train Loss mse: 0.0441, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
764
+ [2026-01-03 00:43:41] (step=0000753) Train Loss mse: 0.0404, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
765
+ [2026-01-03 00:43:51] (step=0000754) Train Loss mse: 0.0379, Train Loss ce: 0.0000, Train Steps/Sec: 0.10,
766
+ [2026-01-03 00:44:04] (step=0000755) Train Loss mse: 0.0340, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
767
+ [2026-01-03 00:44:20] (step=0000756) Train Loss mse: 0.0289, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
768
+ [2026-01-03 00:44:36] (step=0000757) Train Loss mse: 0.0436, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
769
+ [2026-01-03 00:44:52] (step=0000758) Train Loss mse: 0.0460, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
770
+ [2026-01-03 00:45:05] (step=0000759) Train Loss mse: 0.0317, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
771
+ [2026-01-03 00:45:22] (step=0000760) Train Loss mse: 0.0320, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
772
+ [2026-01-03 00:45:35] (step=0000761) Train Loss mse: 0.0383, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
773
+ [2026-01-03 00:45:46] (step=0000762) Train Loss mse: 0.0434, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
774
+ [2026-01-03 00:46:03] (step=0000763) Train Loss mse: 0.0465, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
775
+ [2026-01-03 00:46:14] (step=0000764) Train Loss mse: 0.0320, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
776
+ [2026-01-03 00:46:27] (step=0000765) Train Loss mse: 0.0423, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
777
+ [2026-01-03 00:46:40] (step=0000766) Train Loss mse: 0.0428, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
778
+ [2026-01-03 00:46:52] (step=0000767) Train Loss mse: 0.0392, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
779
+ [2026-01-03 00:47:06] (step=0000768) Train Loss mse: 0.0506, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
780
+ [2026-01-03 00:47:22] (step=0000769) Train Loss mse: 0.0424, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
781
+ [2026-01-03 00:47:34] (step=0000770) Train Loss mse: 0.0405, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
782
+ [2026-01-03 00:47:49] (step=0000771) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
783
+ [2026-01-03 00:48:02] (step=0000772) Train Loss mse: 0.0469, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
784
+ [2026-01-03 00:48:15] (step=0000773) Train Loss mse: 0.0456, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
785
+ [2026-01-03 00:48:28] (step=0000774) Train Loss mse: 0.0431, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
786
+ [2026-01-03 00:48:44] (step=0000775) Train Loss mse: 0.0369, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
787
+ [2026-01-03 00:49:00] (step=0000776) Train Loss mse: 0.0357, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
788
+ [2026-01-03 00:49:14] (step=0000777) Train Loss mse: 0.0466, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
789
+ [2026-01-03 00:49:30] (step=0000778) Train Loss mse: 0.0367, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
790
+ [2026-01-03 00:49:46] (step=0000779) Train Loss mse: 0.0452, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
791
+ [2026-01-03 00:49:56] (step=0000780) Train Loss mse: 0.0635, Train Loss ce: 0.0000, Train Steps/Sec: 0.10,
792
+ [2026-01-03 00:50:08] (step=0000781) Train Loss mse: 0.0540, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
793
+ [2026-01-03 00:50:24] (step=0000782) Train Loss mse: 0.0468, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
794
+ [2026-01-03 00:50:36] (step=0000783) Train Loss mse: 0.0439, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
795
+ [2026-01-03 00:50:48] (step=0000784) Train Loss mse: 0.0506, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
796
+ [2026-01-03 00:51:00] (step=0000785) Train Loss mse: 0.0376, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
797
+ [2026-01-03 00:51:16] (step=0000786) Train Loss mse: 0.0326, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
798
+ [2026-01-03 00:51:29] (step=0000787) Train Loss mse: 0.0364, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
799
+ [2026-01-03 00:51:41] (step=0000788) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
800
+ [2026-01-03 00:51:57] (step=0000789) Train Loss mse: 0.0470, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
801
+ [2026-01-03 00:52:13] (step=0000790) Train Loss mse: 0.0311, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
802
+ [2026-01-03 00:52:27] (step=0000791) Train Loss mse: 0.0399, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
803
+ [2026-01-03 00:52:43] (step=0000792) Train Loss mse: 0.0250, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
804
+ [2026-01-03 00:52:55] (step=0000793) Train Loss mse: 0.0513, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
805
+ [2026-01-03 00:53:06] (step=0000794) Train Loss mse: 0.0426, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
806
+ [2026-01-03 00:53:20] (step=0000795) Train Loss mse: 0.0637, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
807
+ [2026-01-03 00:53:33] (step=0000796) Train Loss mse: 0.0504, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
808
+ [2026-01-03 00:53:48] (step=0000797) Train Loss mse: 0.0467, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
809
+ [2026-01-03 00:54:00] (step=0000798) Train Loss mse: 0.0493, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
810
+ [2026-01-03 00:54:16] (step=0000799) Train Loss mse: 0.0400, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
811
+ [2026-01-03 00:54:32] (step=0000800) Train Loss mse: 0.0519, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
812
+ [2026-01-03 00:54:44] (step=0000801) Train Loss mse: 0.0510, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
813
+ [2026-01-03 00:54:58] (step=0000802) Train Loss mse: 0.0425, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
814
+ [2026-01-03 00:55:09] (step=0000803) Train Loss mse: 0.0396, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
815
+ [2026-01-03 00:55:19] (step=0000804) Train Loss mse: 0.0541, Train Loss ce: 0.0000, Train Steps/Sec: 0.11,
816
+ [2026-01-03 00:55:32] (step=0000805) Train Loss mse: 0.0452, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
817
+ [2026-01-03 00:55:43] (step=0000806) Train Loss mse: 0.0527, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
818
+ [2026-01-03 00:55:56] (step=0000807) Train Loss mse: 0.0441, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
819
+ [2026-01-03 00:56:13] (step=0000808) Train Loss mse: 0.0269, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
820
+ [2026-01-03 00:56:26] (step=0000809) Train Loss mse: 0.0419, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
821
+ [2026-01-03 00:56:39] (step=0000810) Train Loss mse: 0.0485, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
822
+ [2026-01-03 00:56:53] (step=0000811) Train Loss mse: 0.0351, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
823
+ [2026-01-03 00:57:09] (step=0000812) Train Loss mse: 0.0273, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
824
+ [2026-01-03 00:57:25] (step=0000813) Train Loss mse: 0.0412, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
825
+ [2026-01-03 00:57:37] (step=0000814) Train Loss mse: 0.0527, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
826
+ [2026-01-03 00:57:53] (step=0000815) Train Loss mse: 0.0316, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
827
+ [2026-01-03 00:58:06] (step=0000816) Train Loss mse: 0.0294, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
828
+ [2026-01-03 00:58:23] (step=0000817) Train Loss mse: 0.0337, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
829
+ [2026-01-03 00:58:39] (step=0000818) Train Loss mse: 0.0415, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
830
+ [2026-01-03 00:58:52] (step=0000819) Train Loss mse: 0.0356, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
831
+ [2026-01-03 00:59:05] (step=0000820) Train Loss mse: 0.0428, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
832
+ [2026-01-03 00:59:19] (step=0000821) Train Loss mse: 0.0456, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
833
+ [2026-01-03 00:59:33] (step=0000822) Train Loss mse: 0.0497, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
834
+ [2026-01-03 00:59:46] (step=0000823) Train Loss mse: 0.0428, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
835
+ [2026-01-03 01:00:02] (step=0000824) Train Loss mse: 0.0385, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
836
+ [2026-01-03 01:00:13] (step=0000825) Train Loss mse: 0.0499, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
837
+ [2026-01-03 01:00:25] (step=0000826) Train Loss mse: 0.0360, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
838
+ [2026-01-03 01:00:39] (step=0000827) Train Loss mse: 0.0415, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
839
+ [2026-01-03 01:00:52] (step=0000828) Train Loss mse: 0.0479, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
840
+ [2026-01-03 01:01:05] (step=0000829) Train Loss mse: 0.0361, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
841
+ [2026-01-03 01:01:17] (step=0000830) Train Loss mse: 0.0509, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
842
+ [2026-01-03 01:01:30] (step=0000831) Train Loss mse: 0.0425, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
843
+ [2026-01-03 01:01:43] (step=0000832) Train Loss mse: 0.0365, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
844
+ [2026-01-03 01:01:59] (step=0000833) Train Loss mse: 0.0427, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
845
+ [2026-01-03 01:02:11] (step=0000834) Train Loss mse: 0.0505, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
846
+ [2026-01-03 01:02:24] (step=0000835) Train Loss mse: 0.0533, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
847
+ [2026-01-03 01:02:40] (step=0000836) Train Loss mse: 0.0256, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
848
+ [2026-01-03 01:02:53] (step=0000837) Train Loss mse: 0.0375, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
849
+ [2026-01-03 01:03:09] (step=0000838) Train Loss mse: 0.0386, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
850
+ [2026-01-03 01:03:22] (step=0000839) Train Loss mse: 0.0483, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
851
+ [2026-01-03 01:03:34] (step=0000840) Train Loss mse: 0.0459, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
852
+ [2026-01-03 01:03:50] (step=0000841) Train Loss mse: 0.0493, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
853
+ [2026-01-03 01:04:04] (step=0000842) Train Loss mse: 0.0548, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
854
+ [2026-01-03 01:04:20] (step=0000843) Train Loss mse: 0.0532, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
855
+ [2026-01-03 01:04:36] (step=0000844) Train Loss mse: 0.0271, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
856
+ [2026-01-03 01:04:48] (step=0000845) Train Loss mse: 0.0427, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
857
+ [2026-01-03 01:05:02] (step=0000846) Train Loss mse: 0.0425, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
858
+ [2026-01-03 01:05:18] (step=0000847) Train Loss mse: 0.0339, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
859
+ [2026-01-03 01:05:34] (step=0000848) Train Loss mse: 0.0311, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
860
+ [2026-01-03 01:05:50] (step=0000849) Train Loss mse: 0.0434, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
861
+ [2026-01-03 01:06:01] (step=0000850) Train Loss mse: 0.0379, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
862
+ [2026-01-03 01:06:14] (step=0000851) Train Loss mse: 0.0447, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
863
+ [2026-01-03 01:06:26] (step=0000852) Train Loss mse: 0.0516, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
864
+ [2026-01-03 01:06:38] (step=0000853) Train Loss mse: 0.0351, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
865
  FullyShardedDataParallel(
866
  (_fsdp_wrapped_module): Bagel(
867
  (language_model): Qwen2ForCausalLM(
 
1035
  vit_pos_embed._fsdp_wrapped_module._flat_param False
1036
  Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
1037
  Preparing Dataset vlm_gym_jigsaw_mse_loss_only/vlm_gym_jigsaw_train
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1038
  [2026-01-03 01:06:51] (step=0000854) Train Loss mse: 0.0607, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
1039
  [2026-01-03 01:07:02] (step=0000855) Train Loss mse: 0.0458, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
1040
  [2026-01-03 01:07:15] (step=0000856) Train Loss mse: 0.0375, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
 
3098
  [2026-01-03 09:06:11] (step=0002911) Train Loss mse: 0.0372, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3099
  [2026-01-03 09:06:24] (step=0002912) Train Loss mse: 0.0293, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3100
  [2026-01-03 09:06:37] (step=0002913) Train Loss mse: 0.0295, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3101
+ [2026-01-03 09:06:54] (step=0002914) Train Loss mse: 0.0248, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3102
+ [2026-01-03 09:07:07] (step=0002915) Train Loss mse: 0.0321, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3103
+ [2026-01-03 09:07:23] (step=0002916) Train Loss mse: 0.0410, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3104
+ [2026-01-03 09:07:34] (step=0002917) Train Loss mse: 0.0278, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3105
+ [2026-01-03 09:07:48] (step=0002918) Train Loss mse: 0.0212, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3106
+ [2026-01-03 09:08:02] (step=0002919) Train Loss mse: 0.0319, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3107
+ [2026-01-03 09:08:15] (step=0002920) Train Loss mse: 0.0301, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3108
+ [2026-01-03 09:08:28] (step=0002921) Train Loss mse: 0.0250, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3109
+ [2026-01-03 09:08:42] (step=0002922) Train Loss mse: 0.0364, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3110
+ [2026-01-03 09:08:54] (step=0002923) Train Loss mse: 0.0326, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3111
+ [2026-01-03 09:09:07] (step=0002924) Train Loss mse: 0.0247, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3112
+ [2026-01-03 09:09:20] (step=0002925) Train Loss mse: 0.0255, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3113
+ [2026-01-03 09:09:33] (step=0002926) Train Loss mse: 0.0297, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3114
+ [2026-01-03 09:09:43] (step=0002927) Train Loss mse: 0.0442, Train Loss ce: 0.0000, Train Steps/Sec: 0.10,
3115
+ [2026-01-03 09:09:56] (step=0002928) Train Loss mse: 0.0392, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3116
+ [2026-01-03 09:10:09] (step=0002929) Train Loss mse: 0.0275, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3117
+ [2026-01-03 09:10:22] (step=0002930) Train Loss mse: 0.0327, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3118
+ [2026-01-03 09:10:34] (step=0002931) Train Loss mse: 0.0247, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3119
+ [2026-01-03 09:10:47] (step=0002932) Train Loss mse: 0.0272, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3120
+ [2026-01-03 09:11:00] (step=0002933) Train Loss mse: 0.0340, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3121
+ [2026-01-03 09:11:13] (step=0002934) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3122
+ [2026-01-03 09:11:30] (step=0002935) Train Loss mse: 0.0287, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3123
+ [2026-01-03 09:11:46] (step=0002936) Train Loss mse: 0.0377, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3124
+ [2026-01-03 09:12:03] (step=0002937) Train Loss mse: 0.0310, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3125
+ [2026-01-03 09:12:16] (step=0002938) Train Loss mse: 0.0239, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3126
+ [2026-01-03 09:12:28] (step=0002939) Train Loss mse: 0.0395, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3127
+ [2026-01-03 09:12:44] (step=0002940) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3128
+ [2026-01-03 09:12:57] (step=0002941) Train Loss mse: 0.0327, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3129
+ [2026-01-03 09:13:10] (step=0002942) Train Loss mse: 0.0221, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3130
+ [2026-01-03 09:13:26] (step=0002943) Train Loss mse: 0.0456, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3131
+ [2026-01-03 09:13:41] (step=0002944) Train Loss mse: 0.0258, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3132
+ [2026-01-03 09:13:55] (step=0002945) Train Loss mse: 0.0255, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3133
+ [2026-01-03 09:14:07] (step=0002946) Train Loss mse: 0.0385, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3134
+ [2026-01-03 09:14:21] (step=0002947) Train Loss mse: 0.0289, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3135
+ [2026-01-03 09:14:35] (step=0002948) Train Loss mse: 0.0198, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3136
+ [2026-01-03 09:14:50] (step=0002949) Train Loss mse: 0.0257, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3137
+ [2026-01-03 09:15:06] (step=0002950) Train Loss mse: 0.0224, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3138
+ [2026-01-03 09:15:16] (step=0002951) Train Loss mse: 0.0208, Train Loss ce: 0.0000, Train Steps/Sec: 0.11,
3139
+ [2026-01-03 09:15:29] (step=0002952) Train Loss mse: 0.0373, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3140
+ [2026-01-03 09:15:41] (step=0002953) Train Loss mse: 0.0369, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3141
+ [2026-01-03 09:15:55] (step=0002954) Train Loss mse: 0.0235, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3142
+ [2026-01-03 09:16:11] (step=0002955) Train Loss mse: 0.0317, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3143
+ [2026-01-03 09:16:24] (step=0002956) Train Loss mse: 0.0285, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3144
+ [2026-01-03 09:16:40] (step=0002957) Train Loss mse: 0.0308, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3145
+ [2026-01-03 09:16:54] (step=0002958) Train Loss mse: 0.0306, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3146
+ [2026-01-03 09:17:05] (step=0002959) Train Loss mse: 0.0333, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3147
+ [2026-01-03 09:17:22] (step=0002960) Train Loss mse: 0.0246, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3148
+ [2026-01-03 09:17:35] (step=0002961) Train Loss mse: 0.0196, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3149
+ [2026-01-03 09:17:49] (step=0002962) Train Loss mse: 0.0269, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3150
+ [2026-01-03 09:18:02] (step=0002963) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3151
+ [2026-01-03 09:18:18] (step=0002964) Train Loss mse: 0.0235, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3152
+ [2026-01-03 09:18:34] (step=0002965) Train Loss mse: 0.0461, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3153
+ [2026-01-03 09:18:45] (step=0002966) Train Loss mse: 0.0278, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3154
+ [2026-01-03 09:18:58] (step=0002967) Train Loss mse: 0.0296, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3155
+ [2026-01-03 09:19:12] (step=0002968) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3156
+ [2026-01-03 09:19:28] (step=0002969) Train Loss mse: 0.0320, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3157
+ [2026-01-03 09:19:44] (step=0002970) Train Loss mse: 0.0255, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3158
+ [2026-01-03 09:19:57] (step=0002971) Train Loss mse: 0.0284, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3159
+ [2026-01-03 09:20:10] (step=0002972) Train Loss mse: 0.0411, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3160
+ [2026-01-03 09:20:26] (step=0002973) Train Loss mse: 0.0230, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3161
+ [2026-01-03 09:20:43] (step=0002974) Train Loss mse: 0.0307, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3162
+ [2026-01-03 09:20:56] (step=0002975) Train Loss mse: 0.0432, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3163
+ [2026-01-03 09:21:12] (step=0002976) Train Loss mse: 0.0236, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3164
+ [2026-01-03 09:21:23] (step=0002977) Train Loss mse: 0.0258, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3165
+ [2026-01-03 09:21:40] (step=0002978) Train Loss mse: 0.0384, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3166
+ [2026-01-03 09:21:53] (step=0002979) Train Loss mse: 0.0200, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3167
+ [2026-01-03 09:22:06] (step=0002980) Train Loss mse: 0.0263, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3168
+ [2026-01-03 09:22:18] (step=0002981) Train Loss mse: 0.0279, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
3169
+ [2026-01-03 09:22:34] (step=0002982) Train Loss mse: 0.0299, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3170
+ [2026-01-03 09:22:50] (step=0002983) Train Loss mse: 0.0211, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3171
+ [2026-01-03 09:23:03] (step=0002984) Train Loss mse: 0.0263, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3172
+ [2026-01-03 09:23:17] (step=0002985) Train Loss mse: 0.0286, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3173
+ [2026-01-03 09:23:30] (step=0002986) Train Loss mse: 0.0498, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3174
+ [2026-01-03 09:23:43] (step=0002987) Train Loss mse: 0.0259, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3175
+ [2026-01-03 09:23:56] (step=0002988) Train Loss mse: 0.0292, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
3176
+ [2026-01-03 09:24:09] (step=0002989) Train Loss mse: 0.0224, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3177
+ [2026-01-03 09:24:25] (step=0002990) Train Loss mse: 0.0231, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3178
+ [2026-01-03 09:24:41] (step=0002991) Train Loss mse: 0.0408, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3179
+ [2026-01-03 09:24:57] (step=0002992) Train Loss mse: 0.0458, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3180
+ [2026-01-03 09:25:13] (step=0002993) Train Loss mse: 0.0287, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
3181
  [2026-01-03 09:25:26] (step=0002994) Train Loss mse: 0.0343, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3182
  [2026-01-03 09:25:38] (step=0002995) Train Loss mse: 0.0318, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
3183
  [2026-01-03 09:25:51] (step=0002996) Train Loss mse: 0.0319, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
 
4188
  [2026-01-03 13:19:24] (step=0004001) Train Loss mse: 0.0321, Train Loss ce: 0.0000, Train Steps/Sec: 0.01,
4189
  [2026-01-03 13:19:40] (step=0004002) Train Loss mse: 0.0269, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4190
  [2026-01-03 13:19:53] (step=0004003) Train Loss mse: 0.0293, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4191
+ [2026-01-03 13:20:06] (step=0004004) Train Loss mse: 0.0326, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4192
+ [2026-01-03 13:20:22] (step=0004005) Train Loss mse: 0.0287, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4193
+ [2026-01-03 13:20:35] (step=0004006) Train Loss mse: 0.0288, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4194
+ [2026-01-03 13:20:52] (step=0004007) Train Loss mse: 0.0311, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4195
+ [2026-01-03 13:21:05] (step=0004008) Train Loss mse: 0.0316, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4196
+ [2026-01-03 13:21:17] (step=0004009) Train Loss mse: 0.0310, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4197
+ [2026-01-03 13:21:30] (step=0004010) Train Loss mse: 0.0216, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4198
+ [2026-01-03 13:21:44] (step=0004011) Train Loss mse: 0.0219, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4199
+ [2026-01-03 13:21:57] (step=0004012) Train Loss mse: 0.0231, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4200
+ [2026-01-03 13:22:08] (step=0004013) Train Loss mse: 0.0348, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4201
+ [2026-01-03 13:22:20] (step=0004014) Train Loss mse: 0.0390, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4202
+ [2026-01-03 13:22:33] (step=0004015) Train Loss mse: 0.0276, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4203
+ [2026-01-03 13:22:46] (step=0004016) Train Loss mse: 0.0321, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4204
+ [2026-01-03 13:22:58] (step=0004017) Train Loss mse: 0.0268, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4205
+ [2026-01-03 13:23:11] (step=0004018) Train Loss mse: 0.0239, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4206
+ [2026-01-03 13:23:24] (step=0004019) Train Loss mse: 0.0309, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4207
+ [2026-01-03 13:23:41] (step=0004020) Train Loss mse: 0.0306, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4208
+ [2026-01-03 13:23:57] (step=0004021) Train Loss mse: 0.0226, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4209
+ [2026-01-03 13:24:10] (step=0004022) Train Loss mse: 0.0334, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4210
+ [2026-01-03 13:24:22] (step=0004023) Train Loss mse: 0.0287, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4211
+ [2026-01-03 13:24:35] (step=0004024) Train Loss mse: 0.0285, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4212
+ [2026-01-03 13:24:51] (step=0004025) Train Loss mse: 0.0191, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4213
+ [2026-01-03 13:25:07] (step=0004026) Train Loss mse: 0.0229, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4214
+ [2026-01-03 13:25:20] (step=0004027) Train Loss mse: 0.0278, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4215
+ [2026-01-03 13:25:36] (step=0004028) Train Loss mse: 0.0277, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4216
+ [2026-01-03 13:25:52] (step=0004029) Train Loss mse: 0.0230, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4217
+ [2026-01-03 13:26:08] (step=0004030) Train Loss mse: 0.0207, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4218
+ [2026-01-03 13:26:24] (step=0004031) Train Loss mse: 0.0251, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4219
+ [2026-01-03 13:26:35] (step=0004032) Train Loss mse: 0.0274, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4220
+ [2026-01-03 13:26:49] (step=0004033) Train Loss mse: 0.0324, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4221
+ [2026-01-03 13:27:02] (step=0004034) Train Loss mse: 0.0267, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4222
+ [2026-01-03 13:27:14] (step=0004035) Train Loss mse: 0.0361, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4223
+ [2026-01-03 13:27:27] (step=0004036) Train Loss mse: 0.0190, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4224
+ [2026-01-03 13:27:42] (step=0004037) Train Loss mse: 0.0312, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4225
+ [2026-01-03 13:27:58] (step=0004038) Train Loss mse: 0.0341, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4226
+ [2026-01-03 13:28:11] (step=0004039) Train Loss mse: 0.0216, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4227
+ [2026-01-03 13:28:27] (step=0004040) Train Loss mse: 0.0233, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4228
+ [2026-01-03 13:28:40] (step=0004041) Train Loss mse: 0.0216, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4229
+ [2026-01-03 13:28:56] (step=0004042) Train Loss mse: 0.0303, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4230
+ [2026-01-03 13:29:09] (step=0004043) Train Loss mse: 0.0208, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4231
+ [2026-01-03 13:29:25] (step=0004044) Train Loss mse: 0.0203, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4232
+ [2026-01-03 13:29:41] (step=0004045) Train Loss mse: 0.0265, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4233
+ [2026-01-03 13:29:57] (step=0004046) Train Loss mse: 0.0281, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4234
+ [2026-01-03 13:30:10] (step=0004047) Train Loss mse: 0.0387, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4235
+ [2026-01-03 13:30:23] (step=0004048) Train Loss mse: 0.0330, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4236
+ [2026-01-03 13:30:37] (step=0004049) Train Loss mse: 0.0340, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4237
+ [2026-01-03 13:30:50] (step=0004050) Train Loss mse: 0.0228, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4238
+ [2026-01-03 13:31:01] (step=0004051) Train Loss mse: 0.0361, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4239
+ [2026-01-03 13:31:17] (step=0004052) Train Loss mse: 0.0259, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4240
+ [2026-01-03 13:31:30] (step=0004053) Train Loss mse: 0.0337, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4241
+ [2026-01-03 13:31:43] (step=0004054) Train Loss mse: 0.0250, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4242
+ [2026-01-03 13:31:57] (step=0004055) Train Loss mse: 0.0260, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4243
+ [2026-01-03 13:32:08] (step=0004056) Train Loss mse: 0.0335, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4244
+ [2026-01-03 13:32:20] (step=0004057) Train Loss mse: 0.0278, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4245
+ [2026-01-03 13:32:33] (step=0004058) Train Loss mse: 0.0328, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4246
+ [2026-01-03 13:32:49] (step=0004059) Train Loss mse: 0.0268, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4247
+ [2026-01-03 13:33:05] (step=0004060) Train Loss mse: 0.0228, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4248
+ [2026-01-03 13:33:19] (step=0004061) Train Loss mse: 0.0318, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4249
+ [2026-01-03 13:33:32] (step=0004062) Train Loss mse: 0.0223, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4250
+ [2026-01-03 13:33:48] (step=0004063) Train Loss mse: 0.0239, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
4251
+ [2026-01-03 13:34:03] (step=0004064) Train Loss mse: 0.0381, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4252
+ [2026-01-03 13:34:14] (step=0004065) Train Loss mse: 0.0237, Train Loss ce: 0.0000, Train Steps/Sec: 0.09,
4253
+ [2026-01-03 13:34:28] (step=0004066) Train Loss mse: 0.0223, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4254
+ [2026-01-03 13:34:41] (step=0004067) Train Loss mse: 0.0347, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4255
+ [2026-01-03 13:34:54] (step=0004068) Train Loss mse: 0.0303, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4256
+ [2026-01-03 13:35:08] (step=0004069) Train Loss mse: 0.0304, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4257
+ [2026-01-03 13:35:21] (step=0004070) Train Loss mse: 0.0254, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4258
+ [2026-01-03 13:35:35] (step=0004071) Train Loss mse: 0.0207, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4259
+ [2026-01-03 13:35:47] (step=0004072) Train Loss mse: 0.0320, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4260
+ [2026-01-03 13:35:59] (step=0004073) Train Loss mse: 0.0377, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4261
  [2026-01-03 13:36:13] (step=0004074) Train Loss mse: 0.0289, Train Loss ce: 0.0000, Train Steps/Sec: 0.07,
4262
  [2026-01-03 13:36:25] (step=0004075) Train Loss mse: 0.0329, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
4263
  [2026-01-03 13:36:38] (step=0004076) Train Loss mse: 0.0261, Train Loss ce: 0.0000, Train Steps/Sec: 0.08,
 
5186
  [2026-01-03 17:08:32] (step=0004999) Train Loss mse: 0.0250, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
5187
  [2026-01-03 17:08:48] (step=0005000) Train Loss mse: 0.0230, Train Loss ce: 0.0000, Train Steps/Sec: 0.06,
5188
  [2026-01-03 17:10:07] Saving checkpoint to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/0005000.
5189
+ [2026-01-03 17:12:43] Done!
 
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/files/requirements.txt ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Brotli==1.1.0
2
+ MarkupSafe==3.0.2
3
+ PySocks==1.7.1
4
+ PyYAML==6.0.2
5
+ archspec==0.2.3
6
+ asttokens==2.4.1
7
+ astunparse==1.6.3
8
+ attrs==24.2.0
9
+ beautifulsoup4==4.12.3
10
+ boltons==24.0.0
11
+ certifi==2024.8.30
12
+ chardet==5.2.0
13
+ charset-normalizer==3.4.0
14
+ click==8.1.7
15
+ colorama==0.4.6
16
+ conda==24.9.2
17
+ conda-build==24.9.0
18
+ conda_index==0.5.0
19
+ conda-libmamba-solver==24.9.0
20
+ conda-package-handling==2.4.0
21
+ conda_package_streaming==0.11.0
22
+ decorator==5.1.1
23
+ distro==1.9.0
24
+ dnspython==2.7.0
25
+ exceptiongroup==1.2.2
26
+ executing==2.1.0
27
+ expecttest==0.2.1
28
+ filelock==3.16.1
29
+ frozendict==2.4.6
30
+ fsspec==2024.10.0
31
+ h2==4.1.0
32
+ hpack==4.0.0
33
+ hyperframe==6.0.1
34
+ hypothesis==6.115.5
35
+ idna==3.10
36
+ importlib_resources==6.4.5
37
+ ipython==8.29.0
38
+ jedi==0.19.1
39
+ Jinja2==3.1.4
40
+ jsonpatch==1.33
41
+ jsonpointer==3.0.0
42
+ jsonschema==4.23.0
43
+ jsonschema-specifications==2024.10.1
44
+ libarchive-c==5.1
45
+ libmambapy==1.5.10
46
+ lief==0.14.1
47
+ lintrunner==0.12.5
48
+ mamba==1.5.10
49
+ matplotlib-inline==0.1.7
50
+ menuinst==2.1.2
51
+ more-itertools==10.5.0
52
+ mpmath==1.3.0
53
+ networkx==3.4.2
54
+ ninja==1.11.1.1
55
+ nvidia-cublas-cu12==12.4.5.8
56
+ nvidia-cuda-cupti-cu12==12.4.127
57
+ nvidia-cuda-nvrtc-cu12==12.4.127
58
+ nvidia-cuda-runtime-cu12==12.4.127
59
+ nvidia-cudnn-cu12==9.1.0.70
60
+ nvidia-cufft-cu12==11.2.1.3
61
+ nvidia-curand-cu12==10.3.5.147
62
+ nvidia-cusolver-cu12==11.6.1.9
63
+ nvidia-cusparse-cu12==12.3.1.170
64
+ nvidia-nccl-cu12==2.21.5
65
+ nvidia-nvjitlink-cu12==12.4.127
66
+ nvidia-nvtx-cu12==12.4.127
67
+ optree==0.13.0
68
+ parso==0.8.4
69
+ pexpect==4.9.0
70
+ pickleshare==0.7.5
71
+ pillow==10.2.0
72
+ pkginfo==1.11.2
73
+ pkgutil_resolve_name==1.3.10
74
+ platformdirs==4.3.6
75
+ pluggy==1.5.0
76
+ prompt_toolkit==3.0.48
77
+ psutil==6.1.0
78
+ ptyprocess==0.7.0
79
+ pure_eval==0.2.3
80
+ pycosat==0.6.6
81
+ pycparser==2.22
82
+ Pygments==2.18.0
83
+ python-etcd==0.4.5
84
+ pytz==2024.2
85
+ referencing==0.35.1
86
+ requests==2.32.3
87
+ rpds-py==0.20.0
88
+ ruamel.yaml==0.18.6
89
+ ruamel.yaml.clib==0.2.8
90
+ six==1.16.0
91
+ sortedcontainers==2.4.0
92
+ soupsieve==2.5
93
+ stack-data==0.6.2
94
+ sympy==1.13.1
95
+ torchaudio==2.5.1+cu124
96
+ torchelastic==0.2.2
97
+ tqdm==4.66.5
98
+ traitlets==5.14.3
99
+ triton==3.1.0
100
+ truststore==0.9.2
101
+ types-dataclasses==0.6.6
102
+ urllib3==2.2.3
103
+ wcwidth==0.2.13
104
+ zipp==3.20.2
105
+ zstandard==0.23.0
106
+ numpy==1.24.4
107
+ imgcat==0.6.0
108
+ decord==0.6.0
109
+ flash_attn==2.5.8
110
+ contourpy==1.3.2
111
+ cycler==0.12.1
112
+ fonttools==4.61.1
113
+ huggingface-hub==0.29.1
114
+ kiwisolver==1.4.9
115
+ matplotlib==3.7.0
116
+ opencv-python==4.7.0.72
117
+ pyarrow==11.0.0
118
+ pyparsing==3.2.5
119
+ safetensors==0.4.5
120
+ scipy==1.10.1
121
+ sentencepiece==0.1.99
122
+ torch==2.5.1
123
+ torchvision==0.20.1
124
+ transformers==4.49.0
125
+ pip==25.3
126
+ setuptools==80.9.0
127
+ wheel==0.45.1
128
+ Pebble==5.1.3
129
+ accelerate==1.12.0
130
+ addftool==0.2.13
131
+ aiohappyeyeballs==2.6.1
132
+ aiohttp==3.13.2
133
+ aiohttp-cors==0.8.1
134
+ aiosignal==1.4.0
135
+ airportsdata==20250909
136
+ annotated-doc==0.0.4
137
+ annotated-types==0.7.0
138
+ antlr4-python3-runtime==4.9.3
139
+ bcrypt==5.0.0
140
+ blobfile==3.0.0
141
+ cffi==2.0.0
142
+ cloudpickle==3.1.2
143
+ codetiming==1.4.0
144
+ colorful==0.5.8
145
+ compressed-tensors==0.12.2
146
+ cryptography==46.0.3
147
+ cuda-bindings==13.1.1
148
+ cuda-pathfinder==1.3.3
149
+ cuda-python==13.1.1
150
+ datasets==4.4.1
151
+ Deprecated==1.3.1
152
+ diskcache==5.6.3
153
+ distlib==0.4.0
154
+ docstring_parser==0.17.0
155
+ easydict==1.13
156
+ fabric==3.2.2
157
+ fastapi==0.124.4
158
+ fire==0.7.1
159
+ flashinfer-python==0.2.5
160
+ frozenlist==1.8.0
161
+ gevent==25.9.1
162
+ gitdb==4.0.12
163
+ GitPython==3.1.45
164
+ google-api-core==2.28.1
165
+ google-auth==2.43.0
166
+ google-cloud-aiplatform==1.130.0
167
+ google-cloud-bigquery==3.38.0
168
+ google-cloud-core==2.5.0
169
+ google-cloud-resource-manager==1.15.0
170
+ google-cloud-storage==3.7.0
171
+ google-crc32c==1.7.1
172
+ google-genai==1.55.0
173
+ google-resumable-media==2.8.0
174
+ googleapis-common-protos==1.72.0
175
+ greenlet==3.3.0
176
+ grpc-google-iam-v1==0.14.3
177
+ grpcio==1.76.0
178
+ grpcio-status==1.76.0
179
+ hf_transfer==0.1.9
180
+ hf-xet==1.2.0
181
+ hydra-core==1.3.2
182
+ importlib_metadata==8.7.0
183
+ interegular==0.3.3
184
+ invoke==2.2.1
185
+ jiter==0.12.0
186
+ joblib==1.5.2
187
+ jsonlines==4.0.0
188
+ lark==1.3.1
189
+ latex2sympy2==1.5.4
190
+ latex2sympy2_extended==1.10.2
191
+ libtmux==0.52.1
192
+ llguidance==0.7.30
193
+ loguru==0.7.3
194
+ lxml==6.0.2
195
+ math-verify==0.8.0
196
+ modelscope==1.33.0
197
+ msgpack==1.1.2
198
+ msgspec==0.20.0
199
+ multidict==6.7.0
200
+ multiprocess==0.70.18
201
+ nvidia-cusparselt-cu12==0.6.2
202
+ nvidia-ml-py==13.590.44
203
+ omegaconf==2.3.0
204
+ openai==2.11.0
205
+ opencensus==0.11.4
206
+ opencensus-context==0.1.3
207
+ opentelemetry-api==1.39.1
208
+ opentelemetry-exporter-prometheus==0.60b1
209
+ opentelemetry-proto==1.39.1
210
+ opentelemetry-sdk==1.39.1
211
+ opentelemetry-semantic-conventions==0.60b1
212
+ orjson==3.11.5
213
+ outlines==0.1.11
214
+ outlines_core==0.1.26
215
+ packaging==25.0
216
+ pandas==2.3.3
217
+ parallel-ssh==2.16.0.post1
218
+ paramiko==4.0.0
219
+ partial-json-parser==0.2.1.1.post7
220
+ peft==0.18.0
221
+ propcache==0.4.1
222
+ proto-plus==1.26.1
223
+ protobuf==6.33.2
224
+ py-spy==0.4.1
225
+ pyasn1==0.6.1
226
+ pyasn1_modules==0.4.2
227
+ pybind11==3.0.1
228
+ pycountry==24.6.1
229
+ pycryptodomex==3.23.0
230
+ pydantic==2.12.5
231
+ pydantic_core==2.41.5
232
+ pylatexenc==2.10
233
+ PyNaCl==1.6.1
234
+ pynvml==13.0.1
235
+ python-multipart==0.0.20
236
+ ray==2.52.1
237
+ regex==2025.11.3
238
+ rsa==4.9.1
239
+ scikit-learn==1.8.0
240
+ sentence-transformers==5.2.0
241
+ sentry-sdk==2.47.0
242
+ setproctitle==1.3.7
243
+ sgl-kernel==0.1.4
244
+ sglang==0.4.6.post5
245
+ shapely==2.1.2
246
+ smart_open==7.5.0
247
+ smmap==5.0.2
248
+ sniffio==1.3.1
249
+ soundfile==0.13.1
250
+ ssh2-python==1.2.0.post1
251
+ ssh-python==1.2.0.post1
252
+ starlette==0.50.0
253
+ tabulate==0.9.0
254
+ tenacity==9.1.2
255
+ tensorboardX==2.6.4
256
+ tensordict==0.6.2
257
+ termcolor==3.2.0
258
+ threadpoolctl==3.6.0
259
+ tiktoken==0.12.0
260
+ timeout-decorator==0.5.0
261
+ tmuxp==1.61.0
262
+ tokenizers==0.21.4
263
+ torch_memory_saver==0.0.9
264
+ torchao==0.9.0
265
+ torchdata==0.11.0
266
+ typing-inspection==0.4.2
267
+ uvicorn==0.38.0
268
+ uvloop==0.22.1
269
+ virtualenv==20.35.4
270
+ wandb==0.23.1
271
+ websockets==15.0.1
272
+ word2number==1.1
273
+ wrapt==2.0.1
274
+ xgrammar==0.1.19
275
+ xxhash==3.6.0
276
+ yarl==1.22.0
277
+ zope.event==6.1
278
+ zope.interface==8.1.1
279
+ cachetools==6.2.3
280
+ dill==0.4.0
281
+ inflect==7.5.0
282
+ lazy_loader==0.4
283
+ rp==0.1.1333
284
+ stackprinter==0.2.12
285
+ typeguard==4.4.4
286
+ typing_extensions==4.15.0
287
+ asciinema==2.4.0
288
+ einops==0.8.1
289
+ Send2Trash==1.8.3
290
+ anyio==4.12.0
291
+ argon2-cffi==25.1.0
292
+ argon2-cffi-bindings==25.1.0
293
+ arrow==1.4.0
294
+ async-lru==2.0.5
295
+ babel==2.17.0
296
+ bleach==6.3.0
297
+ comm==0.2.3
298
+ debugpy==1.8.18
299
+ defusedxml==0.7.1
300
+ fastjsonschema==2.21.2
301
+ fqdn==1.5.1
302
+ h11==0.16.0
303
+ httpcore==1.0.9
304
+ httpx==0.28.1
305
+ ipykernel==7.1.0
306
+ isoduration==20.11.0
307
+ json5==0.12.1
308
+ jupyter_client==8.7.0
309
+ jupyter_core==5.9.1
310
+ jupyter-events==0.12.0
311
+ jupyter-lsp==2.3.0
312
+ jupyter_server==2.17.0
313
+ jupyter_server_terminals==0.5.3
314
+ jupyterlab==4.5.0
315
+ jupyterlab_pygments==0.3.0
316
+ jupyterlab_server==2.28.0
317
+ mistune==3.1.4
318
+ nbclient==0.10.2
319
+ nbconvert==7.16.6
320
+ nbformat==5.10.4
321
+ nest-asyncio==1.6.0
322
+ notebook_shim==0.2.4
323
+ overrides==7.7.0
324
+ pandocfilters==1.5.1
325
+ prometheus_client==0.23.1
326
+ python-dateutil==2.9.0.post0
327
+ python-json-logger==4.0.0
328
+ pyzmq==27.1.0
329
+ rfc3339-validator==0.1.4
330
+ rfc3986-validator==0.1.1
331
+ terminado==0.18.1
332
+ tinycss2==1.4.0
333
+ tornado==6.5.3
334
+ tzdata==2025.3
335
+ uri-template==1.3.0
336
+ webcolors==25.10.0
337
+ webencodings==0.5.1
338
+ websocket-client==1.9.0
339
+ autocommand==2.2.2
340
+ backports.tarfile==1.2.0
341
+ importlib_metadata==8.0.0
342
+ inflect==7.3.1
343
+ jaraco.collections==5.1.0
344
+ jaraco.context==5.3.0
345
+ jaraco.functools==4.0.1
346
+ jaraco.text==3.12.1
347
+ more-itertools==10.3.0
348
+ packaging==24.2
349
+ platformdirs==4.2.2
350
+ tomli==2.0.1
351
+ typeguard==4.3.0
352
+ typing_extensions==4.12.2
353
+ wheel==0.45.1
354
+ zipp==3.19.2
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-core.log ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"time":"2026-01-04T09:32:00.835208263Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpc0hvyn70/port-13221.txt","pid":13221,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2026-01-04T09:32:00.838086455Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":13221}
3
+ {"time":"2026-01-04T09:32:00.838143904Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-13221-13288-2426694750/socket","Net":"unix"}}
4
+ {"time":"2026-01-04T09:32:00.939441588Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2026-01-04T09:32:00.950369146Z","level":"INFO","msg":"handleInformInit: received","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
6
+ {"time":"2026-01-04T09:32:01.443736617Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0","id":"1(@)"}
7
+ {"time":"2026-01-04T09:37:29.649420411Z","level":"INFO","msg":"server: parent process exited, terminating service process"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {"time":"2026-01-04T09:32:01.115692117Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2026-01-04T09:32:01.44318528Z","level":"WARN","msg":"featurechecker: GraphQL client is nil, skipping feature loading"}
3
+ {"time":"2026-01-04T09:32:01.443259623Z","level":"INFO","msg":"stream: created new stream","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
4
+ {"time":"2026-01-04T09:32:01.443726951Z","level":"INFO","msg":"stream: started","id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
5
+ {"time":"2026-01-04T09:32:01.44372382Z","level":"INFO","msg":"handler: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
6
+ {"time":"2026-01-04T09:32:01.443767451Z","level":"INFO","msg":"writer: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
7
+ {"time":"2026-01-04T09:32:01.443797476Z","level":"INFO","msg":"sender: started","stream_id":"vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0"}
8
+ {"time":"2026-01-04T09:32:01.444535545Z","level":"WARN","msg":"runupserter: server does not expand metric globs but the x_server_side_expand_glob_metrics setting is set; ignoring"}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Configure stats pid to 13221
3
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/.config/wandb/settings
4
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Loading settings from /home/clouduser/Code/Github/unified_world_model/wandb/settings
5
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug.log
7
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/logs/debug-internal.log
8
+ 2026-01-04 09:32:00,479 INFO MainThread:13221 [wandb_init.py:init():841] calling init triggers
9
+ 2026-01-04 09:32:00,480 INFO MainThread:13221 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'_wandb': {}}
11
+ 2026-01-04 09:32:00,480 INFO MainThread:13221 [wandb_init.py:init():889] starting backend
12
+ 2026-01-04 09:32:00,939 INFO MainThread:13221 [wandb_init.py:init():892] sending inform_init request
13
+ 2026-01-04 09:32:00,947 INFO MainThread:13221 [wandb_init.py:init():900] backend started and connected
14
+ 2026-01-04 09:32:00,949 INFO MainThread:13221 [wandb_init.py:init():970] updated telemetry
15
+ 2026-01-04 09:32:00,956 INFO MainThread:13221 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2026-01-04 09:32:01,446 INFO MainThread:13221 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2026-01-04 09:32:01,964 INFO MainThread:13221 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2026-01-04 09:32:01,964 INFO MainThread:13221 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2026-01-04 09:32:01,964 INFO MainThread:13221 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2026-01-04 09:32:01,964 INFO MainThread:13221 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2026-01-04 09:32:01,968 INFO MainThread:13221 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2026-01-04 09:32:01,969 INFO MainThread:13221 [wandb_run.py:_config_callback():1396] config_cb None None {'visual_gen': True, 'visual_und': True, 'results_dir': 'results', 'checkpoint_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only', 'wandb_project': 'bagel', 'wandb_name': 'vlm_gym_jigsaw_one_img_lr2e_5_mse_only', 'wandb_runid': '0', 'wandb_resume': 'allow', 'wandb_offline': True, 'wandb_dir': '/dev/shm/models/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only', 'global_seed': 4396, 'auto_resume': False, 'resume_from': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'resume_model_only': True, 'finetune_from_ema': True, 'finetune_from_hf': True, 'log_every': 1, 'save_every': 2500, 'total_steps': 5000, 'warmup_steps': 300, 'lr_scheduler': 'cosine', 'lr': 2e-05, 'min_lr': 1e-07, 'beta1': 0.9, 'beta2': 0.95, 'eps': 1e-15, 'ema': 0.993, 'max_grad_norm': 1.0, 'timestep_shift': 1.0, 'mse_weight': 1.0, 'ce_weight': 1.0, 'ce_loss_reweighting': False, 'expected_num_tokens': 20000, 'num_replicate': 1, 'num_shard': 8, 'sharding_strategy': 'HYBRID_SHARD', 'backward_prefetch': 'BACKWARD_PRE', 'cpu_offload': False, 'freeze_llm': False, 'freeze_vit': False, 'freeze_vae': True, 'freeze_und': False, 'copy_init_moe': True, 'use_flex': False, 'eval_every': 500, 'num_eval_batches': 20, 'use_ema_for_eval': True, 'eval_log_dir': None, 'eval_run_tag': '', 'viz_every': 500, 'viz_n': 8, 'viz_outdir': 'results/viz', 'eval_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'viz_dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'eval_print_n': 3, 'save_ema_only': True, 'save_optimizer': False}
23
+ 2026-01-04 09:32:01,970 INFO MainThread:13221 [wandb_run.py:_config_callback():1396] config_cb None None {'model_path': '/home/clouduser/Code/Models/BAGEL-7B-MoT', 'llm_path': 'hf/Qwen2.5-0.5B-Instruct/', 'llm_qk_norm': True, 'tie_word_embeddings': False, 'layer_module': 'Qwen2MoTDecoderLayer', 'vae_path': 'flux/vae/ae.safetensors', 'vit_path': 'hf/siglip-so400m-14-980-flash-attn2-navit/', 'max_latent_size': 64, 'latent_patch_size': 2, 'vit_patch_size': 14, 'vit_max_num_patch_per_side': 70, 'connector_act': 'gelu_pytorch_tanh', 'interpolate_pos': False, 'vit_select_layer': -2, 'vit_rope': False, 'text_cond_dropout_prob': 0.0, 'vae_cond_dropout_prob': 0.0, 'vit_cond_dropout_prob': 0.0}
24
+ 2026-01-04 09:32:01,971 INFO MainThread:13221 [wandb_run.py:_config_callback():1396] config_cb None None {'dataset_config_file': './data/configs/vlm_gym_jigsaw_train_mseloss_only.yaml', 'train_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'train_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/train/', 'eval_data_dir': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'eval_jsonl_path': '/home/clouduser/Code/data/gym/jigsaw-swap_v5/val/', 'inference_hash_file': '/home/clouduser/Code/Github/launch_new/hashes_test_set_v10.json', 'prefetch_factor': 2, 'num_workers': 1, 'max_num_tokens_per_sample': 20000, 'max_num_tokens': 20000, 'prefer_buffer_before': 16384, 'max_buffer_size': 50, 'data_seed': 42}
checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/checkpoints_vlm_gym_jigsaw_one_image_lr2e_5_mse_only/wandb/offline-run-20260104_093200-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0/run-vlm_gym_jigsaw_one_img_lr2e_5_mse_only-run0.wandb ADDED
Binary file (65.5 kB). View file