---
# Run configuration in plain YAML.
#
# The previous dump of this file serialised `arch.ffn_kron_in_factors` and
# `arch.ffn_kron_out_factors` as `!!python/object:omegaconf...ListConfig`
# nodes. Through their `_parent` links this dragged the entire OmegaConf
# tree (per-node Metadata, resolver cache, a second copy of every setting)
# into the file and made it loadable only with an unsafe YAML loader.
# Both values are written below as plain integer lists; every other
# key/value is unchanged.
#
# NOTE(review): this file looks machine-generated. The lasting fix is
# presumably to convert the config to plain containers in the writer
# (e.g. OmegaConf.to_container(cfg, resolve=True)) before dumping — confirm
# against the code that produces this file.

arch:
  H_cycles: 3
  H_layers: 0
  L_cycles: 6
  L_layers: 2
  expansion: 4
  ffn_hybrid: true
  ffn_hybrid_gate: scalar
  ffn_kron_chunk_k: 16
  ffn_kron_chunk_t: 4096
  # Previously a pickled ListConfig with element values 32 and 16.
  ffn_kron_in_factors: [32, 16]
  # Previously a pickled ListConfig with element values 48 and 32.
  ffn_kron_out_factors: [48, 32]
  ffn_kron_share_gate_up: true
  ffn_kron_terms: 96
  ffn_rank: 128
  ffn_type: mlp
  forward_dtype: bfloat16
  halt_exploration_prob: 0.1
  halt_max_steps: 16
  hidden_size: 512
  loss:
    loss_type: stablemax_cross_entropy
    name: losses@ACTLossHead
  mixer: attention
  mlp_t: true
  name: recursive_reasoning.trm@TinyRecursiveReasoningModel_ACTV1
  no_ACT_continue: true
  num_heads: 8
  pos_encodings: rope
  puzzle_emb_len: 16
  # Stored resolved; the embedded OmegaConf copy held `${.hidden_size}` here.
  puzzle_emb_ndim: 512

beta1: 0.9
beta2: 0.95
checkpoint_every_eval: true
checkpoint_path: checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_hybrid_sudoku
data_paths:
  - data/sudoku-extreme-1k-aug-1000
data_paths_test: []
ema: true
ema_rate: 0.999
epochs: 50000
eval_interval: 5000
eval_save_outputs: []
evaluators: []
freeze_weights: false
global_batch_size: 768
load_checkpoint: null
lr: 0.0001
lr_min_ratio: 1.0
lr_warmup_steps: 2000
min_eval_interval: 0
project_name: Sudoku-extreme-1k-aug-1000-ACT-torch
puzzle_emb_lr: 0.0001
puzzle_emb_weight_decay: 1.0
run_name: pretrain_hybrid_sudoku
seed: 0
weight_decay: 1.0