# Viharikvs's picture
# Upload folder using huggingface_hub
# 08b7e03 verified
---
# Training configuration for the run named by `run_name` below.
#
# This is a plain-YAML rendering of the configuration. The original dump
# serialized `arch.ffn_kron_in_factors` / `arch.ffn_kron_out_factors` as
# OmegaConf ListConfig objects using `!!python/object` tags, which pulled the
# whole parent config graph into the file and made it unloadable with a safe
# YAML loader. The values below are the same ones, written as ordinary
# scalars, sequences and mappings.
#
# The OmegaConf `now` resolver cache in the original dump recorded
# '%Y-%m-%d' -> 2025-10-11 and '%H-%M-%S' -> 09-39-56.

arch:
  H_cycles: 3
  H_layers: 0
  L_cycles: 6
  L_layers: 2
  expansion: 4
  ffn_hybrid: true
  ffn_hybrid_gate: scalar
  ffn_kron_chunk_k: 16
  ffn_kron_chunk_t: 4096
  # 32 * 16 = 512, which equals hidden_size below.
  # NOTE(review): presumably a factorization of the FFN input width; confirm
  # against the model code.
  ffn_kron_in_factors:
    - 32
    - 16
  # NOTE(review): 48 * 32 = 1536; presumably the FFN inner width; confirm
  # against the model code.
  ffn_kron_out_factors:
    - 48
    - 32
  ffn_kron_share_gate_up: true
  ffn_kron_terms: 96
  ffn_rank: 128
  ffn_type: mlp
  forward_dtype: bfloat16
  halt_exploration_prob: 0.1
  halt_max_steps: 16
  hidden_size: 512
  loss:
    loss_type: stablemax_cross_entropy
    name: losses@ACTLossHead
  mixer: attention
  mlp_t: true
  name: recursive_reasoning.trm@TinyRecursiveReasoningModel_ACTV1
  no_ACT_continue: true
  num_heads: 8
  pos_encodings: rope
  puzzle_emb_len: 16
  # Resolved value; the source config declared this as `${.hidden_size}`.
  puzzle_emb_ndim: 512

beta1: 0.9
beta2: 0.95
checkpoint_every_eval: true
checkpoint_path: checkpoints/Sudoku-extreme-1k-aug-1000-ACT-torch/pretrain_hybrid_sudoku
data_paths:
  - data/sudoku-extreme-1k-aug-1000
data_paths_test: []
ema: true
ema_rate: 0.999
epochs: 50000
eval_interval: 5000
eval_save_outputs: []
evaluators: []
freeze_weights: false
global_batch_size: 768
load_checkpoint: null
lr: 0.0001
lr_min_ratio: 1.0
lr_warmup_steps: 2000
min_eval_interval: 0
project_name: Sudoku-extreme-1k-aug-1000-ACT-torch
puzzle_emb_lr: 0.0001
puzzle_emb_weight_decay: 1.0
run_name: pretrain_hybrid_sudoku
seed: 0
weight_decay: 1.0