{
  "data": {
    "train_files": "data/fineweb_10B_gpt2/fineweb_train_*.bin",
    "val_files": "data/fineweb_10B_gpt2/fineweb_val_*.bin",
    "batch_size": 512,
    "device_batch_size": 32
  },
  "model": {
    "dim": 768,
    "n_layers": 8,
    "n_heads": 12,
    "n_kv_heads": 12,
    "vocab_size": 50257,
    "multiple_of": 256,
    "ffn_dim_multiplier": 4,
    "norm_eps": 1e-05,
    "rope_theta": 10000,
    "use_scaled_rope": false,
    "max_seq_len": 1024,
    "initializer_range": 0.02,
    "zero_init_masks": true
  },
  "optimizer": {
    "default": {
      "lr": 0.001,
      "beta1": 0.8,
      "beta2": 0.95,
      "eps": 1e-10,
      "weight_decay": 0
    },
    "masks": {
      "lr": 0.001,
      "beta1": 0.8,
      "beta2": 0.95,
      "eps": 1e-10,
      "weight_decay": 0
    },
    "norms": {
      "lr": 0.001,
      "beta1": 0.8,
      "beta2": 0.95,
      "eps": 1e-10,
      "weight_decay": 0
    }
  },
  "scheduler": {
    "warmup_steps": 0.1,
    "start_factor": 0.1
  },
  "gates": {},
  "gates_zero_eps": 1e-08,
  "seed": 0,
  "project": "fineweb-baseline",
  "run_id": null,
  "logdir": "logs/fineweb-baseline",
  "log_gradients": false,
  "log_params": false,
  "log_every_steps": 1,
  "val_every_steps": 100,
  "save_every_steps": -1
}