{
  "best_metric": 0.07254856824874878,
  "best_model_checkpoint": "./teapotllm/checkpoint-1224",
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 8160,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 0.07505040615797043,
      "learning_rate": 4.75e-05,
      "loss": 0.0953,
      "step": 408
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.07464946806430817,
      "eval_runtime": 4.9963,
      "eval_samples_per_second": 45.034,
      "eval_steps_per_second": 5.804,
      "step": 408
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.3880476951599121,
      "learning_rate": 4.5e-05,
      "loss": 0.0706,
      "step": 816
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.0732811763882637,
      "eval_runtime": 4.9711,
      "eval_samples_per_second": 45.262,
      "eval_steps_per_second": 5.834,
      "step": 816
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.514928936958313,
      "learning_rate": 4.25e-05,
      "loss": 0.0579,
      "step": 1224
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.07254856824874878,
      "eval_runtime": 4.9773,
      "eval_samples_per_second": 45.205,
      "eval_steps_per_second": 5.826,
      "step": 1224
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.4975501000881195,
      "learning_rate": 4e-05,
      "loss": 0.0493,
      "step": 1632
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.07370911538600922,
      "eval_runtime": 4.9687,
      "eval_samples_per_second": 45.284,
      "eval_steps_per_second": 5.837,
      "step": 1632
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.588830292224884,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.0427,
      "step": 2040
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.07484618574380875,
      "eval_runtime": 4.9726,
      "eval_samples_per_second": 45.248,
      "eval_steps_per_second": 5.832,
      "step": 2040
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.07073836028575897,
      "learning_rate": 3.5e-05,
      "loss": 0.0376,
      "step": 2448
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.07779362797737122,
      "eval_runtime": 4.9773,
      "eval_samples_per_second": 45.205,
      "eval_steps_per_second": 5.826,
      "step": 2448
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.638589084148407,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 0.0324,
      "step": 2856
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.07828149944543839,
      "eval_runtime": 4.984,
      "eval_samples_per_second": 45.145,
      "eval_steps_per_second": 5.819,
      "step": 2856
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.11211636662483215,
      "learning_rate": 3e-05,
      "loss": 0.0291,
      "step": 3264
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.08292075246572495,
      "eval_runtime": 4.9953,
      "eval_samples_per_second": 45.042,
      "eval_steps_per_second": 5.805,
      "step": 3264
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.2681402266025543,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 0.0264,
      "step": 3672
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.08268510550260544,
      "eval_runtime": 4.9786,
      "eval_samples_per_second": 45.194,
      "eval_steps_per_second": 5.825,
      "step": 3672
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.26614490151405334,
      "learning_rate": 2.5e-05,
      "loss": 0.023,
      "step": 4080
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.08475232124328613,
      "eval_runtime": 4.9942,
      "eval_samples_per_second": 45.052,
      "eval_steps_per_second": 5.807,
      "step": 4080
    },
    {
      "epoch": 11.0,
      "grad_norm": 0.05123787373304367,
      "learning_rate": 2.25e-05,
      "loss": 0.0217,
      "step": 4488
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.08677990734577179,
      "eval_runtime": 5.0004,
      "eval_samples_per_second": 44.996,
      "eval_steps_per_second": 5.8,
      "step": 4488
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.8446316719055176,
      "learning_rate": 2e-05,
      "loss": 0.0196,
      "step": 4896
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.08961891382932663,
      "eval_runtime": 4.9782,
      "eval_samples_per_second": 45.197,
      "eval_steps_per_second": 5.825,
      "step": 4896
    },
    {
      "epoch": 13.0,
      "grad_norm": 0.32860517501831055,
      "learning_rate": 1.75e-05,
      "loss": 0.0178,
      "step": 5304
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.09325850754976273,
      "eval_runtime": 4.9928,
      "eval_samples_per_second": 45.065,
      "eval_steps_per_second": 5.808,
      "step": 5304
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.9496984481811523,
      "learning_rate": 1.5e-05,
      "loss": 0.0167,
      "step": 5712
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.09426513314247131,
      "eval_runtime": 4.9809,
      "eval_samples_per_second": 45.173,
      "eval_steps_per_second": 5.822,
      "step": 5712
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.056426361203193665,
      "learning_rate": 1.25e-05,
      "loss": 0.016,
      "step": 6120
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.09544987976551056,
      "eval_runtime": 4.9829,
      "eval_samples_per_second": 45.155,
      "eval_steps_per_second": 5.82,
      "step": 6120
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.05803034454584122,
      "learning_rate": 1e-05,
      "loss": 0.0147,
      "step": 6528
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.09645407646894455,
      "eval_runtime": 4.9673,
      "eval_samples_per_second": 45.296,
      "eval_steps_per_second": 5.838,
      "step": 6528
    },
    {
      "epoch": 17.0,
      "grad_norm": 0.1461056023836136,
      "learning_rate": 7.5e-06,
      "loss": 0.0143,
      "step": 6936
    },
    {
      "epoch": 17.0,
      "eval_loss": 0.09935282170772552,
      "eval_runtime": 4.9798,
      "eval_samples_per_second": 45.182,
      "eval_steps_per_second": 5.824,
      "step": 6936
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.007102633360773325,
      "learning_rate": 5e-06,
      "loss": 0.0136,
      "step": 7344
    },
    {
      "epoch": 18.0,
      "eval_loss": 0.1016102060675621,
      "eval_runtime": 4.9718,
      "eval_samples_per_second": 45.255,
      "eval_steps_per_second": 5.833,
      "step": 7344
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.011653387919068336,
      "learning_rate": 2.5e-06,
      "loss": 0.0135,
      "step": 7752
    },
    {
      "epoch": 19.0,
      "eval_loss": 0.10058598965406418,
      "eval_runtime": 4.9846,
      "eval_samples_per_second": 45.139,
      "eval_steps_per_second": 5.818,
      "step": 7752
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.7057294249534607,
      "learning_rate": 0.0,
      "loss": 0.013,
      "step": 8160
    },
    {
      "epoch": 20.0,
      "eval_loss": 0.10097935050725937,
      "eval_runtime": 4.9778,
      "eval_samples_per_second": 45.201,
      "eval_steps_per_second": 5.826,
      "step": 8160
    }
  ],
  "logging_steps": 500,
  "max_steps": 8160,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.11615448449024e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}