{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.304498269896194, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11534025374855825, "grad_norm": 40.231910705566406, "learning_rate": 3.6363636363636364e-05, "loss": 0.2361, "mean_token_accuracy": 0.9513261806964874, "num_tokens": 776427.0, "step": 25 }, { "epoch": 0.2306805074971165, "grad_norm": 12.624946594238281, "learning_rate": 7.424242424242424e-05, "loss": 0.0839, "mean_token_accuracy": 0.9820117330551148, "num_tokens": 1545851.0, "step": 50 }, { "epoch": 0.3460207612456747, "grad_norm": 7.605620384216309, "learning_rate": 9.99538638664362e-05, "loss": 0.0841, "mean_token_accuracy": 0.9822509652376175, "num_tokens": 2316893.0, "step": 75 }, { "epoch": 0.461361014994233, "grad_norm": 5.328527927398682, "learning_rate": 9.921689684362989e-05, "loss": 0.0856, "mean_token_accuracy": 0.9817710053920746, "num_tokens": 3092710.0, "step": 100 }, { "epoch": 0.5767012687427913, "grad_norm": 4.388030052185059, "learning_rate": 9.759414175528672e-05, "loss": 0.0785, "mean_token_accuracy": 0.983220465183258, "num_tokens": 3868554.0, "step": 125 }, { "epoch": 0.6920415224913494, "grad_norm": 2.8011012077331543, "learning_rate": 9.511480436574225e-05, "loss": 0.0771, "mean_token_accuracy": 0.9840185779333115, "num_tokens": 4646065.0, "step": 150 }, { "epoch": 0.8073817762399077, "grad_norm": 2.1058194637298584, "learning_rate": 9.182350690051133e-05, "loss": 0.0743, "mean_token_accuracy": 0.9849942177534103, "num_tokens": 5416620.0, "step": 175 }, { "epoch": 0.922722029988466, "grad_norm": 2.37309193611145, "learning_rate": 8.777948495147001e-05, "loss": 0.0701, "mean_token_accuracy": 0.9855723404884338, "num_tokens": 6194197.0, "step": 200 }, { "epoch": 1.0369088811995386, "grad_norm": 1.7108154296875, "learning_rate": 8.305552137590832e-05, "loss": 0.0654, "mean_token_accuracy": 0.9864181063391946, "num_tokens": 6962936.0, "step": 225 }, { "epoch": 1.152249134948097, "grad_norm": 1.7341214418411255, "learning_rate": 7.773663637675694e-05, "loss": 0.0596, "mean_token_accuracy": 0.987049458026886, "num_tokens": 7736724.0, "step": 250 }, { "epoch": 1.2675893886966552, "grad_norm": 0.9936047792434692, "learning_rate": 7.191855733945387e-05, "loss": 0.0614, "mean_token_accuracy": 0.9867785835266113, "num_tokens": 8508686.0, "step": 275 }, { "epoch": 1.3829296424452133, "grad_norm": 0.9117940664291382, "learning_rate": 6.570599596477876e-05, "loss": 0.0601, "mean_token_accuracy": 0.986804239153862, "num_tokens": 9286077.0, "step": 300 }, { "epoch": 1.4982698961937717, "grad_norm": 0.961225688457489, "learning_rate": 5.921076370520058e-05, "loss": 0.0577, "mean_token_accuracy": 0.9875673747062683, "num_tokens": 10055871.0, "step": 325 }, { "epoch": 1.6136101499423299, "grad_norm": 0.9213497638702393, "learning_rate": 5.254975942243963e-05, "loss": 0.0697, "mean_token_accuracy": 0.9852276796102524, "num_tokens": 10832689.0, "step": 350 }, { "epoch": 1.728950403690888, "grad_norm": 0.4215187728404999, "learning_rate": 4.584286548366148e-05, "loss": 0.0532, "mean_token_accuracy": 0.9881923282146454, "num_tokens": 11602711.0, "step": 375 }, { "epoch": 1.8442906574394464, "grad_norm": 1.4110305309295654, "learning_rate": 3.92107901616097e-05, "loss": 0.0517, "mean_token_accuracy": 0.9885529279708862, "num_tokens": 12378473.0, "step": 400 }, { "epoch": 1.9596309111880046, "grad_norm": 0.6333124041557312, "learning_rate": 3.277289517038582e-05, "loss": 0.0508, "mean_token_accuracy": 0.98875912129879, "num_tokens": 13155639.0, "step": 425 }, { "epoch": 2.0738177623990772, "grad_norm": 0.5025136470794678, "learning_rate": 2.6645047436109226e-05, "loss": 0.0393, "mean_token_accuracy": 0.9906098294739771, "num_tokens": 13924965.0, "step": 450 }, { "epoch": 2.1891580161476356, "grad_norm": 0.4080626964569092, "learning_rate": 2.0937533765518187e-05, "loss": 0.0315, "mean_token_accuracy": 0.9918009847402572, "num_tokens": 14700622.0, "step": 475 }, { "epoch": 2.304498269896194, "grad_norm": 0.4394913911819458, "learning_rate": 1.5753075943558987e-05, "loss": 0.0317, "mean_token_accuracy": 0.9918285059928894, "num_tokens": 15470240.0, "step": 500 } ], "logging_steps": 25, "max_steps": 651, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.386289764145562e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }