{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.304498269896194,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11534025374855825,
      "grad_norm": 40.231910705566406,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.2361,
      "mean_token_accuracy": 0.9513261806964874,
      "num_tokens": 776427.0,
      "step": 25
    },
    {
      "epoch": 0.2306805074971165,
      "grad_norm": 12.624946594238281,
      "learning_rate": 7.424242424242424e-05,
      "loss": 0.0839,
      "mean_token_accuracy": 0.9820117330551148,
      "num_tokens": 1545851.0,
      "step": 50
    },
    {
      "epoch": 0.3460207612456747,
      "grad_norm": 7.605620384216309,
      "learning_rate": 9.99538638664362e-05,
      "loss": 0.0841,
      "mean_token_accuracy": 0.9822509652376175,
      "num_tokens": 2316893.0,
      "step": 75
    },
    {
      "epoch": 0.461361014994233,
      "grad_norm": 5.328527927398682,
      "learning_rate": 9.921689684362989e-05,
      "loss": 0.0856,
      "mean_token_accuracy": 0.9817710053920746,
      "num_tokens": 3092710.0,
      "step": 100
    },
    {
      "epoch": 0.5767012687427913,
      "grad_norm": 4.388030052185059,
      "learning_rate": 9.759414175528672e-05,
      "loss": 0.0785,
      "mean_token_accuracy": 0.983220465183258,
      "num_tokens": 3868554.0,
      "step": 125
    },
    {
      "epoch": 0.6920415224913494,
      "grad_norm": 2.8011012077331543,
      "learning_rate": 9.511480436574225e-05,
      "loss": 0.0771,
      "mean_token_accuracy": 0.9840185779333115,
      "num_tokens": 4646065.0,
      "step": 150
    },
    {
      "epoch": 0.8073817762399077,
      "grad_norm": 2.1058194637298584,
      "learning_rate": 9.182350690051133e-05,
      "loss": 0.0743,
      "mean_token_accuracy": 0.9849942177534103,
      "num_tokens": 5416620.0,
      "step": 175
    },
    {
      "epoch": 0.922722029988466,
      "grad_norm": 2.37309193611145,
      "learning_rate": 8.777948495147001e-05,
      "loss": 0.0701,
      "mean_token_accuracy": 0.9855723404884338,
      "num_tokens": 6194197.0,
      "step": 200
    },
    {
      "epoch": 1.0369088811995386,
      "grad_norm": 1.7108154296875,
      "learning_rate": 8.305552137590832e-05,
      "loss": 0.0654,
      "mean_token_accuracy": 0.9864181063391946,
      "num_tokens": 6962936.0,
      "step": 225
    },
    {
      "epoch": 1.152249134948097,
      "grad_norm": 1.7341214418411255,
      "learning_rate": 7.773663637675694e-05,
      "loss": 0.0596,
      "mean_token_accuracy": 0.987049458026886,
      "num_tokens": 7736724.0,
      "step": 250
    },
    {
      "epoch": 1.2675893886966552,
      "grad_norm": 0.9936047792434692,
      "learning_rate": 7.191855733945387e-05,
      "loss": 0.0614,
      "mean_token_accuracy": 0.9867785835266113,
      "num_tokens": 8508686.0,
      "step": 275
    },
    {
      "epoch": 1.3829296424452133,
      "grad_norm": 0.9117940664291382,
      "learning_rate": 6.570599596477876e-05,
      "loss": 0.0601,
      "mean_token_accuracy": 0.986804239153862,
      "num_tokens": 9286077.0,
      "step": 300
    },
    {
      "epoch": 1.4982698961937717,
      "grad_norm": 0.961225688457489,
      "learning_rate": 5.921076370520058e-05,
      "loss": 0.0577,
      "mean_token_accuracy": 0.9875673747062683,
      "num_tokens": 10055871.0,
      "step": 325
    },
    {
      "epoch": 1.6136101499423299,
      "grad_norm": 0.9213497638702393,
      "learning_rate": 5.254975942243963e-05,
      "loss": 0.0697,
      "mean_token_accuracy": 0.9852276796102524,
      "num_tokens": 10832689.0,
      "step": 350
    },
    {
      "epoch": 1.728950403690888,
      "grad_norm": 0.4215187728404999,
      "learning_rate": 4.584286548366148e-05,
      "loss": 0.0532,
      "mean_token_accuracy": 0.9881923282146454,
      "num_tokens": 11602711.0,
      "step": 375
    },
    {
      "epoch": 1.8442906574394464,
      "grad_norm": 1.4110305309295654,
      "learning_rate": 3.92107901616097e-05,
      "loss": 0.0517,
      "mean_token_accuracy": 0.9885529279708862,
      "num_tokens": 12378473.0,
      "step": 400
    },
    {
      "epoch": 1.9596309111880046,
      "grad_norm": 0.6333124041557312,
      "learning_rate": 3.277289517038582e-05,
      "loss": 0.0508,
      "mean_token_accuracy": 0.98875912129879,
      "num_tokens": 13155639.0,
      "step": 425
    },
    {
      "epoch": 2.0738177623990772,
      "grad_norm": 0.5025136470794678,
      "learning_rate": 2.6645047436109226e-05,
      "loss": 0.0393,
      "mean_token_accuracy": 0.9906098294739771,
      "num_tokens": 13924965.0,
      "step": 450
    },
    {
      "epoch": 2.1891580161476356,
      "grad_norm": 0.4080626964569092,
      "learning_rate": 2.0937533765518187e-05,
      "loss": 0.0315,
      "mean_token_accuracy": 0.9918009847402572,
      "num_tokens": 14700622.0,
      "step": 475
    },
    {
      "epoch": 2.304498269896194,
      "grad_norm": 0.4394913911819458,
      "learning_rate": 1.5753075943558987e-05,
      "loss": 0.0317,
      "mean_token_accuracy": 0.9918285059928894,
      "num_tokens": 15470240.0,
      "step": 500
    }
  ],
  "logging_steps": 25,
  "max_steps": 651,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.386289764145562e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}