{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.627450980392156,
  "eval_steps": 10,
  "global_step": 180,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.26143790849673204,
      "grad_norm": 80.0328369140625,
      "learning_rate": 1.9682539682539684e-05,
      "loss": 22.6724,
      "mean_token_accuracy": 0.49049999862909316,
      "num_tokens": 49120.0,
      "step": 10
    },
    {
      "epoch": 0.26143790849673204,
      "eval_loss": 4.981190204620361,
      "eval_mean_token_accuracy": 0.5223076802033645,
      "eval_num_tokens": 49120.0,
      "eval_runtime": 17.5192,
      "eval_samples_per_second": 2.854,
      "eval_steps_per_second": 0.742,
      "step": 10
    },
    {
      "epoch": 0.5228758169934641,
      "grad_norm": 57.1182746887207,
      "learning_rate": 1.8624338624338625e-05,
      "loss": 17.5658,
      "mean_token_accuracy": 0.5591249987483025,
      "num_tokens": 98240.0,
      "step": 20
    },
    {
      "epoch": 0.5228758169934641,
      "eval_loss": 3.730682134628296,
      "eval_mean_token_accuracy": 0.5742307580434359,
      "eval_num_tokens": 98240.0,
      "eval_runtime": 17.4511,
      "eval_samples_per_second": 2.865,
      "eval_steps_per_second": 0.745,
      "step": 20
    },
    {
      "epoch": 0.7843137254901961,
      "grad_norm": 58.24352264404297,
      "learning_rate": 1.7566137566137566e-05,
      "loss": 13.113,
      "mean_token_accuracy": 0.6388749912381172,
      "num_tokens": 147360.0,
      "step": 30
    },
    {
      "epoch": 0.7843137254901961,
      "eval_loss": 2.7529046535491943,
      "eval_mean_token_accuracy": 0.7115384431985708,
      "eval_num_tokens": 147360.0,
      "eval_runtime": 17.6786,
      "eval_samples_per_second": 2.828,
      "eval_steps_per_second": 0.735,
      "step": 30
    },
    {
      "epoch": 1.026143790849673,
      "grad_norm": 30.73020362854004,
      "learning_rate": 1.6507936507936507e-05,
      "loss": 8.7369,
      "mean_token_accuracy": 0.7555405417004147,
      "num_tokens": 192796.0,
      "step": 40
    },
    {
      "epoch": 1.026143790849673,
      "eval_loss": 1.836286187171936,
      "eval_mean_token_accuracy": 0.8476923199800345,
      "eval_num_tokens": 192796.0,
      "eval_runtime": 17.4538,
      "eval_samples_per_second": 2.865,
      "eval_steps_per_second": 0.745,
      "step": 40
    },
    {
      "epoch": 1.287581699346405,
      "grad_norm": 21.440105438232422,
      "learning_rate": 1.544973544973545e-05,
      "loss": 6.2989,
      "mean_token_accuracy": 0.8789999932050705,
      "num_tokens": 241916.0,
      "step": 50
    },
    {
      "epoch": 1.287581699346405,
      "eval_loss": 1.3245856761932373,
      "eval_mean_token_accuracy": 0.8942307508908786,
      "eval_num_tokens": 241916.0,
      "eval_runtime": 17.6384,
      "eval_samples_per_second": 2.835,
      "eval_steps_per_second": 0.737,
      "step": 50
    },
    {
      "epoch": 1.5490196078431373,
      "grad_norm": 10.908184051513672,
      "learning_rate": 1.4391534391534392e-05,
      "loss": 4.5427,
      "mean_token_accuracy": 0.9057500049471855,
      "num_tokens": 291036.0,
      "step": 60
    },
    {
      "epoch": 1.5490196078431373,
      "eval_loss": 0.9362902045249939,
      "eval_mean_token_accuracy": 0.9126923221808213,
      "eval_num_tokens": 291036.0,
      "eval_runtime": 17.472,
      "eval_samples_per_second": 2.862,
      "eval_steps_per_second": 0.744,
      "step": 60
    },
    {
      "epoch": 1.8104575163398693,
      "grad_norm": 11.274917602539062,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 3.4538,
      "mean_token_accuracy": 0.9126250177621842,
      "num_tokens": 340156.0,
      "step": 70
    },
    {
      "epoch": 1.8104575163398693,
      "eval_loss": 0.7567419409751892,
      "eval_mean_token_accuracy": 0.9103846183189979,
      "eval_num_tokens": 340156.0,
      "eval_runtime": 17.5544,
      "eval_samples_per_second": 2.848,
      "eval_steps_per_second": 0.741,
      "step": 70
    },
    {
      "epoch": 2.052287581699346,
      "grad_norm": 13.183298110961914,
      "learning_rate": 1.2275132275132276e-05,
      "loss": 2.5315,
      "mean_token_accuracy": 0.9293243321212562,
      "num_tokens": 385592.0,
      "step": 80
    },
    {
      "epoch": 2.052287581699346,
      "eval_loss": 0.6147329807281494,
      "eval_mean_token_accuracy": 0.9292307771169223,
      "eval_num_tokens": 385592.0,
      "eval_runtime": 17.582,
      "eval_samples_per_second": 2.844,
      "eval_steps_per_second": 0.739,
      "step": 80
    },
    {
      "epoch": 2.313725490196078,
      "grad_norm": 12.541313171386719,
      "learning_rate": 1.1216931216931217e-05,
      "loss": 2.2552,
      "mean_token_accuracy": 0.9353750020265579,
      "num_tokens": 434712.0,
      "step": 90
    },
    {
      "epoch": 2.313725490196078,
      "eval_loss": 0.5065863132476807,
      "eval_mean_token_accuracy": 0.9465384529187129,
      "eval_num_tokens": 434712.0,
      "eval_runtime": 17.6818,
      "eval_samples_per_second": 2.828,
      "eval_steps_per_second": 0.735,
      "step": 90
    },
    {
      "epoch": 2.57516339869281,
      "grad_norm": 7.007925033569336,
      "learning_rate": 1.015873015873016e-05,
      "loss": 1.9547,
      "mean_token_accuracy": 0.9474999904632568,
      "num_tokens": 483832.0,
      "step": 100
    },
    {
      "epoch": 2.57516339869281,
      "eval_loss": 0.4519544243812561,
      "eval_mean_token_accuracy": 0.9511538331325238,
      "eval_num_tokens": 483832.0,
      "eval_runtime": 17.5399,
      "eval_samples_per_second": 2.851,
      "eval_steps_per_second": 0.741,
      "step": 100
    },
    {
      "epoch": 2.8366013071895426,
      "grad_norm": 6.267516613006592,
      "learning_rate": 9.1005291005291e-06,
      "loss": 1.7199,
      "mean_token_accuracy": 0.9503749877214431,
      "num_tokens": 532952.0,
      "step": 110
    },
    {
      "epoch": 2.8366013071895426,
      "eval_loss": 0.396070659160614,
      "eval_mean_token_accuracy": 0.9519230631681589,
      "eval_num_tokens": 532952.0,
      "eval_runtime": 17.6107,
      "eval_samples_per_second": 2.839,
      "eval_steps_per_second": 0.738,
      "step": 110
    },
    {
      "epoch": 3.0784313725490198,
      "grad_norm": 6.098326683044434,
      "learning_rate": 8.042328042328043e-06,
      "loss": 1.3242,
      "mean_token_accuracy": 0.9683783892038706,
      "num_tokens": 578388.0,
      "step": 120
    },
    {
      "epoch": 3.0784313725490198,
      "eval_loss": 0.3188876807689667,
      "eval_mean_token_accuracy": 0.9742307938062228,
      "eval_num_tokens": 578388.0,
      "eval_runtime": 17.8412,
      "eval_samples_per_second": 2.802,
      "eval_steps_per_second": 0.729,
      "step": 120
    },
    {
      "epoch": 3.3398692810457518,
      "grad_norm": 9.010005950927734,
      "learning_rate": 6.984126984126984e-06,
      "loss": 1.1048,
      "mean_token_accuracy": 0.9745000153779984,
      "num_tokens": 627508.0,
      "step": 130
    },
    {
      "epoch": 3.3398692810457518,
      "eval_loss": 0.2367120385169983,
      "eval_mean_token_accuracy": 0.9719230899443994,
      "eval_num_tokens": 627508.0,
      "eval_runtime": 17.8266,
      "eval_samples_per_second": 2.805,
      "eval_steps_per_second": 0.729,
      "step": 130
    },
    {
      "epoch": 3.6013071895424837,
      "grad_norm": 8.258376121520996,
      "learning_rate": 5.925925925925926e-06,
      "loss": 0.7666,
      "mean_token_accuracy": 0.9731250181794167,
      "num_tokens": 676628.0,
      "step": 140
    },
    {
      "epoch": 3.6013071895424837,
      "eval_loss": 0.14354808628559113,
      "eval_mean_token_accuracy": 0.9723077049622169,
      "eval_num_tokens": 676628.0,
      "eval_runtime": 17.4766,
      "eval_samples_per_second": 2.861,
      "eval_steps_per_second": 0.744,
      "step": 140
    },
    {
      "epoch": 3.8627450980392157,
      "grad_norm": 7.381052494049072,
      "learning_rate": 4.867724867724868e-06,
      "loss": 0.496,
      "mean_token_accuracy": 0.9710000097751618,
      "num_tokens": 725748.0,
      "step": 150
    },
    {
      "epoch": 3.8627450980392157,
      "eval_loss": 0.1069604679942131,
      "eval_mean_token_accuracy": 0.975769253877493,
      "eval_num_tokens": 725748.0,
      "eval_runtime": 17.6354,
      "eval_samples_per_second": 2.835,
      "eval_steps_per_second": 0.737,
      "step": 150
    },
    {
      "epoch": 4.104575163398692,
      "grad_norm": 4.855284690856934,
      "learning_rate": 3.80952380952381e-06,
      "loss": 0.3519,
      "mean_token_accuracy": 0.9745946172121409,
      "num_tokens": 771184.0,
      "step": 160
    },
    {
      "epoch": 4.104575163398692,
      "eval_loss": 0.08544553071260452,
      "eval_mean_token_accuracy": 0.975769253877493,
      "eval_num_tokens": 771184.0,
      "eval_runtime": 17.4807,
      "eval_samples_per_second": 2.86,
      "eval_steps_per_second": 0.744,
      "step": 160
    },
    {
      "epoch": 4.366013071895424,
      "grad_norm": 7.573007106781006,
      "learning_rate": 2.7513227513227516e-06,
      "loss": 0.2938,
      "mean_token_accuracy": 0.97537502348423,
      "num_tokens": 820304.0,
      "step": 170
    },
    {
      "epoch": 4.366013071895424,
      "eval_loss": 0.06600862741470337,
      "eval_mean_token_accuracy": 0.9753846342747028,
      "eval_num_tokens": 820304.0,
      "eval_runtime": 17.6346,
      "eval_samples_per_second": 2.835,
      "eval_steps_per_second": 0.737,
      "step": 170
    },
    {
      "epoch": 4.627450980392156,
      "grad_norm": 10.326849937438965,
      "learning_rate": 1.6931216931216932e-06,
      "loss": 0.24,
      "mean_token_accuracy": 0.9745000198483467,
      "num_tokens": 869424.0,
      "step": 180
    },
    {
      "epoch": 4.627450980392156,
      "eval_loss": 0.056862689554691315,
      "eval_mean_token_accuracy": 0.9765384839131281,
      "eval_num_tokens": 869424.0,
      "eval_runtime": 17.4728,
      "eval_samples_per_second": 2.862,
      "eval_steps_per_second": 0.744,
      "step": 180
    }
  ],
  "logging_steps": 10,
  "max_steps": 195,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.260776445226957e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}