{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 164,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 0.0,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 0.0,
      "step": 1
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.0,
      "learning_rate": 5.882352941176471e-05,
      "loss": 0.0,
      "step": 5
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.0,
      "learning_rate": 0.00011764705882352942,
      "loss": 0.0,
      "step": 10
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.0,
      "learning_rate": 0.00017647058823529413,
      "loss": 0.0,
      "step": 15
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.0,
      "learning_rate": 0.00019979453927503364,
      "loss": 0.0,
      "step": 20
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.0,
      "learning_rate": 0.00019854200213552424,
      "loss": 0.0,
      "step": 25
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.0,
      "learning_rate": 0.00019616534368410365,
      "loss": 0.0,
      "step": 30
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.0,
      "learning_rate": 0.0001926916757346022,
      "loss": 0.0,
      "step": 35
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.0,
      "learning_rate": 0.0001881606242748009,
      "loss": 0.0,
      "step": 40
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.0,
      "learning_rate": 0.0001826238774315995,
      "loss": 0.0,
      "step": 45
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.0,
      "learning_rate": 0.00017614459583691346,
      "loss": 0.0,
      "step": 50
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.0,
      "learning_rate": 0.00016879669212057187,
      "loss": 0.0,
      "step": 55
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.0,
      "learning_rate": 0.00016066398774942554,
      "loss": 0.0,
      "step": 60
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.0,
      "learning_rate": 0.00015183925683105254,
      "loss": 0.0,
      "step": 65
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.0,
      "learning_rate": 0.00014242316778990372,
      "loss": 0.0,
      "step": 70
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.0,
      "learning_rate": 0.00013252313498875472,
      "loss": 0.0,
      "step": 75
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.0,
      "learning_rate": 0.00012225209339563145,
      "loss": 0.0,
      "step": 80
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.0,
      "learning_rate": 0.0001117272102742402,
      "loss": 0.0,
      "step": 85
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.0,
      "learning_rate": 0.00010106854859433734,
      "loss": 0.0,
      "step": 90
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.0,
      "learning_rate": 9.039769740923183e-05,
      "loss": 0.0,
      "step": 95
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.0,
      "learning_rate": 7.98363848244367e-05,
      "loss": 0.0,
      "step": 100
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.1135515643015182,
      "learning_rate": 6.950508938007729e-05,
      "loss": 1.0139,
      "step": 105
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.11332845930420761,
      "learning_rate": 5.952166568776062e-05,
      "loss": 0.9484,
      "step": 110
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.09969915846105798,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.9596,
      "step": 115
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.102905226481938,
      "learning_rate": 4.1048711048834033e-05,
      "loss": 0.9166,
      "step": 120
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.10339973049955652,
      "learning_rate": 3.276991097386831e-05,
      "loss": 0.917,
      "step": 125
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.09023053789634858,
      "learning_rate": 2.525804047449648e-05,
      "loss": 0.9277,
      "step": 130
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.09169398706127865,
      "learning_rate": 1.8598791474341514e-05,
      "loss": 0.9077,
      "step": 135
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.08727749348203608,
      "learning_rate": 1.286812958766106e-05,
      "loss": 0.9067,
      "step": 140
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.0954839431246452,
      "learning_rate": 8.131427538964164e-06,
      "loss": 0.91,
      "step": 145
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.09504819554849074,
      "learning_rate": 4.442719421385922e-06,
      "loss": 0.9206,
      "step": 150
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.08832758387266068,
      "learning_rate": 1.8440843008934561e-06,
      "loss": 0.9107,
      "step": 155
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.09005484999206638,
      "learning_rate": 3.651661978793075e-07,
      "loss": 0.9393,
      "step": 160
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.1427615880966187,
      "eval_runtime": 200.7847,
      "eval_samples_per_second": 11.505,
      "eval_steps_per_second": 0.722,
      "step": 164
    },
    {
      "epoch": 1.0,
      "step": 164,
      "total_flos": 2050416313368576.0,
      "train_loss": 0.3630331289477465,
      "train_runtime": 1302.3826,
      "train_samples_per_second": 16.102,
      "train_steps_per_second": 0.126
    }
  ],
  "logging_steps": 5,
  "max_steps": 164,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 2050416313368576.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}