{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.990490124359912,
  "eval_steps": 500,
  "global_step": 510,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.058522311631309436,
      "grad_norm": 1.5647390529813434,
      "learning_rate": 5e-06,
      "loss": 0.8022,
      "step": 10
    },
    {
      "epoch": 0.11704462326261887,
      "grad_norm": 2.4298906740936874,
      "learning_rate": 5e-06,
      "loss": 0.7306,
      "step": 20
    },
    {
      "epoch": 0.1755669348939283,
      "grad_norm": 1.1278550883615037,
      "learning_rate": 5e-06,
      "loss": 0.7122,
      "step": 30
    },
    {
      "epoch": 0.23408924652523774,
      "grad_norm": 1.1246574687423805,
      "learning_rate": 5e-06,
      "loss": 0.6975,
      "step": 40
    },
    {
      "epoch": 0.29261155815654716,
      "grad_norm": 1.0811775432928663,
      "learning_rate": 5e-06,
      "loss": 0.6821,
      "step": 50
    },
    {
      "epoch": 0.3511338697878566,
      "grad_norm": 0.8345121386846462,
      "learning_rate": 5e-06,
      "loss": 0.6822,
      "step": 60
    },
    {
      "epoch": 0.40965618141916604,
      "grad_norm": 0.502423577542533,
      "learning_rate": 5e-06,
      "loss": 0.6631,
      "step": 70
    },
    {
      "epoch": 0.4681784930504755,
      "grad_norm": 0.3206403744702351,
      "learning_rate": 5e-06,
      "loss": 0.6566,
      "step": 80
    },
    {
      "epoch": 0.5267008046817849,
      "grad_norm": 0.3373586439028653,
      "learning_rate": 5e-06,
      "loss": 0.6613,
      "step": 90
    },
    {
      "epoch": 0.5852231163130943,
      "grad_norm": 0.27440465078078524,
      "learning_rate": 5e-06,
      "loss": 0.6497,
      "step": 100
    },
    {
      "epoch": 0.6437454279444038,
      "grad_norm": 0.25729298157504654,
      "learning_rate": 5e-06,
      "loss": 0.6506,
      "step": 110
    },
    {
      "epoch": 0.7022677395757132,
      "grad_norm": 0.2774645357576214,
      "learning_rate": 5e-06,
      "loss": 0.6479,
      "step": 120
    },
    {
      "epoch": 0.7607900512070227,
      "grad_norm": 0.2750786001903561,
      "learning_rate": 5e-06,
      "loss": 0.6509,
      "step": 130
    },
    {
      "epoch": 0.8193123628383321,
      "grad_norm": 0.3018000353503424,
      "learning_rate": 5e-06,
      "loss": 0.6532,
      "step": 140
    },
    {
      "epoch": 0.8778346744696416,
      "grad_norm": 0.2651836266343764,
      "learning_rate": 5e-06,
      "loss": 0.6407,
      "step": 150
    },
    {
      "epoch": 0.936356986100951,
      "grad_norm": 0.2621590800809169,
      "learning_rate": 5e-06,
      "loss": 0.646,
      "step": 160
    },
    {
      "epoch": 0.9948792977322605,
      "grad_norm": 0.274425449696112,
      "learning_rate": 5e-06,
      "loss": 0.6429,
      "step": 170
    },
    {
      "epoch": 0.9948792977322605,
      "eval_loss": 0.6455708742141724,
      "eval_runtime": 172.3102,
      "eval_samples_per_second": 53.444,
      "eval_steps_per_second": 0.418,
      "step": 170
    },
    {
      "epoch": 1.0563277249451353,
      "grad_norm": 0.32659804954217203,
      "learning_rate": 5e-06,
      "loss": 0.6596,
      "step": 180
    },
    {
      "epoch": 1.1148500365764447,
      "grad_norm": 0.29235225467356285,
      "learning_rate": 5e-06,
      "loss": 0.6219,
      "step": 190
    },
    {
      "epoch": 1.1733723482077543,
      "grad_norm": 0.2686911846272285,
      "learning_rate": 5e-06,
      "loss": 0.6246,
      "step": 200
    },
    {
      "epoch": 1.2318946598390637,
      "grad_norm": 0.2689856133611371,
      "learning_rate": 5e-06,
      "loss": 0.618,
      "step": 210
    },
    {
      "epoch": 1.290416971470373,
      "grad_norm": 0.26872283242131406,
      "learning_rate": 5e-06,
      "loss": 0.6202,
      "step": 220
    },
    {
      "epoch": 1.3489392831016827,
      "grad_norm": 0.301091252809549,
      "learning_rate": 5e-06,
      "loss": 0.618,
      "step": 230
    },
    {
      "epoch": 1.4074615947329918,
      "grad_norm": 0.2920775430394786,
      "learning_rate": 5e-06,
      "loss": 0.6142,
      "step": 240
    },
    {
      "epoch": 1.4659839063643014,
      "grad_norm": 0.2456820075171799,
      "learning_rate": 5e-06,
      "loss": 0.6155,
      "step": 250
    },
    {
      "epoch": 1.5245062179956108,
      "grad_norm": 0.2938378044663654,
      "learning_rate": 5e-06,
      "loss": 0.6187,
      "step": 260
    },
    {
      "epoch": 1.5830285296269202,
      "grad_norm": 0.32438651891226156,
      "learning_rate": 5e-06,
      "loss": 0.6219,
      "step": 270
    },
    {
      "epoch": 1.6415508412582298,
      "grad_norm": 0.25545801371272864,
      "learning_rate": 5e-06,
      "loss": 0.616,
      "step": 280
    },
    {
      "epoch": 1.700073152889539,
      "grad_norm": 0.26294073057220163,
      "learning_rate": 5e-06,
      "loss": 0.6127,
      "step": 290
    },
    {
      "epoch": 1.7585954645208486,
      "grad_norm": 0.26462245389002803,
      "learning_rate": 5e-06,
      "loss": 0.6168,
      "step": 300
    },
    {
      "epoch": 1.817117776152158,
      "grad_norm": 0.2847262707293318,
      "learning_rate": 5e-06,
      "loss": 0.6172,
      "step": 310
    },
    {
      "epoch": 1.8756400877834674,
      "grad_norm": 0.2669714428041422,
      "learning_rate": 5e-06,
      "loss": 0.614,
      "step": 320
    },
    {
      "epoch": 1.934162399414777,
      "grad_norm": 0.25457144598514,
      "learning_rate": 5e-06,
      "loss": 0.6166,
      "step": 330
    },
    {
      "epoch": 1.9926847110460864,
      "grad_norm": 0.2608967910015083,
      "learning_rate": 5e-06,
      "loss": 0.6126,
      "step": 340
    },
    {
      "epoch": 1.9926847110460864,
      "eval_loss": 0.6363422274589539,
      "eval_runtime": 172.306,
      "eval_samples_per_second": 53.446,
      "eval_steps_per_second": 0.418,
      "step": 340
    },
    {
      "epoch": 2.0541331382589614,
      "grad_norm": 0.271883921683299,
      "learning_rate": 5e-06,
      "loss": 0.6297,
      "step": 350
    },
    {
      "epoch": 2.1126554498902705,
      "grad_norm": 0.24729272080119263,
      "learning_rate": 5e-06,
      "loss": 0.5897,
      "step": 360
    },
    {
      "epoch": 2.17117776152158,
      "grad_norm": 0.27092891797600144,
      "learning_rate": 5e-06,
      "loss": 0.5946,
      "step": 370
    },
    {
      "epoch": 2.2297000731528893,
      "grad_norm": 0.3032127102208398,
      "learning_rate": 5e-06,
      "loss": 0.594,
      "step": 380
    },
    {
      "epoch": 2.288222384784199,
      "grad_norm": 0.25853126440367846,
      "learning_rate": 5e-06,
      "loss": 0.588,
      "step": 390
    },
    {
      "epoch": 2.3467446964155085,
      "grad_norm": 0.3077689025344159,
      "learning_rate": 5e-06,
      "loss": 0.5943,
      "step": 400
    },
    {
      "epoch": 2.4052670080468177,
      "grad_norm": 0.2827487146132787,
      "learning_rate": 5e-06,
      "loss": 0.5933,
      "step": 410
    },
    {
      "epoch": 2.4637893196781273,
      "grad_norm": 0.2519214403191199,
      "learning_rate": 5e-06,
      "loss": 0.5898,
      "step": 420
    },
    {
      "epoch": 2.522311631309437,
      "grad_norm": 0.2751668540595721,
      "learning_rate": 5e-06,
      "loss": 0.588,
      "step": 430
    },
    {
      "epoch": 2.580833942940746,
      "grad_norm": 0.2530698336402752,
      "learning_rate": 5e-06,
      "loss": 0.5883,
      "step": 440
    },
    {
      "epoch": 2.6393562545720557,
      "grad_norm": 0.25471213766207895,
      "learning_rate": 5e-06,
      "loss": 0.5951,
      "step": 450
    },
    {
      "epoch": 2.6978785662033653,
      "grad_norm": 0.29077003251470107,
      "learning_rate": 5e-06,
      "loss": 0.5914,
      "step": 460
    },
    {
      "epoch": 2.7564008778346745,
      "grad_norm": 0.30152118910674564,
      "learning_rate": 5e-06,
      "loss": 0.5917,
      "step": 470
    },
    {
      "epoch": 2.8149231894659836,
      "grad_norm": 0.26709177034419973,
      "learning_rate": 5e-06,
      "loss": 0.5923,
      "step": 480
    },
    {
      "epoch": 2.8734455010972932,
      "grad_norm": 0.23395614388611538,
      "learning_rate": 5e-06,
      "loss": 0.5888,
      "step": 490
    },
    {
      "epoch": 2.931967812728603,
      "grad_norm": 0.28669880402317394,
      "learning_rate": 5e-06,
      "loss": 0.5865,
      "step": 500
    },
    {
      "epoch": 2.990490124359912,
      "grad_norm": 0.2531977873715163,
      "learning_rate": 5e-06,
      "loss": 0.5898,
      "step": 510
    },
    {
      "epoch": 2.990490124359912,
      "eval_loss": 0.63369220495224,
      "eval_runtime": 171.9297,
      "eval_samples_per_second": 53.563,
      "eval_steps_per_second": 0.419,
      "step": 510
    },
    {
      "epoch": 2.990490124359912,
      "step": 510,
      "total_flos": 2138433883471872.0,
      "train_loss": 0.6292863135244332,
      "train_runtime": 27713.9731,
      "train_samples_per_second": 18.939,
      "train_steps_per_second": 0.018
    }
  ],
  "logging_steps": 10,
  "max_steps": 510,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2138433883471872.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}