{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 2.5752453804016113, "learning_rate": 0.00019625, "loss": 1.2442, "mean_token_accuracy": 0.7333131074905396, "num_tokens": 8445.0, "step": 10 }, { "epoch": 0.125, "grad_norm": 2.048556327819824, "learning_rate": 0.00019208333333333336, "loss": 0.8951, "mean_token_accuracy": 0.7932066351175309, "num_tokens": 16406.0, "step": 20 }, { "epoch": 0.1875, "grad_norm": 1.801416277885437, "learning_rate": 0.00018833333333333335, "loss": 0.9104, "mean_token_accuracy": 0.7811209857463837, "num_tokens": 26390.0, "step": 30 }, { "epoch": 0.25, "grad_norm": 2.045475721359253, "learning_rate": 0.00018416666666666665, "loss": 0.7925, "mean_token_accuracy": 0.8067008703947067, "num_tokens": 35985.0, "step": 40 }, { "epoch": 0.3125, "grad_norm": 1.7908813953399658, "learning_rate": 0.00018, "loss": 0.7831, "mean_token_accuracy": 0.8088378280401229, "num_tokens": 45075.0, "step": 50 }, { "epoch": 0.375, "grad_norm": 1.80556321144104, "learning_rate": 0.00017583333333333334, "loss": 0.7615, "mean_token_accuracy": 0.8116786539554596, "num_tokens": 54190.0, "step": 60 }, { "epoch": 0.4375, "grad_norm": 1.7150429487228394, "learning_rate": 0.00017166666666666667, "loss": 0.7645, "mean_token_accuracy": 0.814566045999527, "num_tokens": 63103.0, "step": 70 }, { "epoch": 0.5, "grad_norm": 2.1259572505950928, "learning_rate": 0.0001675, "loss": 0.7304, "mean_token_accuracy": 0.8198390424251556, "num_tokens": 71820.0, "step": 80 }, { "epoch": 0.5625, "grad_norm": 1.862838864326477, "learning_rate": 0.00016333333333333334, "loss": 0.8033, "mean_token_accuracy": 0.8004607737064362, "num_tokens": 80005.0, "step": 90 }, { "epoch": 0.625, "grad_norm": 1.9493387937545776, "learning_rate": 0.00015916666666666667, "loss": 0.7443, "mean_token_accuracy": 0.8172103732824325, "num_tokens": 89061.0, "step": 100 }, { "epoch": 0.6875, "grad_norm": 1.6027424335479736, "learning_rate": 0.000155, "loss": 0.6381, "mean_token_accuracy": 0.840108859539032, "num_tokens": 97490.0, "step": 110 }, { "epoch": 0.75, "grad_norm": 1.662534236907959, "learning_rate": 0.00015083333333333333, "loss": 0.6839, "mean_token_accuracy": 0.8289641946554184, "num_tokens": 106341.0, "step": 120 }, { "epoch": 0.8125, "grad_norm": 1.8097492456436157, "learning_rate": 0.00014666666666666666, "loss": 0.6565, "mean_token_accuracy": 0.8321241825819016, "num_tokens": 114882.0, "step": 130 }, { "epoch": 0.875, "grad_norm": 1.62300705909729, "learning_rate": 0.00014250000000000002, "loss": 0.7479, "mean_token_accuracy": 0.8141300559043885, "num_tokens": 124454.0, "step": 140 }, { "epoch": 0.9375, "grad_norm": 1.7561979293823242, "learning_rate": 0.00013833333333333333, "loss": 0.7585, "mean_token_accuracy": 0.8148397266864776, "num_tokens": 134032.0, "step": 150 }, { "epoch": 1.0, "grad_norm": 1.7069191932678223, "learning_rate": 0.00013416666666666666, "loss": 0.6857, "mean_token_accuracy": 0.8240068614482879, "num_tokens": 142641.0, "step": 160 }, { "epoch": 1.0625, "grad_norm": 1.4199870824813843, "learning_rate": 0.00013000000000000002, "loss": 0.5387, "mean_token_accuracy": 0.855182683467865, "num_tokens": 151794.0, "step": 170 }, { "epoch": 1.125, "grad_norm": 1.7815680503845215, "learning_rate": 0.00012583333333333335, "loss": 0.4761, "mean_token_accuracy": 0.8708458811044693, "num_tokens": 
160718.0, "step": 180 }, { "epoch": 1.1875, "grad_norm": 1.5737107992172241, "learning_rate": 0.00012166666666666667, "loss": 0.4533, "mean_token_accuracy": 0.8779246717691421, "num_tokens": 169265.0, "step": 190 }, { "epoch": 1.25, "grad_norm": 1.5212680101394653, "learning_rate": 0.00011750000000000001, "loss": 0.4593, "mean_token_accuracy": 0.8714338272809983, "num_tokens": 178575.0, "step": 200 }, { "epoch": 1.3125, "grad_norm": 1.6155683994293213, "learning_rate": 0.00011333333333333334, "loss": 0.4431, "mean_token_accuracy": 0.8786879241466522, "num_tokens": 187096.0, "step": 210 }, { "epoch": 1.375, "grad_norm": 1.5768648386001587, "learning_rate": 0.00010916666666666666, "loss": 0.472, "mean_token_accuracy": 0.8672873705625535, "num_tokens": 196017.0, "step": 220 }, { "epoch": 1.4375, "grad_norm": 1.7005512714385986, "learning_rate": 0.000105, "loss": 0.5119, "mean_token_accuracy": 0.8617127776145935, "num_tokens": 205263.0, "step": 230 }, { "epoch": 1.5, "grad_norm": 1.488906741142273, "learning_rate": 0.00010083333333333334, "loss": 0.4911, "mean_token_accuracy": 0.8672636389732361, "num_tokens": 214207.0, "step": 240 }, { "epoch": 1.5625, "grad_norm": 1.4198739528656006, "learning_rate": 9.666666666666667e-05, "loss": 0.4975, "mean_token_accuracy": 0.8657440841197968, "num_tokens": 223185.0, "step": 250 }, { "epoch": 1.625, "grad_norm": 1.7783927917480469, "learning_rate": 9.250000000000001e-05, "loss": 0.4264, "mean_token_accuracy": 0.8798577606678009, "num_tokens": 231929.0, "step": 260 }, { "epoch": 1.6875, "grad_norm": 1.6650081872940063, "learning_rate": 8.833333333333333e-05, "loss": 0.5051, "mean_token_accuracy": 0.8662203460931778, "num_tokens": 240854.0, "step": 270 }, { "epoch": 1.75, "grad_norm": 1.7360109090805054, "learning_rate": 8.416666666666668e-05, "loss": 0.4374, "mean_token_accuracy": 0.8734647989273071, "num_tokens": 249782.0, "step": 280 }, { "epoch": 1.8125, "grad_norm": 1.6823853254318237, "learning_rate": 8e-05, "loss": 0.5122, "mean_token_accuracy": 0.8641923427581787, "num_tokens": 258703.0, "step": 290 }, { "epoch": 1.875, "grad_norm": 1.5906269550323486, "learning_rate": 7.583333333333334e-05, "loss": 0.4609, "mean_token_accuracy": 0.870806086063385, "num_tokens": 267959.0, "step": 300 }, { "epoch": 1.9375, "grad_norm": 1.8694252967834473, "learning_rate": 7.166666666666667e-05, "loss": 0.453, "mean_token_accuracy": 0.8763782948255538, "num_tokens": 276882.0, "step": 310 }, { "epoch": 2.0, "grad_norm": 2.3819427490234375, "learning_rate": 6.750000000000001e-05, "loss": 0.461, "mean_token_accuracy": 0.870858433842659, "num_tokens": 285282.0, "step": 320 }, { "epoch": 2.0625, "grad_norm": 1.8256394863128662, "learning_rate": 6.333333333333333e-05, "loss": 0.2972, "mean_token_accuracy": 0.9117195069789886, "num_tokens": 293953.0, "step": 330 }, { "epoch": 2.125, "grad_norm": 2.1064252853393555, "learning_rate": 5.916666666666667e-05, "loss": 0.2972, "mean_token_accuracy": 0.9105454385280609, "num_tokens": 302419.0, "step": 340 }, { "epoch": 2.1875, "grad_norm": 1.7595354318618774, "learning_rate": 5.500000000000001e-05, "loss": 0.2999, "mean_token_accuracy": 0.9117335736751556, "num_tokens": 311476.0, "step": 350 }, { "epoch": 2.25, "grad_norm": 1.5239249467849731, "learning_rate": 5.0833333333333333e-05, "loss": 0.2678, "mean_token_accuracy": 0.9190936297178268, "num_tokens": 320231.0, "step": 360 }, { "epoch": 2.3125, "grad_norm": 2.142015218734741, "learning_rate": 4.666666666666667e-05, "loss": 0.2834, "mean_token_accuracy": 0.9148718416690826, 
"num_tokens": 329086.0, "step": 370 }, { "epoch": 2.375, "grad_norm": 1.3271563053131104, "learning_rate": 4.25e-05, "loss": 0.2828, "mean_token_accuracy": 0.9135244160890579, "num_tokens": 338575.0, "step": 380 }, { "epoch": 2.4375, "grad_norm": 2.033243417739868, "learning_rate": 3.8333333333333334e-05, "loss": 0.2644, "mean_token_accuracy": 0.9237385660409927, "num_tokens": 347842.0, "step": 390 }, { "epoch": 2.5, "grad_norm": 2.177577018737793, "learning_rate": 3.4166666666666666e-05, "loss": 0.2666, "mean_token_accuracy": 0.9173466831445694, "num_tokens": 356622.0, "step": 400 }, { "epoch": 2.5625, "grad_norm": 1.9091131687164307, "learning_rate": 3e-05, "loss": 0.2547, "mean_token_accuracy": 0.9216690093278885, "num_tokens": 365430.0, "step": 410 }, { "epoch": 2.625, "grad_norm": 1.8569989204406738, "learning_rate": 2.5833333333333336e-05, "loss": 0.2641, "mean_token_accuracy": 0.920384281873703, "num_tokens": 374989.0, "step": 420 }, { "epoch": 2.6875, "grad_norm": 2.433670997619629, "learning_rate": 2.2083333333333333e-05, "loss": 0.2929, "mean_token_accuracy": 0.9130927324295044, "num_tokens": 383763.0, "step": 430 }, { "epoch": 2.75, "grad_norm": 1.7513527870178223, "learning_rate": 1.7916666666666667e-05, "loss": 0.265, "mean_token_accuracy": 0.9221676260232925, "num_tokens": 392872.0, "step": 440 }, { "epoch": 2.8125, "grad_norm": 2.320101737976074, "learning_rate": 1.3750000000000002e-05, "loss": 0.288, "mean_token_accuracy": 0.915841493010521, "num_tokens": 401207.0, "step": 450 }, { "epoch": 2.875, "grad_norm": 1.8214130401611328, "learning_rate": 9.583333333333334e-06, "loss": 0.2768, "mean_token_accuracy": 0.9163581758737565, "num_tokens": 410106.0, "step": 460 }, { "epoch": 2.9375, "grad_norm": 2.07955265045166, "learning_rate": 5.416666666666667e-06, "loss": 0.2594, "mean_token_accuracy": 0.924983486533165, "num_tokens": 419029.0, "step": 470 }, { "epoch": 3.0, "grad_norm": 2.6197588443756104, "learning_rate": 1.25e-06, "loss": 0.2769, "mean_token_accuracy": 0.9173476547002792, "num_tokens": 427923.0, "step": 480 } ], "logging_steps": 10, "max_steps": 480, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.081552109551616e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }