{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.364877161055505, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022747952684258416, "grad_norm": 18.962158203125, "learning_rate": 2.4616154873164223e-05, "loss": 2.8908, "step": 25 }, { "epoch": 0.04549590536851683, "grad_norm": 9.855137825012207, "learning_rate": 2.4198931909212282e-05, "loss": 2.5971, "step": 50 }, { "epoch": 0.06824385805277525, "grad_norm": 10.13984203338623, "learning_rate": 2.3781708945260348e-05, "loss": 2.529, "step": 75 }, { "epoch": 0.09099181073703366, "grad_norm": 8.192861557006836, "learning_rate": 2.3364485981308414e-05, "loss": 2.4313, "step": 100 }, { "epoch": 0.11373976342129208, "grad_norm": 9.359848976135254, "learning_rate": 2.2947263017356476e-05, "loss": 2.4519, "step": 125 }, { "epoch": 0.1364877161055505, "grad_norm": 9.675088882446289, "learning_rate": 2.2530040053404542e-05, "loss": 2.3522, "step": 150 }, { "epoch": 0.1592356687898089, "grad_norm": 7.448574066162109, "learning_rate": 2.2112817089452604e-05, "loss": 2.2756, "step": 175 }, { "epoch": 0.18198362147406733, "grad_norm": 8.285670280456543, "learning_rate": 2.1695594125500667e-05, "loss": 2.4835, "step": 200 }, { "epoch": 0.20473157415832574, "grad_norm": 10.987030982971191, "learning_rate": 2.1278371161548733e-05, "loss": 2.4054, "step": 225 }, { "epoch": 0.22747952684258416, "grad_norm": 9.691205978393555, "learning_rate": 2.08611481975968e-05, "loss": 2.3081, "step": 250 }, { "epoch": 0.2502274795268426, "grad_norm": 9.08532428741455, "learning_rate": 2.044392523364486e-05, "loss": 2.3223, "step": 275 }, { "epoch": 0.272975432211101, "grad_norm": 7.425185680389404, "learning_rate": 2.0026702269692923e-05, "loss": 2.388, "step": 300 }, { "epoch": 0.29572338489535943, "grad_norm": 8.487403869628906, "learning_rate": 1.960947930574099e-05, "loss": 2.2745, "step": 325 }, { "epoch": 0.3184713375796178, "grad_norm": 11.06141471862793, "learning_rate": 1.9192256341789052e-05, "loss": 2.2413, "step": 350 }, { "epoch": 0.34121929026387626, "grad_norm": 8.929823875427246, "learning_rate": 1.8775033377837118e-05, "loss": 2.2775, "step": 375 }, { "epoch": 0.36396724294813465, "grad_norm": 6.374183654785156, "learning_rate": 1.8357810413885184e-05, "loss": 2.2436, "step": 400 }, { "epoch": 0.3867151956323931, "grad_norm": 9.918680191040039, "learning_rate": 1.7940587449933243e-05, "loss": 2.2926, "step": 425 }, { "epoch": 0.4094631483166515, "grad_norm": 6.771628379821777, "learning_rate": 1.752336448598131e-05, "loss": 2.2754, "step": 450 }, { "epoch": 0.4322111010009099, "grad_norm": 9.636200904846191, "learning_rate": 1.7106141522029374e-05, "loss": 2.4094, "step": 475 }, { "epoch": 0.4549590536851683, "grad_norm": 10.16539478302002, "learning_rate": 1.6688918558077437e-05, "loss": 2.3422, "step": 500 }, { "epoch": 0.4549590536851683, "eval_loss": 2.135864496231079, "eval_runtime": 18.7379, "eval_samples_per_second": 5.87, "eval_steps_per_second": 0.747, "step": 500 }, { "epoch": 0.47770700636942676, "grad_norm": 8.440470695495605, "learning_rate": 1.6271695594125503e-05, "loss": 2.3016, "step": 525 }, { "epoch": 0.5004549590536852, "grad_norm": 10.333203315734863, "learning_rate": 1.585447263017357e-05, "loss": 2.2732, "step": 550 }, { "epoch": 0.5232029117379435, "grad_norm": 8.159846305847168, "learning_rate": 1.5437249666221628e-05, "loss": 2.1742, "step": 575 }, { "epoch": 0.545950864422202, "grad_norm": 7.440310955047607, "learning_rate": 1.5020026702269693e-05, "loss": 2.3123, "step": 600 }, { "epoch": 0.5686988171064604, "grad_norm": 7.5925374031066895, "learning_rate": 1.4602803738317758e-05, "loss": 2.262, "step": 625 }, { "epoch": 0.5914467697907189, "grad_norm": 7.50217342376709, "learning_rate": 1.4185580774365822e-05, "loss": 2.1893, "step": 650 }, { "epoch": 0.6141947224749773, "grad_norm": 7.763234615325928, "learning_rate": 1.3768357810413888e-05, "loss": 2.2997, "step": 675 }, { "epoch": 0.6369426751592356, "grad_norm": 8.378331184387207, "learning_rate": 1.3351134846461948e-05, "loss": 2.1408, "step": 700 }, { "epoch": 0.6596906278434941, "grad_norm": 9.364808082580566, "learning_rate": 1.2933911882510014e-05, "loss": 2.3418, "step": 725 }, { "epoch": 0.6824385805277525, "grad_norm": 7.610098838806152, "learning_rate": 1.2516688918558078e-05, "loss": 2.2467, "step": 750 }, { "epoch": 0.705186533212011, "grad_norm": 7.710592746734619, "learning_rate": 1.2099465954606141e-05, "loss": 2.3844, "step": 775 }, { "epoch": 0.7279344858962693, "grad_norm": 7.574198246002197, "learning_rate": 1.1682242990654207e-05, "loss": 2.3413, "step": 800 }, { "epoch": 0.7506824385805277, "grad_norm": 10.453964233398438, "learning_rate": 1.1265020026702271e-05, "loss": 2.3048, "step": 825 }, { "epoch": 0.7734303912647862, "grad_norm": 8.253402709960938, "learning_rate": 1.0847797062750333e-05, "loss": 2.2046, "step": 850 }, { "epoch": 0.7961783439490446, "grad_norm": 8.736198425292969, "learning_rate": 1.04305740987984e-05, "loss": 2.2299, "step": 875 }, { "epoch": 0.818926296633303, "grad_norm": 8.582794189453125, "learning_rate": 1.0013351134846462e-05, "loss": 2.2477, "step": 900 }, { "epoch": 0.8416742493175614, "grad_norm": 8.511619567871094, "learning_rate": 9.596128170894526e-06, "loss": 2.1604, "step": 925 }, { "epoch": 0.8644222020018199, "grad_norm": 7.394681930541992, "learning_rate": 9.178905206942592e-06, "loss": 2.2107, "step": 950 }, { "epoch": 0.8871701546860783, "grad_norm": 9.960970878601074, "learning_rate": 8.761682242990654e-06, "loss": 2.2325, "step": 975 }, { "epoch": 0.9099181073703366, "grad_norm": 8.213104248046875, "learning_rate": 8.344459279038718e-06, "loss": 2.3189, "step": 1000 }, { "epoch": 0.9099181073703366, "eval_loss": 1.8801201581954956, "eval_runtime": 18.4977, "eval_samples_per_second": 5.947, "eval_steps_per_second": 0.757, "step": 1000 }, { "epoch": 0.9326660600545951, "grad_norm": 7.146160125732422, "learning_rate": 7.927236315086784e-06, "loss": 2.2208, "step": 1025 }, { "epoch": 0.9554140127388535, "grad_norm": 10.675107955932617, "learning_rate": 7.510013351134847e-06, "loss": 2.3002, "step": 1050 }, { "epoch": 0.978161965423112, "grad_norm": 7.811581611633301, "learning_rate": 7.092790387182911e-06, "loss": 2.2749, "step": 1075 }, { "epoch": 1.0009099181073704, "grad_norm": 5.786709785461426, "learning_rate": 6.675567423230974e-06, "loss": 2.1707, "step": 1100 }, { "epoch": 1.0236578707916288, "grad_norm": 8.969910621643066, "learning_rate": 6.258344459279039e-06, "loss": 1.7638, "step": 1125 }, { "epoch": 1.046405823475887, "grad_norm": 7.258237361907959, "learning_rate": 5.841121495327103e-06, "loss": 1.8112, "step": 1150 }, { "epoch": 1.0691537761601455, "grad_norm": 8.390810012817383, "learning_rate": 5.423898531375167e-06, "loss": 1.7777, "step": 1175 }, { "epoch": 1.091901728844404, "grad_norm": 7.946865081787109, "learning_rate": 5.006675567423231e-06, "loss": 1.7933, "step": 1200 }, { "epoch": 1.1146496815286624, "grad_norm": 8.631255149841309, "learning_rate": 4.589452603471296e-06, "loss": 1.7064, "step": 1225 }, { "epoch": 1.1373976342129208, "grad_norm": 11.77323055267334, "learning_rate": 4.172229639519359e-06, "loss": 1.8455, "step": 1250 }, { "epoch": 1.1601455868971793, "grad_norm": 8.695523262023926, "learning_rate": 3.7550066755674234e-06, "loss": 1.7634, "step": 1275 }, { "epoch": 1.1828935395814377, "grad_norm": 8.56210708618164, "learning_rate": 3.337783711615487e-06, "loss": 1.7213, "step": 1300 }, { "epoch": 1.2056414922656962, "grad_norm": 9.016804695129395, "learning_rate": 2.9205607476635517e-06, "loss": 1.7588, "step": 1325 }, { "epoch": 1.2283894449499546, "grad_norm": 10.927498817443848, "learning_rate": 2.5033377837116154e-06, "loss": 1.8489, "step": 1350 }, { "epoch": 1.251137397634213, "grad_norm": 8.833426475524902, "learning_rate": 2.0861148197596796e-06, "loss": 1.764, "step": 1375 }, { "epoch": 1.2738853503184713, "grad_norm": 7.1187262535095215, "learning_rate": 1.6688918558077436e-06, "loss": 1.8312, "step": 1400 }, { "epoch": 1.2966333030027297, "grad_norm": 13.539063453674316, "learning_rate": 1.2516688918558077e-06, "loss": 1.6446, "step": 1425 }, { "epoch": 1.3193812556869882, "grad_norm": 9.34390640258789, "learning_rate": 8.344459279038718e-07, "loss": 1.812, "step": 1450 }, { "epoch": 1.3421292083712466, "grad_norm": 11.918546676635742, "learning_rate": 4.172229639519359e-07, "loss": 1.8513, "step": 1475 }, { "epoch": 1.364877161055505, "grad_norm": 10.514022827148438, "learning_rate": 0.0, "loss": 1.7566, "step": 1500 }, { "epoch": 1.364877161055505, "eval_loss": 1.6919403076171875, "eval_runtime": 18.5198, "eval_samples_per_second": 5.94, "eval_steps_per_second": 0.756, "step": 1500 } ], "logging_steps": 25, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.316134264491213e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }