{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.029098898061104047, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002909889806110405, "grad_norm": 3.3640072345733643, "learning_rate": 0.0002, "loss": 1.2361, "step": 10 }, { "epoch": 0.000581977961222081, "grad_norm": 0.2830420136451721, "learning_rate": 0.0002, "loss": 0.1232, "step": 20 }, { "epoch": 0.0008729669418331215, "grad_norm": 0.3788599669933319, "learning_rate": 0.0002, "loss": 0.0874, "step": 30 }, { "epoch": 0.001163955922444162, "grad_norm": 0.9566423892974854, "learning_rate": 0.0002, "loss": 0.1099, "step": 40 }, { "epoch": 0.0014549449030552023, "grad_norm": 0.5953503847122192, "learning_rate": 0.0002, "loss": 0.0727, "step": 50 }, { "epoch": 0.001745933883666243, "grad_norm": 0.3039131164550781, "learning_rate": 0.0002, "loss": 0.0575, "step": 60 }, { "epoch": 0.0020369228642772835, "grad_norm": 0.27436479926109314, "learning_rate": 0.0002, "loss": 0.0511, "step": 70 }, { "epoch": 0.002327911844888324, "grad_norm": 0.22409753501415253, "learning_rate": 0.0002, "loss": 0.0495, "step": 80 }, { "epoch": 0.0026189008254993646, "grad_norm": 0.5258105397224426, "learning_rate": 0.0002, "loss": 0.0447, "step": 90 }, { "epoch": 0.0029098898061104047, "grad_norm": 0.3420485854148865, "learning_rate": 0.0002, "loss": 0.0447, "step": 100 }, { "epoch": 0.0032008787867214453, "grad_norm": 0.18883126974105835, "learning_rate": 0.0002, "loss": 0.0437, "step": 110 }, { "epoch": 0.003491867767332486, "grad_norm": 0.09849688410758972, "learning_rate": 0.0002, "loss": 0.0428, "step": 120 }, { "epoch": 0.0037828567479435264, "grad_norm": 0.5861080884933472, "learning_rate": 0.0002, "loss": 0.0416, "step": 130 }, { "epoch": 0.004073845728554567, "grad_norm": 0.8478333353996277, "learning_rate": 0.0002, "loss": 0.0415, "step": 140 }, { "epoch": 0.004364834709165607, "grad_norm": 0.6563957333564758, "learning_rate": 0.0002, "loss": 0.0416, "step": 150 }, { "epoch": 0.004655823689776648, "grad_norm": 0.1496465653181076, "learning_rate": 0.0002, "loss": 0.04, "step": 160 }, { "epoch": 0.004946812670387688, "grad_norm": 0.7356327176094055, "learning_rate": 0.0002, "loss": 0.0406, "step": 170 }, { "epoch": 0.005237801650998729, "grad_norm": 0.5485235452651978, "learning_rate": 0.0002, "loss": 0.0415, "step": 180 }, { "epoch": 0.005528790631609769, "grad_norm": 0.28617948293685913, "learning_rate": 0.0002, "loss": 0.04, "step": 190 }, { "epoch": 0.005819779612220809, "grad_norm": 0.7351231575012207, "learning_rate": 0.0002, "loss": 0.0462, "step": 200 }, { "epoch": 0.00611076859283185, "grad_norm": 0.6597175598144531, "learning_rate": 0.0002, "loss": 0.042, "step": 210 }, { "epoch": 0.0064017575734428905, "grad_norm": 0.5418401956558228, "learning_rate": 0.0002, "loss": 0.0425, "step": 220 }, { "epoch": 0.0066927465540539315, "grad_norm": 0.4611615836620331, "learning_rate": 0.0002, "loss": 0.0409, "step": 230 }, { "epoch": 0.006983735534664972, "grad_norm": 0.039530955255031586, "learning_rate": 0.0002, "loss": 0.0414, "step": 240 }, { "epoch": 0.007274724515276012, "grad_norm": 0.03446557745337486, "learning_rate": 0.0002, "loss": 0.0393, "step": 250 }, { "epoch": 0.007565713495887053, "grad_norm": 0.7747415900230408, "learning_rate": 0.0002, "loss": 0.0419, "step": 260 }, { "epoch": 0.007856702476498093, "grad_norm": 0.3428023159503937, "learning_rate": 0.0002, "loss": 0.0423, "step": 270 }, { "epoch": 0.008147691457109134, "grad_norm": 0.2668132781982422, "learning_rate": 0.0002, "loss": 0.0404, "step": 280 }, { "epoch": 0.008438680437720175, "grad_norm": 0.06787655502557755, "learning_rate": 0.0002, "loss": 0.0373, "step": 290 }, { "epoch": 0.008729669418331214, "grad_norm": 0.17109806835651398, "learning_rate": 0.0002, "loss": 0.0371, "step": 300 }, { "epoch": 0.009020658398942255, "grad_norm": 0.1489395946264267, "learning_rate": 0.0002, "loss": 0.0377, "step": 310 }, { "epoch": 0.009311647379553296, "grad_norm": 0.20444560050964355, "learning_rate": 0.0002, "loss": 0.0385, "step": 320 }, { "epoch": 0.009602636360164335, "grad_norm": 1.251767873764038, "learning_rate": 0.0002, "loss": 0.0395, "step": 330 }, { "epoch": 0.009893625340775376, "grad_norm": 0.17964421212673187, "learning_rate": 0.0002, "loss": 0.0403, "step": 340 }, { "epoch": 0.010184614321386417, "grad_norm": 0.20001742243766785, "learning_rate": 0.0002, "loss": 0.0374, "step": 350 }, { "epoch": 0.010475603301997458, "grad_norm": 0.07885689288377762, "learning_rate": 0.0002, "loss": 0.0366, "step": 360 }, { "epoch": 0.010766592282608498, "grad_norm": 0.1755530834197998, "learning_rate": 0.0002, "loss": 0.0366, "step": 370 }, { "epoch": 0.011057581263219539, "grad_norm": 0.24720287322998047, "learning_rate": 0.0002, "loss": 0.036, "step": 380 }, { "epoch": 0.01134857024383058, "grad_norm": 0.13627253472805023, "learning_rate": 0.0002, "loss": 0.0384, "step": 390 }, { "epoch": 0.011639559224441619, "grad_norm": 0.10238471627235413, "learning_rate": 0.0002, "loss": 0.037, "step": 400 }, { "epoch": 0.01193054820505266, "grad_norm": 0.1790493279695511, "learning_rate": 0.0002, "loss": 0.036, "step": 410 }, { "epoch": 0.0122215371856637, "grad_norm": 0.8142262101173401, "learning_rate": 0.0002, "loss": 0.0366, "step": 420 }, { "epoch": 0.012512526166274742, "grad_norm": 0.3142533600330353, "learning_rate": 0.0002, "loss": 0.0373, "step": 430 }, { "epoch": 0.012803515146885781, "grad_norm": 0.5743248462677002, "learning_rate": 0.0002, "loss": 0.0364, "step": 440 }, { "epoch": 0.013094504127496822, "grad_norm": 0.10562433302402496, "learning_rate": 0.0002, "loss": 0.0357, "step": 450 }, { "epoch": 0.013385493108107863, "grad_norm": 0.1285354048013687, "learning_rate": 0.0002, "loss": 0.0348, "step": 460 }, { "epoch": 0.013676482088718902, "grad_norm": 0.31671035289764404, "learning_rate": 0.0002, "loss": 0.0367, "step": 470 }, { "epoch": 0.013967471069329943, "grad_norm": 0.10281776636838913, "learning_rate": 0.0002, "loss": 0.0367, "step": 480 }, { "epoch": 0.014258460049940984, "grad_norm": 0.10469332337379456, "learning_rate": 0.0002, "loss": 0.0354, "step": 490 }, { "epoch": 0.014549449030552023, "grad_norm": 0.04262165352702141, "learning_rate": 0.0002, "loss": 0.0358, "step": 500 }, { "epoch": 0.014840438011163064, "grad_norm": 0.08256979286670685, "learning_rate": 0.0002, "loss": 0.0352, "step": 510 }, { "epoch": 0.015131426991774105, "grad_norm": 0.2147534042596817, "learning_rate": 0.0002, "loss": 0.0352, "step": 520 }, { "epoch": 0.015422415972385146, "grad_norm": 0.7227026224136353, "learning_rate": 0.0002, "loss": 0.0367, "step": 530 }, { "epoch": 0.015713404952996186, "grad_norm": 0.3186182975769043, "learning_rate": 0.0002, "loss": 0.0366, "step": 540 }, { "epoch": 0.01600439393360723, "grad_norm": 0.5129309296607971, "learning_rate": 0.0002, "loss": 0.036, "step": 550 }, { "epoch": 0.016295382914218268, "grad_norm": 0.3547574281692505, "learning_rate": 0.0002, "loss": 0.0363, "step": 560 }, { "epoch": 0.016586371894829307, "grad_norm": 0.29844892024993896, "learning_rate": 0.0002, "loss": 0.0369, "step": 570 }, { "epoch": 0.01687736087544035, "grad_norm": 0.25678157806396484, "learning_rate": 0.0002, "loss": 0.0352, "step": 580 }, { "epoch": 0.01716834985605139, "grad_norm": 0.07419384270906448, "learning_rate": 0.0002, "loss": 0.0344, "step": 590 }, { "epoch": 0.017459338836662428, "grad_norm": 0.15620607137680054, "learning_rate": 0.0002, "loss": 0.0343, "step": 600 }, { "epoch": 0.01775032781727347, "grad_norm": 0.17426913976669312, "learning_rate": 0.0002, "loss": 0.0326, "step": 610 }, { "epoch": 0.01804131679788451, "grad_norm": 0.18652600049972534, "learning_rate": 0.0002, "loss": 0.0358, "step": 620 }, { "epoch": 0.01833230577849555, "grad_norm": 0.5866808295249939, "learning_rate": 0.0002, "loss": 0.0364, "step": 630 }, { "epoch": 0.018623294759106592, "grad_norm": 0.5470107793807983, "learning_rate": 0.0002, "loss": 0.0385, "step": 640 }, { "epoch": 0.01891428373971763, "grad_norm": 0.4430047273635864, "learning_rate": 0.0002, "loss": 0.0355, "step": 650 }, { "epoch": 0.01920527272032867, "grad_norm": 0.44153594970703125, "learning_rate": 0.0002, "loss": 0.0374, "step": 660 }, { "epoch": 0.019496261700939713, "grad_norm": 0.2025349885225296, "learning_rate": 0.0002, "loss": 0.0361, "step": 670 }, { "epoch": 0.019787250681550753, "grad_norm": 0.05006701499223709, "learning_rate": 0.0002, "loss": 0.0352, "step": 680 }, { "epoch": 0.020078239662161792, "grad_norm": 0.15291444957256317, "learning_rate": 0.0002, "loss": 0.0339, "step": 690 }, { "epoch": 0.020369228642772835, "grad_norm": 0.20080982148647308, "learning_rate": 0.0002, "loss": 0.034, "step": 700 }, { "epoch": 0.020660217623383874, "grad_norm": 0.1934683471918106, "learning_rate": 0.0002, "loss": 0.0338, "step": 710 }, { "epoch": 0.020951206603994917, "grad_norm": 0.2093890905380249, "learning_rate": 0.0002, "loss": 0.033, "step": 720 }, { "epoch": 0.021242195584605956, "grad_norm": 0.17411717772483826, "learning_rate": 0.0002, "loss": 0.0329, "step": 730 }, { "epoch": 0.021533184565216995, "grad_norm": 0.06554729491472244, "learning_rate": 0.0002, "loss": 0.0328, "step": 740 }, { "epoch": 0.021824173545828038, "grad_norm": 0.3035508692264557, "learning_rate": 0.0002, "loss": 0.0345, "step": 750 }, { "epoch": 0.022115162526439077, "grad_norm": 0.1284075379371643, "learning_rate": 0.0002, "loss": 0.0344, "step": 760 }, { "epoch": 0.022406151507050116, "grad_norm": 0.06972914189100266, "learning_rate": 0.0002, "loss": 0.0326, "step": 770 }, { "epoch": 0.02269714048766116, "grad_norm": 0.2625221908092499, "learning_rate": 0.0002, "loss": 0.0343, "step": 780 }, { "epoch": 0.0229881294682722, "grad_norm": 0.2056276947259903, "learning_rate": 0.0002, "loss": 0.0341, "step": 790 }, { "epoch": 0.023279118448883238, "grad_norm": 0.06602438539266586, "learning_rate": 0.0002, "loss": 0.0331, "step": 800 }, { "epoch": 0.02357010742949428, "grad_norm": 0.1302807331085205, "learning_rate": 0.0002, "loss": 0.0328, "step": 810 }, { "epoch": 0.02386109641010532, "grad_norm": 0.07038327306509018, "learning_rate": 0.0002, "loss": 0.0338, "step": 820 }, { "epoch": 0.02415208539071636, "grad_norm": 0.3151911199092865, "learning_rate": 0.0002, "loss": 0.0353, "step": 830 }, { "epoch": 0.0244430743713274, "grad_norm": 0.2942112982273102, "learning_rate": 0.0002, "loss": 0.0336, "step": 840 }, { "epoch": 0.02473406335193844, "grad_norm": 0.09775586426258087, "learning_rate": 0.0002, "loss": 0.0331, "step": 850 }, { "epoch": 0.025025052332549483, "grad_norm": 0.06825686991214752, "learning_rate": 0.0002, "loss": 0.0338, "step": 860 }, { "epoch": 0.025316041313160523, "grad_norm": 0.08698020130395889, "learning_rate": 0.0002, "loss": 0.0329, "step": 870 }, { "epoch": 0.025607030293771562, "grad_norm": 0.29394668340682983, "learning_rate": 0.0002, "loss": 0.0328, "step": 880 }, { "epoch": 0.025898019274382605, "grad_norm": 0.2138691395521164, "learning_rate": 0.0002, "loss": 0.0327, "step": 890 }, { "epoch": 0.026189008254993644, "grad_norm": 0.22760023176670074, "learning_rate": 0.0002, "loss": 0.0349, "step": 900 }, { "epoch": 0.026479997235604683, "grad_norm": 0.07050047069787979, "learning_rate": 0.0002, "loss": 0.0327, "step": 910 }, { "epoch": 0.026770986216215726, "grad_norm": 0.0632275640964508, "learning_rate": 0.0002, "loss": 0.0332, "step": 920 }, { "epoch": 0.027061975196826765, "grad_norm": 0.2537945508956909, "learning_rate": 0.0002, "loss": 0.0334, "step": 930 }, { "epoch": 0.027352964177437804, "grad_norm": 0.17872551083564758, "learning_rate": 0.0002, "loss": 0.0339, "step": 940 }, { "epoch": 0.027643953158048847, "grad_norm": 0.1240101158618927, "learning_rate": 0.0002, "loss": 0.0328, "step": 950 }, { "epoch": 0.027934942138659886, "grad_norm": 0.24408769607543945, "learning_rate": 0.0002, "loss": 0.0337, "step": 960 }, { "epoch": 0.028225931119270926, "grad_norm": 0.06075837463140488, "learning_rate": 0.0002, "loss": 0.0327, "step": 970 }, { "epoch": 0.02851692009988197, "grad_norm": 0.09202170372009277, "learning_rate": 0.0002, "loss": 0.0326, "step": 980 }, { "epoch": 0.028807909080493008, "grad_norm": 0.09207413345575333, "learning_rate": 0.0002, "loss": 0.0316, "step": 990 }, { "epoch": 0.029098898061104047, "grad_norm": 0.056632377207279205, "learning_rate": 0.0002, "loss": 0.0331, "step": 1000 } ], "logging_steps": 10, "max_steps": 40000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.102264300163564e+18, "train_batch_size": 96, "trial_name": null, "trial_params": null }