{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 3125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0032,
      "grad_norm": 10.0,
      "learning_rate": 6.369426751592357e-06,
      "loss": 24.9631,
      "step": 10
    },
    {
      "epoch": 0.0064,
      "grad_norm": 10.0,
      "learning_rate": 1.2738853503184714e-05,
      "loss": 20.8416,
      "step": 20
    },
    {
      "epoch": 0.0096,
      "grad_norm": 9.999999046325684,
      "learning_rate": 1.910828025477707e-05,
      "loss": 18.8244,
      "step": 30
    },
    {
      "epoch": 0.0128,
      "grad_norm": 10.0,
      "learning_rate": 2.5477707006369428e-05,
      "loss": 20.6664,
      "step": 40
    },
    {
      "epoch": 0.016,
      "grad_norm": 9.999999046325684,
      "learning_rate": 3.184713375796178e-05,
      "loss": 17.5711,
      "step": 50
    },
    {
      "epoch": 0.0192,
      "grad_norm": 10.0,
      "learning_rate": 3.821656050955414e-05,
      "loss": 15.9775,
      "step": 60
    },
    {
      "epoch": 0.0224,
      "grad_norm": 10.0,
      "learning_rate": 4.45859872611465e-05,
      "loss": 15.0004,
      "step": 70
    },
    {
      "epoch": 0.0256,
      "grad_norm": 9.999999046325684,
      "learning_rate": 5.0955414012738855e-05,
      "loss": 13.074,
      "step": 80
    },
    {
      "epoch": 0.0288,
      "grad_norm": 9.999998092651367,
      "learning_rate": 5.732484076433121e-05,
      "loss": 10.9575,
      "step": 90
    },
    {
      "epoch": 0.032,
      "grad_norm": 9.999998092651367,
      "learning_rate": 6.369426751592356e-05,
      "loss": 8.7852,
      "step": 100
    },
    {
      "epoch": 0.0352,
      "grad_norm": 9.999998092651367,
      "learning_rate": 7.006369426751592e-05,
      "loss": 8.9422,
      "step": 110
    },
    {
      "epoch": 0.0384,
      "grad_norm": 10.0,
      "learning_rate": 7.643312101910829e-05,
      "loss": 6.8078,
      "step": 120
    },
    {
      "epoch": 0.0416,
      "grad_norm": 10.0,
      "learning_rate": 8.280254777070065e-05,
      "loss": 6.0916,
      "step": 130
    },
    {
      "epoch": 0.0448,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.9171974522293e-05,
      "loss": 5.8139,
      "step": 140
    },
    {
      "epoch": 0.048,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.554140127388536e-05,
      "loss": 5.7631,
      "step": 150
    },
    {
      "epoch": 0.0512,
      "grad_norm": 9.999998092651367,
      "learning_rate": 9.989892183288411e-05,
      "loss": 4.6852,
      "step": 160
    },
    {
      "epoch": 0.0544,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.956199460916442e-05,
      "loss": 4.6054,
      "step": 170
    },
    {
      "epoch": 0.0576,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.922506738544474e-05,
      "loss": 4.0257,
      "step": 180
    },
    {
      "epoch": 0.0608,
      "grad_norm": 10.0,
      "learning_rate": 9.888814016172507e-05,
      "loss": 4.1777,
      "step": 190
    },
    {
      "epoch": 0.064,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.85512129380054e-05,
      "loss": 4.0922,
      "step": 200
    },
    {
      "epoch": 0.0672,
      "grad_norm": 9.999998092651367,
      "learning_rate": 9.821428571428572e-05,
      "loss": 3.5944,
      "step": 210
    },
    {
      "epoch": 0.0704,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.787735849056603e-05,
      "loss": 3.5933,
      "step": 220
    },
    {
      "epoch": 0.0736,
      "grad_norm": 10.0,
      "learning_rate": 9.754043126684636e-05,
      "loss": 3.684,
      "step": 230
    },
    {
      "epoch": 0.0768,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.720350404312669e-05,
      "loss": 3.1428,
      "step": 240
    },
    {
      "epoch": 0.08,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.686657681940702e-05,
      "loss": 2.9727,
      "step": 250
    },
    {
      "epoch": 0.0832,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.652964959568734e-05,
      "loss": 3.2243,
      "step": 260
    },
    {
      "epoch": 0.0864,
      "grad_norm": 9.463090896606445,
      "learning_rate": 9.619272237196765e-05,
      "loss": 2.9269,
      "step": 270
    },
    {
      "epoch": 0.0896,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.585579514824798e-05,
      "loss": 2.955,
      "step": 280
    },
    {
      "epoch": 0.0928,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.551886792452831e-05,
      "loss": 2.5544,
      "step": 290
    },
    {
      "epoch": 0.096,
      "grad_norm": 10.0,
      "learning_rate": 9.518194070080863e-05,
      "loss": 2.7417,
      "step": 300
    },
    {
      "epoch": 0.0992,
      "grad_norm": 9.999998092651367,
      "learning_rate": 9.484501347708896e-05,
      "loss": 2.569,
      "step": 310
    },
    {
      "epoch": 0.1024,
      "grad_norm": 9.225275039672852,
      "learning_rate": 9.450808625336927e-05,
      "loss": 2.3499,
      "step": 320
    },
    {
      "epoch": 0.1056,
      "grad_norm": 9.999998092651367,
      "learning_rate": 9.41711590296496e-05,
      "loss": 2.6455,
      "step": 330
    },
    {
      "epoch": 0.1088,
      "grad_norm": 9.915996551513672,
      "learning_rate": 9.383423180592993e-05,
      "loss": 2.3627,
      "step": 340
    },
    {
      "epoch": 0.112,
      "grad_norm": 9.526017189025879,
      "learning_rate": 9.349730458221025e-05,
      "loss": 2.3267,
      "step": 350
    },
    {
      "epoch": 0.1152,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.316037735849057e-05,
      "loss": 2.1942,
      "step": 360
    },
    {
      "epoch": 0.1184,
      "grad_norm": 10.0,
      "learning_rate": 9.282345013477089e-05,
      "loss": 2.2241,
      "step": 370
    },
    {
      "epoch": 0.1216,
      "grad_norm": 9.999998092651367,
      "learning_rate": 9.248652291105122e-05,
      "loss": 2.1867,
      "step": 380
    },
    {
      "epoch": 0.1248,
      "grad_norm": 6.984477996826172,
      "learning_rate": 9.214959568733154e-05,
      "loss": 2.1133,
      "step": 390
    },
    {
      "epoch": 0.128,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.181266846361186e-05,
      "loss": 2.1515,
      "step": 400
    },
    {
      "epoch": 0.1312,
      "grad_norm": 9.388639450073242,
      "learning_rate": 9.14757412398922e-05,
      "loss": 2.191,
      "step": 410
    },
    {
      "epoch": 0.1344,
      "grad_norm": 6.955385684967041,
      "learning_rate": 9.113881401617251e-05,
      "loss": 2.05,
      "step": 420
    },
    {
      "epoch": 0.1376,
      "grad_norm": 8.776995658874512,
      "learning_rate": 9.080188679245284e-05,
      "loss": 1.9369,
      "step": 430
    },
    {
      "epoch": 0.1408,
      "grad_norm": 9.813057899475098,
      "learning_rate": 9.046495956873315e-05,
      "loss": 1.8571,
      "step": 440
    },
    {
      "epoch": 0.144,
      "grad_norm": 9.999998092651367,
      "learning_rate": 9.012803234501348e-05,
      "loss": 1.8026,
      "step": 450
    },
    {
      "epoch": 0.1472,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.979110512129381e-05,
      "loss": 1.6484,
      "step": 460
    },
    {
      "epoch": 0.1504,
      "grad_norm": 6.5526018142700195,
      "learning_rate": 8.945417789757413e-05,
      "loss": 1.601,
      "step": 470
    },
    {
      "epoch": 0.1536,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.911725067385444e-05,
      "loss": 1.5064,
      "step": 480
    },
    {
      "epoch": 0.1568,
      "grad_norm": 7.3129496574401855,
      "learning_rate": 8.878032345013477e-05,
      "loss": 1.6099,
      "step": 490
    },
    {
      "epoch": 0.16,
      "grad_norm": 8.732329368591309,
      "learning_rate": 8.84433962264151e-05,
      "loss": 1.5446,
      "step": 500
    },
    {
      "epoch": 0.1632,
      "grad_norm": 6.793776512145996,
      "learning_rate": 8.810646900269543e-05,
      "loss": 1.3392,
      "step": 510
    },
    {
      "epoch": 0.1664,
      "grad_norm": 8.43354606628418,
      "learning_rate": 8.776954177897575e-05,
      "loss": 1.3408,
      "step": 520
    },
    {
      "epoch": 0.1696,
      "grad_norm": 7.959011077880859,
      "learning_rate": 8.743261455525606e-05,
      "loss": 1.3075,
      "step": 530
    },
    {
      "epoch": 0.1728,
      "grad_norm": 10.0,
      "learning_rate": 8.709568733153639e-05,
      "loss": 1.4838,
      "step": 540
    },
    {
      "epoch": 0.176,
      "grad_norm": 8.968912124633789,
      "learning_rate": 8.675876010781672e-05,
      "loss": 1.3092,
      "step": 550
    },
    {
      "epoch": 0.1792,
      "grad_norm": 10.0,
      "learning_rate": 8.642183288409704e-05,
      "loss": 1.3005,
      "step": 560
    },
    {
      "epoch": 0.1824,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.608490566037735e-05,
      "loss": 1.1666,
      "step": 570
    },
    {
      "epoch": 0.1856,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.574797843665768e-05,
      "loss": 1.292,
      "step": 580
    },
    {
      "epoch": 0.1888,
      "grad_norm": 9.630219459533691,
      "learning_rate": 8.541105121293801e-05,
      "loss": 1.2324,
      "step": 590
    },
    {
      "epoch": 0.192,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.507412398921834e-05,
      "loss": 1.2701,
      "step": 600
    },
    {
      "epoch": 0.1952,
      "grad_norm": 10.0,
      "learning_rate": 8.473719676549866e-05,
      "loss": 1.1545,
      "step": 610
    },
    {
      "epoch": 0.1984,
      "grad_norm": 8.581560134887695,
      "learning_rate": 8.440026954177897e-05,
      "loss": 1.0769,
      "step": 620
    },
    {
      "epoch": 0.2016,
      "grad_norm": 9.464119911193848,
      "learning_rate": 8.40633423180593e-05,
      "loss": 1.1256,
      "step": 630
    },
    {
      "epoch": 0.2048,
      "grad_norm": 6.58554744720459,
      "learning_rate": 8.372641509433963e-05,
      "loss": 1.0227,
      "step": 640
    },
    {
      "epoch": 0.208,
      "grad_norm": 6.532876491546631,
      "learning_rate": 8.338948787061996e-05,
      "loss": 1.077,
      "step": 650
    },
    {
      "epoch": 0.2112,
      "grad_norm": 7.9927263259887695,
      "learning_rate": 8.305256064690027e-05,
      "loss": 1.0998,
      "step": 660
    },
    {
      "epoch": 0.2144,
      "grad_norm": 5.303004264831543,
      "learning_rate": 8.271563342318059e-05,
      "loss": 0.9585,
      "step": 670
    },
    {
      "epoch": 0.2176,
      "grad_norm": 6.559380531311035,
      "learning_rate": 8.237870619946092e-05,
      "loss": 0.8816,
      "step": 680
    },
    {
      "epoch": 0.2208,
      "grad_norm": 7.447219371795654,
      "learning_rate": 8.204177897574125e-05,
      "loss": 1.1047,
      "step": 690
    },
    {
      "epoch": 0.224,
      "grad_norm": 8.270160675048828,
      "learning_rate": 8.170485175202158e-05,
      "loss": 0.8914,
      "step": 700
    },
    {
      "epoch": 0.2272,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.136792452830189e-05,
      "loss": 0.9932,
      "step": 710
    },
    {
      "epoch": 0.2304,
      "grad_norm": 8.01986312866211,
      "learning_rate": 8.103099730458221e-05,
      "loss": 0.9273,
      "step": 720
    },
    {
      "epoch": 0.2336,
      "grad_norm": 5.2210917472839355,
      "learning_rate": 8.069407008086254e-05,
      "loss": 1.0231,
      "step": 730
    },
    {
      "epoch": 0.2368,
      "grad_norm": 5.701915740966797,
      "learning_rate": 8.035714285714287e-05,
      "loss": 0.8625,
      "step": 740
    },
    {
      "epoch": 0.24,
      "grad_norm": 8.181998252868652,
      "learning_rate": 8.002021563342318e-05,
      "loss": 0.9228,
      "step": 750
    },
    {
      "epoch": 0.2432,
      "grad_norm": 6.676185131072998,
      "learning_rate": 7.968328840970351e-05,
      "loss": 0.8675,
      "step": 760
    },
    {
      "epoch": 0.2464,
      "grad_norm": 6.564544677734375,
      "learning_rate": 7.934636118598383e-05,
      "loss": 0.9035,
      "step": 770
    },
    {
      "epoch": 0.2496,
      "grad_norm": 9.999998092651367,
      "learning_rate": 7.900943396226416e-05,
      "loss": 0.8019,
      "step": 780
    },
    {
      "epoch": 0.2528,
      "grad_norm": 9.999999046325684,
      "learning_rate": 7.867250673854449e-05,
      "loss": 0.8066,
      "step": 790
    },
    {
      "epoch": 0.256,
      "grad_norm": 10.0,
      "learning_rate": 7.83355795148248e-05,
      "loss": 0.8352,
      "step": 800
    },
    {
      "epoch": 0.2592,
      "grad_norm": 7.660287857055664,
      "learning_rate": 7.799865229110512e-05,
      "loss": 0.8574,
      "step": 810
    },
    {
      "epoch": 0.2624,
      "grad_norm": 4.782571792602539,
      "learning_rate": 7.766172506738545e-05,
      "loss": 0.8104,
      "step": 820
    },
    {
      "epoch": 0.2656,
      "grad_norm": 7.636229991912842,
      "learning_rate": 7.732479784366577e-05,
      "loss": 0.8074,
      "step": 830
    },
    {
      "epoch": 0.2688,
      "grad_norm": 5.22883939743042,
      "learning_rate": 7.69878706199461e-05,
      "loss": 0.7614,
      "step": 840
    },
    {
      "epoch": 0.272,
      "grad_norm": 6.009452819824219,
      "learning_rate": 7.665094339622642e-05,
      "loss": 0.7084,
      "step": 850
    },
    {
      "epoch": 0.2752,
      "grad_norm": 7.293655872344971,
      "learning_rate": 7.631401617250674e-05,
      "loss": 0.7718,
      "step": 860
    },
    {
      "epoch": 0.2784,
      "grad_norm": 5.065558433532715,
      "learning_rate": 7.597708894878706e-05,
      "loss": 0.7292,
      "step": 870
    },
    {
      "epoch": 0.2816,
      "grad_norm": 7.277278423309326,
      "learning_rate": 7.56401617250674e-05,
      "loss": 0.7474,
      "step": 880
    },
    {
      "epoch": 0.2848,
      "grad_norm": 9.6370210647583,
      "learning_rate": 7.530323450134771e-05,
      "loss": 0.7241,
      "step": 890
    },
    {
      "epoch": 0.288,
      "grad_norm": 7.019769191741943,
      "learning_rate": 7.496630727762804e-05,
      "loss": 0.6728,
      "step": 900
    },
    {
      "epoch": 0.2912,
      "grad_norm": 5.834097862243652,
      "learning_rate": 7.462938005390835e-05,
      "loss": 0.7054,
      "step": 910
    },
    {
      "epoch": 0.2944,
      "grad_norm": 9.999998092651367,
      "learning_rate": 7.429245283018868e-05,
      "loss": 0.7325,
      "step": 920
    },
    {
      "epoch": 0.2976,
      "grad_norm": 8.013876914978027,
      "learning_rate": 7.395552560646901e-05,
      "loss": 0.7168,
      "step": 930
    },
    {
      "epoch": 0.3008,
      "grad_norm": 5.732053756713867,
      "learning_rate": 7.361859838274933e-05,
      "loss": 0.5667,
      "step": 940
    },
    {
      "epoch": 0.304,
      "grad_norm": 4.955954551696777,
      "learning_rate": 7.328167115902966e-05,
      "loss": 0.6504,
      "step": 950
    },
    {
      "epoch": 0.3072,
      "grad_norm": 4.867282390594482,
      "learning_rate": 7.294474393530997e-05,
      "loss": 0.5968,
      "step": 960
    },
    {
      "epoch": 0.3104,
      "grad_norm": 6.085322856903076,
      "learning_rate": 7.26078167115903e-05,
      "loss": 0.6032,
      "step": 970
    },
    {
      "epoch": 0.3136,
      "grad_norm": 6.458901882171631,
      "learning_rate": 7.227088948787062e-05,
      "loss": 0.6642,
      "step": 980
    },
    {
      "epoch": 0.3168,
      "grad_norm": 8.118791580200195,
      "learning_rate": 7.193396226415095e-05,
      "loss": 0.6572,
      "step": 990
    },
    {
      "epoch": 0.32,
      "grad_norm": 6.958062171936035,
      "learning_rate": 7.159703504043128e-05,
      "loss": 0.6236,
      "step": 1000
    },
    {
      "epoch": 0.3232,
      "grad_norm": 4.629474639892578,
      "learning_rate": 7.126010781671159e-05,
      "loss": 0.5852,
      "step": 1010
    },
    {
      "epoch": 0.3264,
      "grad_norm": 5.775842189788818,
      "learning_rate": 7.092318059299192e-05,
      "loss": 0.6545,
      "step": 1020
    },
    {
      "epoch": 0.3296,
      "grad_norm": 5.303742408752441,
      "learning_rate": 7.058625336927224e-05,
      "loss": 0.5677,
      "step": 1030
    },
    {
      "epoch": 0.3328,
      "grad_norm": 6.655779838562012,
      "learning_rate": 7.024932614555257e-05,
      "loss": 0.5463,
      "step": 1040
    },
    {
      "epoch": 0.336,
      "grad_norm": 6.062790870666504,
      "learning_rate": 6.99123989218329e-05,
      "loss": 0.5669,
      "step": 1050
    },
    {
      "epoch": 0.3392,
      "grad_norm": 4.374266147613525,
      "learning_rate": 6.957547169811321e-05,
      "loss": 0.5475,
      "step": 1060
    },
    {
      "epoch": 0.3424,
      "grad_norm": 4.560005187988281,
      "learning_rate": 6.923854447439353e-05,
      "loss": 0.5624,
      "step": 1070
    },
    {
      "epoch": 0.3456,
      "grad_norm": 4.675742149353027,
      "learning_rate": 6.890161725067386e-05,
      "loss": 0.5455,
      "step": 1080
    },
    {
      "epoch": 0.3488,
      "grad_norm": 5.191764831542969,
      "learning_rate": 6.856469002695418e-05,
      "loss": 0.5485,
      "step": 1090
    },
    {
      "epoch": 0.352,
      "grad_norm": 6.050549030303955,
      "learning_rate": 6.822776280323451e-05,
      "loss": 0.5439,
      "step": 1100
    },
    {
      "epoch": 0.3552,
      "grad_norm": 7.021573543548584,
      "learning_rate": 6.789083557951483e-05,
      "loss": 0.4899,
      "step": 1110
    },
    {
      "epoch": 0.3584,
      "grad_norm": 5.938823699951172,
      "learning_rate": 6.755390835579514e-05,
      "loss": 0.499,
      "step": 1120
    },
    {
      "epoch": 0.3616,
      "grad_norm": 6.675471305847168,
      "learning_rate": 6.721698113207547e-05,
      "loss": 0.4657,
      "step": 1130
    },
    {
      "epoch": 0.3648,
      "grad_norm": 4.019260883331299,
      "learning_rate": 6.68800539083558e-05,
      "loss": 0.4819,
      "step": 1140
    },
    {
      "epoch": 0.368,
      "grad_norm": 6.338311195373535,
      "learning_rate": 6.654312668463612e-05,
      "loss": 0.4471,
      "step": 1150
    },
    {
      "epoch": 0.3712,
      "grad_norm": 5.028086185455322,
      "learning_rate": 6.620619946091643e-05,
      "loss": 0.4334,
      "step": 1160
    },
    {
      "epoch": 0.3744,
      "grad_norm": 3.6760733127593994,
      "learning_rate": 6.586927223719676e-05,
      "loss": 0.4569,
      "step": 1170
    },
    {
      "epoch": 0.3776,
      "grad_norm": 5.685121059417725,
      "learning_rate": 6.553234501347709e-05,
      "loss": 0.4314,
      "step": 1180
    },
    {
      "epoch": 0.3808,
      "grad_norm": 4.894414901733398,
      "learning_rate": 6.519541778975742e-05,
      "loss": 0.4527,
      "step": 1190
    },
    {
      "epoch": 0.384,
      "grad_norm": 6.624325275421143,
      "learning_rate": 6.485849056603774e-05,
      "loss": 0.3814,
      "step": 1200
    },
    {
      "epoch": 0.3872,
      "grad_norm": 7.109598636627197,
      "learning_rate": 6.452156334231805e-05,
      "loss": 0.4354,
      "step": 1210
    },
    {
      "epoch": 0.3904,
      "grad_norm": 5.322418689727783,
      "learning_rate": 6.418463611859838e-05,
      "loss": 0.4103,
      "step": 1220
    },
    {
      "epoch": 0.3936,
      "grad_norm": 4.165887832641602,
      "learning_rate": 6.384770889487871e-05,
      "loss": 0.414,
      "step": 1230
    },
    {
      "epoch": 0.3968,
      "grad_norm": 2.6662333011627197,
      "learning_rate": 6.351078167115904e-05,
      "loss": 0.4091,
      "step": 1240
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.60746431350708,
      "learning_rate": 6.317385444743936e-05,
      "loss": 0.434,
      "step": 1250
    },
    {
      "epoch": 0.4032,
      "grad_norm": 4.521622180938721,
      "learning_rate": 6.283692722371967e-05,
      "loss": 0.3858,
      "step": 1260
    },
    {
      "epoch": 0.4064,
      "grad_norm": 3.3457305431365967,
      "learning_rate": 6.25e-05,
      "loss": 0.3935,
      "step": 1270
    },
    {
      "epoch": 0.4096,
      "grad_norm": 3.885206699371338,
      "learning_rate": 6.216307277628033e-05,
      "loss": 0.4036,
      "step": 1280
    },
    {
      "epoch": 0.4128,
      "grad_norm": 3.958425998687744,
      "learning_rate": 6.182614555256066e-05,
      "loss": 0.3972,
      "step": 1290
    },
    {
      "epoch": 0.416,
      "grad_norm": 4.6747846603393555,
      "learning_rate": 6.148921832884098e-05,
      "loss": 0.359,
      "step": 1300
    },
    {
      "epoch": 0.4192,
      "grad_norm": 3.6438868045806885,
      "learning_rate": 6.115229110512129e-05,
      "loss": 0.3843,
      "step": 1310
    },
    {
      "epoch": 0.4224,
      "grad_norm": 4.175204753875732,
      "learning_rate": 6.081536388140162e-05,
      "loss": 0.3952,
      "step": 1320
    },
    {
      "epoch": 0.4256,
      "grad_norm": 2.771331548690796,
      "learning_rate": 6.047843665768195e-05,
      "loss": 0.363,
      "step": 1330
    },
    {
      "epoch": 0.4288,
      "grad_norm": 3.504378080368042,
      "learning_rate": 6.0141509433962265e-05,
      "loss": 0.3401,
      "step": 1340
    },
    {
      "epoch": 0.432,
      "grad_norm": 4.495083808898926,
      "learning_rate": 5.980458221024259e-05,
      "loss": 0.335,
      "step": 1350
    },
    {
      "epoch": 0.4352,
      "grad_norm": 3.8098154067993164,
      "learning_rate": 5.9467654986522916e-05,
      "loss": 0.3138,
      "step": 1360
    },
    {
      "epoch": 0.4384,
      "grad_norm": 4.181695938110352,
      "learning_rate": 5.913072776280324e-05,
      "loss": 0.3234,
      "step": 1370
    },
    {
      "epoch": 0.4416,
      "grad_norm": 3.0462260246276855,
      "learning_rate": 5.879380053908357e-05,
      "loss": 0.2771,
      "step": 1380
    },
    {
      "epoch": 0.4448,
      "grad_norm": 3.0168416500091553,
      "learning_rate": 5.8456873315363884e-05,
      "loss": 0.3024,
      "step": 1390
    },
    {
      "epoch": 0.448,
      "grad_norm": 4.196632385253906,
      "learning_rate": 5.8119946091644206e-05,
      "loss": 0.307,
      "step": 1400
    },
    {
      "epoch": 0.4512,
      "grad_norm": 3.4224884510040283,
      "learning_rate": 5.7783018867924535e-05,
      "loss": 0.3261,
      "step": 1410
    },
    {
      "epoch": 0.4544,
      "grad_norm": 3.340468645095825,
      "learning_rate": 5.744609164420486e-05,
      "loss": 0.3116,
      "step": 1420
    },
    {
      "epoch": 0.4576,
      "grad_norm": 3.3401427268981934,
      "learning_rate": 5.710916442048517e-05,
      "loss": 0.3037,
      "step": 1430
    },
    {
      "epoch": 0.4608,
      "grad_norm": 3.3940038681030273,
      "learning_rate": 5.6772237196765496e-05,
      "loss": 0.2975,
      "step": 1440
    },
    {
      "epoch": 0.464,
      "grad_norm": 2.9820821285247803,
      "learning_rate": 5.6435309973045825e-05,
      "loss": 0.288,
      "step": 1450
    },
    {
      "epoch": 0.4672,
      "grad_norm": 3.953834056854248,
      "learning_rate": 5.609838274932615e-05,
      "loss": 0.2998,
      "step": 1460
    },
    {
      "epoch": 0.4704,
      "grad_norm": 3.664151906967163,
      "learning_rate": 5.5761455525606476e-05,
      "loss": 0.3029,
      "step": 1470
    },
    {
      "epoch": 0.4736,
      "grad_norm": 5.874343395233154,
      "learning_rate": 5.542452830188679e-05,
      "loss": 0.308,
      "step": 1480
    },
    {
      "epoch": 0.4768,
      "grad_norm": 5.167667388916016,
      "learning_rate": 5.5087601078167114e-05,
      "loss": 0.293,
      "step": 1490
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.0990707874298096,
      "learning_rate": 5.4750673854447444e-05,
      "loss": 0.2736,
      "step": 1500
    },
    {
      "epoch": 0.4832,
      "grad_norm": 3.07232403755188,
      "learning_rate": 5.4413746630727766e-05,
      "loss": 0.2802,
      "step": 1510
    },
    {
      "epoch": 0.4864,
      "grad_norm": 3.340930461883545,
      "learning_rate": 5.407681940700808e-05,
      "loss": 0.2773,
      "step": 1520
    },
    {
      "epoch": 0.4896,
      "grad_norm": 2.88307523727417,
      "learning_rate": 5.373989218328841e-05,
      "loss": 0.2645,
      "step": 1530
    },
    {
      "epoch": 0.4928,
      "grad_norm": 3.0353357791900635,
      "learning_rate": 5.340296495956873e-05,
      "loss": 0.2864,
      "step": 1540
    },
    {
      "epoch": 0.496,
      "grad_norm": 2.4584333896636963,
      "learning_rate": 5.306603773584906e-05,
      "loss": 0.2674,
      "step": 1550
    },
    {
      "epoch": 0.4992,
      "grad_norm": 2.793869972229004,
      "learning_rate": 5.2729110512129385e-05,
      "loss": 0.2425,
      "step": 1560
    },
    {
      "epoch": 0.5024,
      "grad_norm": 2.5717570781707764,
      "learning_rate": 5.23921832884097e-05,
      "loss": 0.245,
      "step": 1570
    },
    {
      "epoch": 0.5056,
      "grad_norm": 2.962009906768799,
      "learning_rate": 5.205525606469003e-05,
      "loss": 0.2553,
      "step": 1580
    },
    {
      "epoch": 0.5088,
      "grad_norm": 2.053489923477173,
      "learning_rate": 5.171832884097035e-05,
      "loss": 0.2338,
      "step": 1590
    },
    {
      "epoch": 0.512,
      "grad_norm": 3.5270988941192627,
      "learning_rate": 5.138140161725068e-05,
      "loss": 0.2349,
      "step": 1600
    },
    {
      "epoch": 0.5152,
      "grad_norm": 3.227064847946167,
      "learning_rate": 5.1044474393531e-05,
      "loss": 0.2475,
      "step": 1610
    },
    {
      "epoch": 0.5184,
      "grad_norm": 3.002271890640259,
      "learning_rate": 5.070754716981132e-05,
      "loss": 0.2264,
      "step": 1620
    },
    {
      "epoch": 0.5216,
      "grad_norm": 2.5545215606689453,
      "learning_rate": 5.037061994609165e-05,
      "loss": 0.2473,
      "step": 1630
    },
    {
      "epoch": 0.5248,
      "grad_norm": 3.1033124923706055,
      "learning_rate": 5.003369272237197e-05,
      "loss": 0.228,
      "step": 1640
    },
    {
      "epoch": 0.528,
      "grad_norm": 2.90221905708313,
      "learning_rate": 4.969676549865229e-05,
      "loss": 0.2337,
      "step": 1650
    },
    {
      "epoch": 0.5312,
      "grad_norm": 3.5613458156585693,
      "learning_rate": 4.9359838274932616e-05,
      "loss": 0.2322,
      "step": 1660
    },
    {
      "epoch": 0.5344,
      "grad_norm": 2.788691282272339,
      "learning_rate": 4.902291105121294e-05,
      "loss": 0.2153,
      "step": 1670
    },
    {
      "epoch": 0.5376,
      "grad_norm": 2.7116010189056396,
      "learning_rate": 4.868598382749327e-05,
      "loss": 0.2249,
      "step": 1680
    },
    {
      "epoch": 0.5408,
      "grad_norm": 2.3744070529937744,
      "learning_rate": 4.834905660377358e-05,
      "loss": 0.2265,
      "step": 1690
    },
    {
      "epoch": 0.544,
      "grad_norm": 2.920132637023926,
      "learning_rate": 4.801212938005391e-05,
      "loss": 0.2132,
      "step": 1700
    },
    {
      "epoch": 0.5472,
      "grad_norm": 2.0210254192352295,
      "learning_rate": 4.7675202156334234e-05,
      "loss": 0.2061,
      "step": 1710
    },
    {
      "epoch": 0.5504,
      "grad_norm": 2.7059290409088135,
      "learning_rate": 4.733827493261456e-05,
      "loss": 0.1875,
      "step": 1720
    },
    {
      "epoch": 0.5536,
      "grad_norm": 2.262721061706543,
      "learning_rate": 4.7001347708894886e-05,
      "loss": 0.2012,
      "step": 1730
    },
    {
      "epoch": 0.5568,
      "grad_norm": 3.484440326690674,
      "learning_rate": 4.66644204851752e-05,
      "loss": 0.2325,
      "step": 1740
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.090310573577881,
      "learning_rate": 4.632749326145553e-05,
      "loss": 0.2051,
      "step": 1750
    },
    {
      "epoch": 0.5632,
      "grad_norm": 2.2044506072998047,
      "learning_rate": 4.5990566037735846e-05,
      "loss": 0.1883,
      "step": 1760
    },
    {
      "epoch": 0.5664,
      "grad_norm": 2.702648878097534,
      "learning_rate": 4.5653638814016176e-05,
      "loss": 0.2145,
      "step": 1770
    },
    {
      "epoch": 0.5696,
      "grad_norm": 9.999999046325684,
      "learning_rate": 4.53167115902965e-05,
      "loss": 0.2058,
      "step": 1780
    },
    {
      "epoch": 0.5728,
      "grad_norm": 5.150343894958496,
      "learning_rate": 4.497978436657682e-05,
      "loss": 0.2181,
      "step": 1790
    },
    {
      "epoch": 0.576,
      "grad_norm": 1.4523239135742188,
      "learning_rate": 4.464285714285715e-05,
      "loss": 0.1986,
      "step": 1800
    },
    {
      "epoch": 0.5792,
      "grad_norm": 2.1557908058166504,
      "learning_rate": 4.4305929919137465e-05,
      "loss": 0.1908,
      "step": 1810
    },
    {
      "epoch": 0.5824,
      "grad_norm": 2.1338348388671875,
      "learning_rate": 4.3969002695417794e-05,
      "loss": 0.185,
      "step": 1820
    },
    {
      "epoch": 0.5856,
      "grad_norm": 2.266662836074829,
      "learning_rate": 4.363207547169812e-05,
      "loss": 0.1899,
      "step": 1830
    },
    {
      "epoch": 0.5888,
      "grad_norm": 1.9031009674072266,
      "learning_rate": 4.329514824797844e-05,
      "loss": 0.1799,
      "step": 1840
    },
    {
      "epoch": 0.592,
      "grad_norm": 1.731602668762207,
      "learning_rate": 4.295822102425876e-05,
      "loss": 0.1965,
      "step": 1850
    },
    {
      "epoch": 0.5952,
      "grad_norm": 1.3521720170974731,
      "learning_rate": 4.2621293800539084e-05,
      "loss": 0.1998,
      "step": 1860
    },
    {
      "epoch": 0.5984,
      "grad_norm": 2.028102397918701,
      "learning_rate": 4.2284366576819406e-05,
      "loss": 0.1873,
      "step": 1870
    },
    {
      "epoch": 0.6016,
      "grad_norm": 3.0926480293273926,
      "learning_rate": 4.1947439353099736e-05,
      "loss": 0.1821,
      "step": 1880
    },
    {
      "epoch": 0.6048,
      "grad_norm": 2.1551952362060547,
      "learning_rate": 4.161051212938006e-05,
      "loss": 0.1737,
      "step": 1890
    },
    {
      "epoch": 0.608,
      "grad_norm": 2.661207437515259,
      "learning_rate": 4.127358490566038e-05,
      "loss": 0.1898,
      "step": 1900
    },
    {
      "epoch": 0.6112,
      "grad_norm": 2.934997081756592,
      "learning_rate": 4.09366576819407e-05,
      "loss": 0.1902,
      "step": 1910
    },
    {
      "epoch": 0.6144,
      "grad_norm": 1.7465407848358154,
      "learning_rate": 4.0599730458221025e-05,
      "loss": 0.1744,
      "step": 1920
    },
    {
      "epoch": 0.6176,
      "grad_norm": 2.341937780380249,
      "learning_rate": 4.026280323450135e-05,
      "loss": 0.1749,
      "step": 1930
    },
    {
      "epoch": 0.6208,
      "grad_norm": 2.418426513671875,
      "learning_rate": 3.992587601078167e-05,
      "loss": 0.164,
      "step": 1940
    },
    {
      "epoch": 0.624,
      "grad_norm": 2.408447265625,
      "learning_rate": 3.9588948787062e-05,
      "loss": 0.1682,
      "step": 1950
    },
    {
      "epoch": 0.6272,
      "grad_norm": 1.976258397102356,
      "learning_rate": 3.9252021563342315e-05,
      "loss": 0.1676,
      "step": 1960
    },
    {
      "epoch": 0.6304,
      "grad_norm": 2.601335287094116,
      "learning_rate": 3.8915094339622644e-05,
      "loss": 0.1572,
      "step": 1970
    },
    {
      "epoch": 0.6336,
      "grad_norm": 2.3064210414886475,
      "learning_rate": 3.8578167115902966e-05,
      "loss": 0.1545,
      "step": 1980
    },
    {
      "epoch": 0.6368,
      "grad_norm": 2.60603666305542,
      "learning_rate": 3.824123989218329e-05,
      "loss": 0.1657,
      "step": 1990
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.5705275535583496,
      "learning_rate": 3.790431266846362e-05,
      "loss": 0.165,
      "step": 2000
    },
    {
      "epoch": 0.6432,
      "grad_norm": 3.4823734760284424,
      "learning_rate": 3.7567385444743934e-05,
      "loss": 0.1705,
      "step": 2010
    },
    {
      "epoch": 0.6464,
      "grad_norm": 1.5383442640304565,
      "learning_rate": 3.723045822102426e-05,
      "loss": 0.1502,
      "step": 2020
    },
    {
      "epoch": 0.6496,
      "grad_norm": 2.4880733489990234,
      "learning_rate": 3.689353099730458e-05,
      "loss": 0.1598,
      "step": 2030
    },
    {
      "epoch": 0.6528,
      "grad_norm": 2.6558523178100586,
      "learning_rate": 3.655660377358491e-05,
      "loss": 0.1582,
      "step": 2040
    },
    {
      "epoch": 0.656,
      "grad_norm": 2.7092580795288086,
      "learning_rate": 3.621967654986524e-05,
      "loss": 0.1596,
      "step": 2050
    },
    {
      "epoch": 0.6592,
      "grad_norm": 2.1501896381378174,
      "learning_rate": 3.588274932614555e-05,
      "loss": 0.1505,
      "step": 2060
    },
    {
      "epoch": 0.6624,
      "grad_norm": 1.9364612102508545,
      "learning_rate": 3.554582210242588e-05,
      "loss": 0.1498,
      "step": 2070
    },
    {
      "epoch": 0.6656,
      "grad_norm": 1.987347960472107,
      "learning_rate": 3.52088948787062e-05,
      "loss": 0.1561,
      "step": 2080
    },
    {
      "epoch": 0.6688,
      "grad_norm": 2.656722068786621,
      "learning_rate": 3.4871967654986526e-05,
      "loss": 0.153,
      "step": 2090
    },
    {
      "epoch": 0.672,
      "grad_norm": 2.4392051696777344,
      "learning_rate": 3.453504043126685e-05,
      "loss": 0.162,
      "step": 2100
    },
    {
      "epoch": 0.6752,
      "grad_norm": 2.220731019973755,
      "learning_rate": 3.419811320754717e-05,
      "loss": 0.1627,
      "step": 2110
    },
    {
      "epoch": 0.6784,
      "grad_norm": 1.5795049667358398,
      "learning_rate": 3.3861185983827494e-05,
      "loss": 0.1652,
      "step": 2120
    },
    {
      "epoch": 0.6816,
      "grad_norm": 2.1749913692474365,
      "learning_rate": 3.3524258760107816e-05,
      "loss": 0.1606,
      "step": 2130
    },
    {
      "epoch": 0.6848,
      "grad_norm": 2.301445960998535,
      "learning_rate": 3.3187331536388145e-05,
      "loss": 0.1461,
      "step": 2140
    },
    {
      "epoch": 0.688,
      "grad_norm": 2.8436062335968018,
      "learning_rate": 3.285040431266847e-05,
      "loss": 0.1498,
      "step": 2150
    },
    {
      "epoch": 0.6912,
      "grad_norm": 1.9361361265182495,
      "learning_rate": 3.251347708894879e-05,
      "loss": 0.1543,
      "step": 2160
    },
    {
      "epoch": 0.6944,
      "grad_norm": 1.9528151750564575,
      "learning_rate": 3.217654986522911e-05,
      "loss": 0.1402,
      "step": 2170
    },
    {
      "epoch": 0.6976,
      "grad_norm": 2.5773980617523193,
      "learning_rate": 3.1839622641509435e-05,
      "loss": 0.1329,
      "step": 2180
    },
    {
      "epoch": 0.7008,
      "grad_norm": 2.697664976119995,
      "learning_rate": 3.150269541778976e-05,
      "loss": 0.1574,
      "step": 2190
    },
    {
      "epoch": 0.704,
      "grad_norm": 2.53369402885437,
      "learning_rate": 3.1165768194070086e-05,
      "loss": 0.1513,
      "step": 2200
    },
    {
      "epoch": 0.7072,
      "grad_norm": 2.3387765884399414,
      "learning_rate": 3.08288409703504e-05,
      "loss": 0.1463,
      "step": 2210
    },
    {
      "epoch": 0.7104,
      "grad_norm": 1.6529033184051514,
      "learning_rate": 3.0491913746630728e-05,
      "loss": 0.1373,
      "step": 2220
    },
    {
      "epoch": 0.7136,
      "grad_norm": 2.692295551300049,
      "learning_rate": 3.0154986522911054e-05,
      "loss": 0.1325,
      "step": 2230
    },
    {
      "epoch": 0.7168,
      "grad_norm": 2.09934401512146,
      "learning_rate": 2.9818059299191376e-05,
      "loss": 0.1484,
      "step": 2240
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.7355453968048096,
      "learning_rate": 2.9481132075471702e-05,
      "loss": 0.1341,
      "step": 2250
    },
    {
      "epoch": 0.7232,
      "grad_norm": 1.8107614517211914,
      "learning_rate": 2.914420485175202e-05,
      "loss": 0.1475,
      "step": 2260
    },
    {
      "epoch": 0.7264,
      "grad_norm": 1.958337426185608,
      "learning_rate": 2.8807277628032347e-05,
      "loss": 0.1438,
      "step": 2270
    },
    {
      "epoch": 0.7296,
      "grad_norm": 1.456838607788086,
      "learning_rate": 2.847035040431267e-05,
      "loss": 0.1395,
      "step": 2280
    },
    {
      "epoch": 0.7328,
      "grad_norm": 1.997288703918457,
      "learning_rate": 2.8133423180592995e-05,
      "loss": 0.1329,
      "step": 2290
    },
    {
      "epoch": 0.736,
      "grad_norm": 2.1655972003936768,
      "learning_rate": 2.7796495956873314e-05,
      "loss": 0.1357,
      "step": 2300
    },
    {
      "epoch": 0.7392,
      "grad_norm": 1.8002907037734985,
      "learning_rate": 2.745956873315364e-05,
      "loss": 0.1444,
      "step": 2310
    },
    {
      "epoch": 0.7424,
      "grad_norm": 2.862435817718506,
      "learning_rate": 2.7122641509433965e-05,
      "loss": 0.1455,
      "step": 2320
    },
    {
      "epoch": 0.7456,
      "grad_norm": 1.9556299448013306,
      "learning_rate": 2.6785714285714288e-05,
      "loss": 0.1455,
      "step": 2330
    },
    {
      "epoch": 0.7488,
      "grad_norm": 2.489349365234375,
      "learning_rate": 2.6448787061994614e-05,
      "loss": 0.1359,
      "step": 2340
    },
    {
      "epoch": 0.752,
      "grad_norm": 2.1165363788604736,
      "learning_rate": 2.6111859838274933e-05,
      "loss": 0.1448,
      "step": 2350
    },
    {
      "epoch": 0.7552,
      "grad_norm": 1.813704013824463,
      "learning_rate": 2.577493261455526e-05,
      "loss": 0.1341,
      "step": 2360
    },
    {
      "epoch": 0.7584,
      "grad_norm": 2.7160651683807373,
      "learning_rate": 2.5438005390835577e-05,
      "loss": 0.1255,
      "step": 2370
    },
    {
      "epoch": 0.7616,
      "grad_norm": 1.3045297861099243,
      "learning_rate": 2.5101078167115903e-05,
      "loss": 0.1334,
      "step": 2380
    },
    {
      "epoch": 0.7648,
      "grad_norm": 2.3194830417633057,
      "learning_rate": 2.476415094339623e-05,
      "loss": 0.1538,
      "step": 2390
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.9577467441558838,
      "learning_rate": 2.442722371967655e-05,
      "loss": 0.1407,
      "step": 2400
    },
    {
      "epoch": 0.7712,
      "grad_norm": 1.4686191082000732,
      "learning_rate": 2.4090296495956874e-05,
      "loss": 0.1273,
      "step": 2410
    },
    {
      "epoch": 0.7744,
      "grad_norm": 2.1956889629364014,
      "learning_rate": 2.3753369272237196e-05,
      "loss": 0.1386,
      "step": 2420
    },
    {
      "epoch": 0.7776,
      "grad_norm": 1.6341290473937988,
      "learning_rate": 2.341644204851752e-05,
      "loss": 0.1298,
      "step": 2430
    },
    {
      "epoch": 0.7808,
      "grad_norm": 2.429835081100464,
      "learning_rate": 2.3079514824797844e-05,
      "loss": 0.1283,
      "step": 2440
    },
    {
      "epoch": 0.784,
      "grad_norm": 9.999998092651367,
      "learning_rate": 2.274258760107817e-05,
      "loss": 0.2458,
      "step": 2450
    },
    {
      "epoch": 0.7872,
      "grad_norm": 1.7494025230407715,
      "learning_rate": 2.2405660377358493e-05,
      "loss": 0.1338,
      "step": 2460
    },
    {
      "epoch": 0.7904,
      "grad_norm": 2.2596070766448975,
      "learning_rate": 2.2068733153638815e-05,
      "loss": 0.1208,
      "step": 2470
    },
    {
      "epoch": 0.7936,
      "grad_norm": 2.0361554622650146,
      "learning_rate": 2.1731805929919137e-05,
      "loss": 0.1403,
      "step": 2480
    },
    {
      "epoch": 0.7968,
      "grad_norm": 2.3333733081817627,
      "learning_rate": 2.1394878706199463e-05,
      "loss": 0.1222,
      "step": 2490
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.4406805038452148,
      "learning_rate": 2.1057951482479785e-05,
      "loss": 0.1317,
      "step": 2500
    },
    {
      "epoch": 0.8032,
      "grad_norm": 1.7497354745864868,
      "learning_rate": 2.0721024258760108e-05,
      "loss": 0.129,
      "step": 2510
    },
    {
      "epoch": 0.8064,
      "grad_norm": 1.8802416324615479,
      "learning_rate": 2.038409703504043e-05,
      "loss": 0.1305,
      "step": 2520
    },
    {
      "epoch": 0.8096,
      "grad_norm": 1.733115792274475,
      "learning_rate": 2.0047169811320756e-05,
      "loss": 0.1266,
      "step": 2530
    },
    {
      "epoch": 0.8128,
      "grad_norm": 2.586423397064209,
      "learning_rate": 1.971024258760108e-05,
      "loss": 0.1336,
      "step": 2540
    },
    {
      "epoch": 0.816,
      "grad_norm": 1.6050171852111816,
      "learning_rate": 1.9373315363881404e-05,
      "loss": 0.134,
      "step": 2550
    },
    {
      "epoch": 0.8192,
      "grad_norm": 1.717938780784607,
      "learning_rate": 1.9036388140161727e-05,
      "loss": 0.1252,
      "step": 2560
    },
    {
      "epoch": 0.8224,
      "grad_norm": 1.871017575263977,
      "learning_rate": 1.869946091644205e-05,
      "loss": 0.1217,
      "step": 2570
    },
    {
      "epoch": 0.8256,
      "grad_norm": 2.6902308464050293,
      "learning_rate": 1.836253369272237e-05,
      "loss": 0.1235,
      "step": 2580
    },
    {
      "epoch": 0.8288,
      "grad_norm": 1.9302878379821777,
      "learning_rate": 1.8025606469002694e-05,
      "loss": 0.1303,
      "step": 2590
    },
    {
      "epoch": 0.832,
      "grad_norm": 2.0468838214874268,
      "learning_rate": 1.768867924528302e-05,
      "loss": 0.1176,
      "step": 2600
    },
    {
      "epoch": 0.8352,
      "grad_norm": 1.396986722946167,
      "learning_rate": 1.7351752021563345e-05,
      "loss": 0.1163,
      "step": 2610
    },
    {
      "epoch": 0.8384,
      "grad_norm": 1.742895245552063,
      "learning_rate": 1.7014824797843668e-05,
      "loss": 0.118,
      "step": 2620
    },
    {
      "epoch": 0.8416,
      "grad_norm": 1.870067834854126,
      "learning_rate": 1.667789757412399e-05,
      "loss": 0.1229,
      "step": 2630
    },
    {
      "epoch": 0.8448,
      "grad_norm": 1.7094252109527588,
      "learning_rate": 1.6340970350404313e-05,
      "loss": 0.1384,
      "step": 2640
    },
    {
      "epoch": 0.848,
      "grad_norm": 2.3981940746307373,
      "learning_rate": 1.600404312668464e-05,
      "loss": 0.1107,
      "step": 2650
    },
    {
      "epoch": 0.8512,
      "grad_norm": 2.565001964569092,
      "learning_rate": 1.566711590296496e-05,
      "loss": 0.1152,
      "step": 2660
    },
    {
      "epoch": 0.8544,
      "grad_norm": 2.4875590801239014,
      "learning_rate": 1.5330188679245283e-05,
      "loss": 0.1227,
      "step": 2670
    },
    {
      "epoch": 0.8576,
      "grad_norm": 2.2726962566375732,
      "learning_rate": 1.4993261455525606e-05,
      "loss": 0.1172,
      "step": 2680
    },
    {
      "epoch": 0.8608,
      "grad_norm": 1.7912529706954956,
      "learning_rate": 1.465633423180593e-05,
      "loss": 0.1124,
      "step": 2690
    },
    {
      "epoch": 0.864,
      "grad_norm": 1.7430084943771362,
      "learning_rate": 1.4319407008086256e-05,
      "loss": 0.1215,
      "step": 2700
    },
    {
      "epoch": 0.8672,
      "grad_norm": 2.2106008529663086,
      "learning_rate": 1.3982479784366578e-05,
      "loss": 0.1185,
      "step": 2710
    },
    {
      "epoch": 0.8704,
      "grad_norm": 2.9677484035491943,
      "learning_rate": 1.3645552560646902e-05,
      "loss": 0.1241,
      "step": 2720
    },
    {
      "epoch": 0.8736,
      "grad_norm": 1.9449735879898071,
      "learning_rate": 1.3308625336927224e-05,
      "loss": 0.1173,
      "step": 2730
    },
    {
      "epoch": 0.8768,
      "grad_norm": 2.0924923419952393,
      "learning_rate": 1.2971698113207547e-05,
      "loss": 0.1253,
      "step": 2740
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.4807486534118652,
      "learning_rate": 1.2634770889487871e-05,
      "loss": 0.109,
      "step": 2750
    },
    {
      "epoch": 0.8832,
      "grad_norm": 2.2792012691497803,
      "learning_rate": 1.2297843665768195e-05,
      "loss": 0.1183,
      "step": 2760
    },
    {
      "epoch": 0.8864,
      "grad_norm": 1.7402423620224,
      "learning_rate": 1.1960916442048519e-05,
      "loss": 0.1028,
      "step": 2770
    },
    {
      "epoch": 0.8896,
      "grad_norm": 1.4183549880981445,
      "learning_rate": 1.1623989218328842e-05,
      "loss": 0.109,
      "step": 2780
    },
    {
      "epoch": 0.8928,
      "grad_norm": 2.241910934448242,
      "learning_rate": 1.1287061994609164e-05,
      "loss": 0.1084,
      "step": 2790
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.7856048345565796,
      "learning_rate": 1.0950134770889488e-05,
      "loss": 0.1181,
      "step": 2800
    },
    {
      "epoch": 0.8992,
      "grad_norm": 1.8856292963027954,
      "learning_rate": 1.0613207547169812e-05,
      "loss": 0.1205,
      "step": 2810
    },
    {
      "epoch": 0.9024,
      "grad_norm": 2.5832862854003906,
      "learning_rate": 1.0276280323450135e-05,
      "loss": 0.1255,
      "step": 2820
    },
    {
      "epoch": 0.9056,
      "grad_norm": 1.7440129518508911,
      "learning_rate": 9.939353099730459e-06,
      "loss": 0.1186,
      "step": 2830
    },
    {
      "epoch": 0.9088,
      "grad_norm": 1.7217051982879639,
      "learning_rate": 9.602425876010781e-06,
      "loss": 0.1159,
      "step": 2840
    },
    {
      "epoch": 0.912,
      "grad_norm": 2.0266754627227783,
      "learning_rate": 9.265498652291107e-06,
      "loss": 0.1112,
      "step": 2850
    },
    {
      "epoch": 0.9152,
      "grad_norm": 1.816345453262329,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.1225,
      "step": 2860
    },
    {
      "epoch": 0.9184,
      "grad_norm": 1.9036134481430054,
      "learning_rate": 8.591644204851752e-06,
      "loss": 0.1215,
      "step": 2870
    },
    {
      "epoch": 0.9216,
      "grad_norm": 1.8858239650726318,
      "learning_rate": 8.254716981132076e-06,
      "loss": 0.1015,
      "step": 2880
    },
    {
      "epoch": 0.9248,
      "grad_norm": 1.9040420055389404,
      "learning_rate": 7.9177897574124e-06,
      "loss": 0.1115,
      "step": 2890
    },
    {
      "epoch": 0.928,
      "grad_norm": 2.292043447494507,
      "learning_rate": 7.580862533692723e-06,
      "loss": 0.1208,
      "step": 2900
    },
    {
      "epoch": 0.9312,
      "grad_norm": 2.022998809814453,
      "learning_rate": 7.243935309973046e-06,
      "loss": 0.1041,
      "step": 2910
    },
    {
      "epoch": 0.9344,
      "grad_norm": 2.045623779296875,
      "learning_rate": 6.90700808625337e-06,
      "loss": 0.1115,
      "step": 2920
    },
    {
      "epoch": 0.9376,
      "grad_norm": 1.6464377641677856,
      "learning_rate": 6.570080862533692e-06,
      "loss": 0.112,
      "step": 2930
    },
    {
      "epoch": 0.9408,
      "grad_norm": 1.6890789270401,
      "learning_rate": 6.233153638814016e-06,
      "loss": 0.1081,
      "step": 2940
    },
    {
      "epoch": 0.944,
      "grad_norm": 2.4978396892547607,
      "learning_rate": 5.89622641509434e-06,
      "loss": 0.1147,
      "step": 2950
    },
    {
      "epoch": 0.9472,
      "grad_norm": 1.9748504161834717,
      "learning_rate": 5.5592991913746634e-06,
      "loss": 0.1019,
      "step": 2960
    },
    {
      "epoch": 0.9504,
      "grad_norm": 2.1371240615844727,
      "learning_rate": 5.222371967654987e-06,
      "loss": 0.1114,
      "step": 2970
    },
    {
      "epoch": 0.9536,
      "grad_norm": 2.8856780529022217,
      "learning_rate": 4.88544474393531e-06,
      "loss": 0.1142,
      "step": 2980
    },
    {
      "epoch": 0.9568,
      "grad_norm": 1.8940197229385376,
      "learning_rate": 4.548517520215634e-06,
      "loss": 0.1004,
      "step": 2990
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.2457224130630493,
      "learning_rate": 4.211590296495957e-06,
      "loss": 0.1063,
      "step": 3000
    },
    {
      "epoch": 0.9632,
      "grad_norm": 3.265352725982666,
      "learning_rate": 3.8746630727762805e-06,
      "loss": 0.1196,
      "step": 3010
    },
    {
      "epoch": 0.9664,
      "grad_norm": 2.0506467819213867,
      "learning_rate": 3.5377358490566038e-06,
      "loss": 0.1153,
      "step": 3020
    },
    {
      "epoch": 0.9696,
      "grad_norm": 1.4757201671600342,
      "learning_rate": 3.200808625336928e-06,
      "loss": 0.1068,
      "step": 3030
    },
    {
      "epoch": 0.9728,
      "grad_norm": 2.700068473815918,
      "learning_rate": 2.8638814016172507e-06,
      "loss": 0.1119,
      "step": 3040
    },
    {
      "epoch": 0.976,
      "grad_norm": 2.0698773860931396,
      "learning_rate": 2.5269541778975744e-06,
      "loss": 0.1051,
      "step": 3050
    },
    {
      "epoch": 0.9792,
      "grad_norm": 1.6540876626968384,
      "learning_rate": 2.1900269541778976e-06,
      "loss": 0.1101,
      "step": 3060
    },
    {
      "epoch": 0.9824,
      "grad_norm": 1.8899681568145752,
      "learning_rate": 1.853099730458221e-06,
      "loss": 0.1118,
      "step": 3070
    },
    {
      "epoch": 0.9856,
      "grad_norm": 1.6223585605621338,
      "learning_rate": 1.5161725067385445e-06,
      "loss": 0.1087,
      "step": 3080
    },
    {
      "epoch": 0.9888,
      "grad_norm": 1.9171777963638306,
      "learning_rate": 1.179245283018868e-06,
      "loss": 0.1058,
      "step": 3090
    },
    {
      "epoch": 0.992,
      "grad_norm": 1.4021532535552979,
      "learning_rate": 8.423180592991913e-07,
      "loss": 0.1044,
      "step": 3100
    },
    {
      "epoch": 0.9952,
      "grad_norm": 2.6420481204986572,
      "learning_rate": 5.053908355795148e-07,
      "loss": 0.1223,
      "step": 3110
    },
    {
      "epoch": 0.9984,
      "grad_norm": 1.7279895544052124,
      "learning_rate": 1.684636118598383e-07,
      "loss": 0.1092,
      "step": 3120
    }
  ],
  "logging_steps": 10,
  "max_steps": 3125,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}