{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 3125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0032,
      "grad_norm": 10.000000953674316,
      "learning_rate": 6.369426751592357e-06,
      "loss": 24.9631,
      "step": 10
    },
    {
      "epoch": 0.0064,
      "grad_norm": 10.0,
      "learning_rate": 1.2738853503184714e-05,
      "loss": 20.8406,
      "step": 20
    },
    {
      "epoch": 0.0096,
      "grad_norm": 10.0,
      "learning_rate": 1.910828025477707e-05,
      "loss": 18.8175,
      "step": 30
    },
    {
      "epoch": 0.0128,
      "grad_norm": 10.000000953674316,
      "learning_rate": 2.5477707006369428e-05,
      "loss": 20.6395,
      "step": 40
    },
    {
      "epoch": 0.016,
      "grad_norm": 9.999998092651367,
      "learning_rate": 3.184713375796178e-05,
      "loss": 17.492,
      "step": 50
    },
    {
      "epoch": 0.0192,
      "grad_norm": 10.0,
      "learning_rate": 3.821656050955414e-05,
      "loss": 15.7832,
      "step": 60
    },
    {
      "epoch": 0.0224,
      "grad_norm": 10.0,
      "learning_rate": 4.45859872611465e-05,
      "loss": 14.5836,
      "step": 70
    },
    {
      "epoch": 0.0256,
      "grad_norm": 9.999999046325684,
      "learning_rate": 5.0955414012738855e-05,
      "loss": 12.4576,
      "step": 80
    },
    {
      "epoch": 0.0288,
      "grad_norm": 10.0,
      "learning_rate": 5.732484076433121e-05,
      "loss": 10.0393,
      "step": 90
    },
    {
      "epoch": 0.032,
      "grad_norm": 9.999999046325684,
      "learning_rate": 6.369426751592356e-05,
      "loss": 7.721,
      "step": 100
    },
    {
      "epoch": 0.0352,
      "grad_norm": 10.000000953674316,
      "learning_rate": 7.006369426751592e-05,
      "loss": 7.7382,
      "step": 110
    },
    {
      "epoch": 0.0384,
      "grad_norm": 9.999999046325684,
      "learning_rate": 7.643312101910829e-05,
      "loss": 5.6715,
      "step": 120
    },
    {
      "epoch": 0.0416,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.280254777070065e-05,
      "loss": 5.0697,
      "step": 130
    },
    {
      "epoch": 0.0448,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.9171974522293e-05,
      "loss": 4.6669,
      "step": 140
    },
    {
      "epoch": 0.048,
      "grad_norm": 10.0,
      "learning_rate": 9.554140127388536e-05,
      "loss": 4.5226,
      "step": 150
    },
    {
      "epoch": 0.0512,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.989892183288411e-05,
      "loss": 3.7166,
      "step": 160
    },
    {
      "epoch": 0.0544,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.956199460916442e-05,
      "loss": 3.6585,
      "step": 170
    },
    {
      "epoch": 0.0576,
      "grad_norm": 9.999999046325684,
      "learning_rate": 9.922506738544474e-05,
      "loss": 3.0612,
      "step": 180
    },
    {
      "epoch": 0.0608,
      "grad_norm": 10.0,
      "learning_rate": 9.888814016172507e-05,
      "loss": 3.1613,
      "step": 190
    },
    {
      "epoch": 0.064,
      "grad_norm": 9.628771781921387,
      "learning_rate": 9.85512129380054e-05,
      "loss": 3.059,
      "step": 200
    },
    {
      "epoch": 0.0672,
      "grad_norm": 9.586786270141602,
      "learning_rate": 9.821428571428572e-05,
      "loss": 2.7733,
      "step": 210
    },
    {
      "epoch": 0.0704,
      "grad_norm": 9.884883880615234,
      "learning_rate": 9.787735849056603e-05,
      "loss": 2.6752,
      "step": 220
    },
    {
      "epoch": 0.0736,
      "grad_norm": 8.088533401489258,
      "learning_rate": 9.754043126684636e-05,
      "loss": 2.7686,
      "step": 230
    },
    {
      "epoch": 0.0768,
      "grad_norm": 7.879763603210449,
      "learning_rate": 9.720350404312669e-05,
      "loss": 2.463,
      "step": 240
    },
    {
      "epoch": 0.08,
      "grad_norm": 10.0,
      "learning_rate": 9.686657681940702e-05,
      "loss": 2.2555,
      "step": 250
    },
    {
      "epoch": 0.0832,
      "grad_norm": 8.265613555908203,
      "learning_rate": 9.652964959568734e-05,
      "loss": 2.4123,
      "step": 260
    },
    {
      "epoch": 0.0864,
      "grad_norm": 8.316008567810059,
      "learning_rate": 9.619272237196765e-05,
      "loss": 2.1918,
      "step": 270
    },
    {
      "epoch": 0.0896,
      "grad_norm": 9.114105224609375,
      "learning_rate": 9.585579514824798e-05,
      "loss": 2.2272,
      "step": 280
    },
    {
      "epoch": 0.0928,
      "grad_norm": 9.300642013549805,
      "learning_rate": 9.551886792452831e-05,
      "loss": 1.9593,
      "step": 290
    },
    {
      "epoch": 0.096,
      "grad_norm": 9.566454887390137,
      "learning_rate": 9.518194070080863e-05,
      "loss": 2.0377,
      "step": 300
    },
    {
      "epoch": 0.0992,
      "grad_norm": 8.621969223022461,
      "learning_rate": 9.484501347708896e-05,
      "loss": 1.9443,
      "step": 310
    },
    {
      "epoch": 0.1024,
      "grad_norm": 7.225710391998291,
      "learning_rate": 9.450808625336927e-05,
      "loss": 1.8093,
      "step": 320
    },
    {
      "epoch": 0.1056,
      "grad_norm": 9.70196533203125,
      "learning_rate": 9.41711590296496e-05,
      "loss": 1.9922,
      "step": 330
    },
    {
      "epoch": 0.1088,
      "grad_norm": 7.131272792816162,
      "learning_rate": 9.383423180592993e-05,
      "loss": 1.8382,
      "step": 340
    },
    {
      "epoch": 0.112,
      "grad_norm": 7.785069465637207,
      "learning_rate": 9.349730458221025e-05,
      "loss": 1.7659,
      "step": 350
    },
    {
      "epoch": 0.1152,
      "grad_norm": 8.18884563446045,
      "learning_rate": 9.316037735849057e-05,
      "loss": 1.6541,
      "step": 360
    },
    {
      "epoch": 0.1184,
      "grad_norm": 8.7020263671875,
      "learning_rate": 9.282345013477089e-05,
      "loss": 1.6964,
      "step": 370
    },
    {
      "epoch": 0.1216,
      "grad_norm": 9.419608116149902,
      "learning_rate": 9.248652291105122e-05,
      "loss": 1.7058,
      "step": 380
    },
    {
      "epoch": 0.1248,
      "grad_norm": 4.900589942932129,
      "learning_rate": 9.214959568733154e-05,
      "loss": 1.5773,
      "step": 390
    },
    {
      "epoch": 0.128,
      "grad_norm": 9.17340087890625,
      "learning_rate": 9.181266846361186e-05,
      "loss": 1.7163,
      "step": 400
    },
    {
      "epoch": 0.1312,
      "grad_norm": 7.4362616539001465,
      "learning_rate": 9.14757412398922e-05,
      "loss": 1.6505,
      "step": 410
    },
    {
      "epoch": 0.1344,
      "grad_norm": 6.498612880706787,
      "learning_rate": 9.113881401617251e-05,
      "loss": 1.5706,
      "step": 420
    },
    {
      "epoch": 0.1376,
      "grad_norm": 7.84408712387085,
      "learning_rate": 9.080188679245284e-05,
      "loss": 1.5137,
      "step": 430
    },
    {
      "epoch": 0.1408,
      "grad_norm": 9.464787483215332,
      "learning_rate": 9.046495956873315e-05,
      "loss": 1.5546,
      "step": 440
    },
    {
      "epoch": 0.144,
      "grad_norm": 8.63725757598877,
      "learning_rate": 9.012803234501348e-05,
      "loss": 1.4404,
      "step": 450
    },
    {
      "epoch": 0.1472,
      "grad_norm": 9.00522232055664,
      "learning_rate": 8.979110512129381e-05,
      "loss": 1.3584,
      "step": 460
    },
    {
      "epoch": 0.1504,
      "grad_norm": 6.751660346984863,
      "learning_rate": 8.945417789757413e-05,
      "loss": 1.2826,
      "step": 470
    },
    {
      "epoch": 0.1536,
      "grad_norm": 9.651211738586426,
      "learning_rate": 8.911725067385444e-05,
      "loss": 1.2767,
      "step": 480
    },
    {
      "epoch": 0.1568,
      "grad_norm": 7.526703357696533,
      "learning_rate": 8.878032345013477e-05,
      "loss": 1.3371,
      "step": 490
    },
    {
      "epoch": 0.16,
      "grad_norm": 5.82845401763916,
      "learning_rate": 8.84433962264151e-05,
      "loss": 1.2118,
      "step": 500
    },
    {
      "epoch": 0.1632,
      "grad_norm": 6.827119827270508,
      "learning_rate": 8.810646900269543e-05,
      "loss": 1.0983,
      "step": 510
    },
    {
      "epoch": 0.1664,
      "grad_norm": 7.042975425720215,
      "learning_rate": 8.776954177897575e-05,
      "loss": 1.1228,
      "step": 520
    },
    {
      "epoch": 0.1696,
      "grad_norm": 6.176724910736084,
      "learning_rate": 8.743261455525606e-05,
      "loss": 0.9704,
      "step": 530
    },
    {
      "epoch": 0.1728,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.709568733153639e-05,
      "loss": 1.183,
      "step": 540
    },
    {
      "epoch": 0.176,
      "grad_norm": 5.635732650756836,
      "learning_rate": 8.675876010781672e-05,
      "loss": 1.0296,
      "step": 550
    },
    {
      "epoch": 0.1792,
      "grad_norm": 9.842893600463867,
      "learning_rate": 8.642183288409704e-05,
      "loss": 0.9987,
      "step": 560
    },
    {
      "epoch": 0.1824,
      "grad_norm": 8.999287605285645,
      "learning_rate": 8.608490566037735e-05,
      "loss": 0.9455,
      "step": 570
    },
    {
      "epoch": 0.1856,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.574797843665768e-05,
      "loss": 1.0243,
      "step": 580
    },
    {
      "epoch": 0.1888,
      "grad_norm": 6.9205827713012695,
      "learning_rate": 8.541105121293801e-05,
      "loss": 0.9884,
      "step": 590
    },
    {
      "epoch": 0.192,
      "grad_norm": 9.999999046325684,
      "learning_rate": 8.507412398921834e-05,
      "loss": 0.977,
      "step": 600
    },
    {
      "epoch": 0.1952,
      "grad_norm": 9.999998092651367,
      "learning_rate": 8.473719676549866e-05,
      "loss": 0.9394,
      "step": 610
    },
    {
      "epoch": 0.1984,
      "grad_norm": 9.98512077331543,
      "learning_rate": 8.440026954177897e-05,
      "loss": 0.8862,
      "step": 620
    },
    {
      "epoch": 0.2016,
      "grad_norm": 6.723015785217285,
      "learning_rate": 8.40633423180593e-05,
      "loss": 0.9152,
      "step": 630
    },
    {
      "epoch": 0.2048,
      "grad_norm": 5.47459602355957,
      "learning_rate": 8.372641509433963e-05,
      "loss": 0.7909,
      "step": 640
    },
    {
      "epoch": 0.208,
      "grad_norm": 5.321897029876709,
      "learning_rate": 8.338948787061996e-05,
      "loss": 0.8646,
      "step": 650
    },
    {
      "epoch": 0.2112,
      "grad_norm": 7.815572738647461,
      "learning_rate": 8.305256064690027e-05,
      "loss": 0.8641,
      "step": 660
    },
    {
      "epoch": 0.2144,
      "grad_norm": 4.958643436431885,
      "learning_rate": 8.271563342318059e-05,
      "loss": 0.7373,
      "step": 670
    },
    {
      "epoch": 0.2176,
      "grad_norm": 6.5276875495910645,
      "learning_rate": 8.237870619946092e-05,
      "loss": 0.698,
      "step": 680
    },
    {
      "epoch": 0.2208,
      "grad_norm": 6.228632926940918,
      "learning_rate": 8.204177897574125e-05,
      "loss": 0.784,
      "step": 690
    },
    {
      "epoch": 0.224,
      "grad_norm": 7.006032943725586,
      "learning_rate": 8.170485175202158e-05,
      "loss": 0.6606,
      "step": 700
    },
    {
      "epoch": 0.2272,
      "grad_norm": 6.825442790985107,
      "learning_rate": 8.136792452830189e-05,
      "loss": 0.7276,
      "step": 710
    },
    {
      "epoch": 0.2304,
      "grad_norm": 6.571051120758057,
      "learning_rate": 8.103099730458221e-05,
      "loss": 0.6912,
      "step": 720
    },
    {
      "epoch": 0.2336,
      "grad_norm": 4.326778411865234,
      "learning_rate": 8.069407008086254e-05,
      "loss": 0.7122,
      "step": 730
    },
    {
      "epoch": 0.2368,
      "grad_norm": 4.319221019744873,
      "learning_rate": 8.035714285714287e-05,
      "loss": 0.6474,
      "step": 740
    },
    {
      "epoch": 0.24,
      "grad_norm": 5.887426853179932,
      "learning_rate": 8.002021563342318e-05,
      "loss": 0.6999,
      "step": 750
    },
    {
      "epoch": 0.2432,
      "grad_norm": 4.543856620788574,
      "learning_rate": 7.968328840970351e-05,
      "loss": 0.6467,
      "step": 760
    },
    {
      "epoch": 0.2464,
      "grad_norm": 4.395698070526123,
      "learning_rate": 7.934636118598383e-05,
      "loss": 0.6638,
      "step": 770
    },
    {
      "epoch": 0.2496,
      "grad_norm": 6.596084117889404,
      "learning_rate": 7.900943396226416e-05,
      "loss": 0.5995,
      "step": 780
    },
    {
      "epoch": 0.2528,
      "grad_norm": 8.952577590942383,
      "learning_rate": 7.867250673854449e-05,
      "loss": 0.5944,
      "step": 790
    },
    {
      "epoch": 0.256,
      "grad_norm": 8.518202781677246,
      "learning_rate": 7.83355795148248e-05,
      "loss": 0.6457,
      "step": 800
    },
    {
      "epoch": 0.2592,
      "grad_norm": 8.593810081481934,
      "learning_rate": 7.799865229110512e-05,
      "loss": 0.6516,
      "step": 810
    },
    {
      "epoch": 0.2624,
      "grad_norm": 4.971993923187256,
      "learning_rate": 7.766172506738545e-05,
      "loss": 0.6265,
      "step": 820
    },
    {
      "epoch": 0.2656,
      "grad_norm": 7.201608657836914,
      "learning_rate": 7.732479784366577e-05,
      "loss": 0.5941,
      "step": 830
    },
    {
      "epoch": 0.2688,
      "grad_norm": 5.481573104858398,
      "learning_rate": 7.69878706199461e-05,
      "loss": 0.5637,
      "step": 840
    },
    {
      "epoch": 0.272,
      "grad_norm": 5.261704444885254,
      "learning_rate": 7.665094339622642e-05,
      "loss": 0.5177,
      "step": 850
    },
    {
      "epoch": 0.2752,
      "grad_norm": 6.122642993927002,
      "learning_rate": 7.631401617250674e-05,
      "loss": 0.5685,
      "step": 860
    },
    {
      "epoch": 0.2784,
      "grad_norm": 4.627248764038086,
      "learning_rate": 7.597708894878706e-05,
      "loss": 0.5585,
      "step": 870
    },
    {
      "epoch": 0.2816,
      "grad_norm": 4.712826728820801,
      "learning_rate": 7.56401617250674e-05,
      "loss": 0.5825,
      "step": 880
    },
    {
      "epoch": 0.2848,
      "grad_norm": 7.80577278137207,
      "learning_rate": 7.530323450134771e-05,
      "loss": 0.5572,
      "step": 890
    },
    {
      "epoch": 0.288,
      "grad_norm": 5.428683280944824,
      "learning_rate": 7.496630727762804e-05,
      "loss": 0.5302,
      "step": 900
    },
    {
      "epoch": 0.2912,
      "grad_norm": 6.566429615020752,
      "learning_rate": 7.462938005390835e-05,
      "loss": 0.5665,
      "step": 910
    },
    {
      "epoch": 0.2944,
      "grad_norm": 7.354272365570068,
      "learning_rate": 7.429245283018868e-05,
      "loss": 0.5625,
      "step": 920
    },
    {
      "epoch": 0.2976,
      "grad_norm": 5.946643352508545,
      "learning_rate": 7.395552560646901e-05,
      "loss": 0.5421,
      "step": 930
    },
    {
      "epoch": 0.3008,
      "grad_norm": 6.19100284576416,
      "learning_rate": 7.361859838274933e-05,
      "loss": 0.4312,
      "step": 940
    },
    {
      "epoch": 0.304,
      "grad_norm": 4.216638565063477,
      "learning_rate": 7.328167115902966e-05,
      "loss": 0.474,
      "step": 950
    },
    {
      "epoch": 0.3072,
      "grad_norm": 4.427928447723389,
      "learning_rate": 7.294474393530997e-05,
      "loss": 0.465,
      "step": 960
    },
    {
      "epoch": 0.3104,
      "grad_norm": 4.779841423034668,
      "learning_rate": 7.26078167115903e-05,
      "loss": 0.4665,
      "step": 970
    },
    {
      "epoch": 0.3136,
      "grad_norm": 5.669528484344482,
      "learning_rate": 7.227088948787062e-05,
      "loss": 0.48,
      "step": 980
    },
    {
      "epoch": 0.3168,
      "grad_norm": 7.017117500305176,
      "learning_rate": 7.193396226415095e-05,
      "loss": 0.51,
      "step": 990
    },
    {
      "epoch": 0.32,
      "grad_norm": 7.059611797332764,
      "learning_rate": 7.159703504043128e-05,
      "loss": 0.5019,
      "step": 1000
    },
    {
      "epoch": 0.3232,
      "grad_norm": 4.046283721923828,
      "learning_rate": 7.126010781671159e-05,
      "loss": 0.4544,
      "step": 1010
    },
    {
      "epoch": 0.3264,
      "grad_norm": 4.116858005523682,
      "learning_rate": 7.092318059299192e-05,
      "loss": 0.4836,
      "step": 1020
    },
    {
      "epoch": 0.3296,
      "grad_norm": 4.138635635375977,
      "learning_rate": 7.058625336927224e-05,
      "loss": 0.4327,
      "step": 1030
    },
    {
      "epoch": 0.3328,
      "grad_norm": 5.3947577476501465,
      "learning_rate": 7.024932614555257e-05,
      "loss": 0.4197,
      "step": 1040
    },
    {
      "epoch": 0.336,
      "grad_norm": 4.98423957824707,
      "learning_rate": 6.99123989218329e-05,
      "loss": 0.4455,
      "step": 1050
    },
    {
      "epoch": 0.3392,
      "grad_norm": 4.391843318939209,
      "learning_rate": 6.957547169811321e-05,
      "loss": 0.4911,
      "step": 1060
    },
    {
      "epoch": 0.3424,
      "grad_norm": 3.1840767860412598,
      "learning_rate": 6.923854447439353e-05,
      "loss": 0.4314,
      "step": 1070
    },
    {
      "epoch": 0.3456,
      "grad_norm": 4.046396732330322,
      "learning_rate": 6.890161725067386e-05,
      "loss": 0.4114,
      "step": 1080
    },
    {
      "epoch": 0.3488,
      "grad_norm": 5.266067981719971,
      "learning_rate": 6.856469002695418e-05,
      "loss": 0.4386,
      "step": 1090
    },
    {
      "epoch": 0.352,
      "grad_norm": 5.4654951095581055,
      "learning_rate": 6.822776280323451e-05,
      "loss": 0.4145,
      "step": 1100
    },
    {
      "epoch": 0.3552,
      "grad_norm": 3.72259521484375,
      "learning_rate": 6.789083557951483e-05,
      "loss": 0.4039,
      "step": 1110
    },
    {
      "epoch": 0.3584,
      "grad_norm": 4.96457052230835,
      "learning_rate": 6.755390835579514e-05,
      "loss": 0.3989,
      "step": 1120
    },
    {
      "epoch": 0.3616,
      "grad_norm": 3.831160068511963,
      "learning_rate": 6.721698113207547e-05,
      "loss": 0.3456,
      "step": 1130
    },
    {
      "epoch": 0.3648,
      "grad_norm": 4.052297115325928,
      "learning_rate": 6.68800539083558e-05,
      "loss": 0.3827,
      "step": 1140
    },
    {
      "epoch": 0.368,
      "grad_norm": 5.16130256652832,
      "learning_rate": 6.654312668463612e-05,
      "loss": 0.3745,
      "step": 1150
    },
    {
      "epoch": 0.3712,
      "grad_norm": 4.293144226074219,
      "learning_rate": 6.620619946091643e-05,
      "loss": 0.36,
      "step": 1160
    },
    {
      "epoch": 0.3744,
      "grad_norm": 2.9887006282806396,
      "learning_rate": 6.586927223719676e-05,
      "loss": 0.3906,
      "step": 1170
    },
    {
      "epoch": 0.3776,
      "grad_norm": 5.04908561706543,
      "learning_rate": 6.553234501347709e-05,
      "loss": 0.3718,
      "step": 1180
    },
    {
      "epoch": 0.3808,
      "grad_norm": 4.748033046722412,
      "learning_rate": 6.519541778975742e-05,
      "loss": 0.3667,
      "step": 1190
    },
    {
      "epoch": 0.384,
      "grad_norm": 6.720622539520264,
      "learning_rate": 6.485849056603774e-05,
      "loss": 0.3187,
      "step": 1200
    },
    {
      "epoch": 0.3872,
      "grad_norm": 5.302358150482178,
      "learning_rate": 6.452156334231805e-05,
      "loss": 0.3489,
      "step": 1210
    },
    {
      "epoch": 0.3904,
      "grad_norm": 3.59417462348938,
      "learning_rate": 6.418463611859838e-05,
      "loss": 0.3638,
      "step": 1220
    },
    {
      "epoch": 0.3936,
      "grad_norm": 6.106253147125244,
      "learning_rate": 6.384770889487871e-05,
      "loss": 0.3533,
      "step": 1230
    },
    {
      "epoch": 0.3968,
      "grad_norm": 3.0482683181762695,
      "learning_rate": 6.351078167115904e-05,
      "loss": 0.3413,
      "step": 1240
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.1857500076293945,
      "learning_rate": 6.317385444743936e-05,
      "loss": 0.3503,
      "step": 1250
    },
    {
      "epoch": 0.4032,
      "grad_norm": 3.9182565212249756,
      "learning_rate": 6.283692722371967e-05,
      "loss": 0.3224,
      "step": 1260
    },
    {
      "epoch": 0.4064,
      "grad_norm": 3.7868616580963135,
      "learning_rate": 6.25e-05,
      "loss": 0.3237,
      "step": 1270
    },
    {
      "epoch": 0.4096,
      "grad_norm": 3.272305965423584,
      "learning_rate": 6.216307277628033e-05,
      "loss": 0.3375,
      "step": 1280
    },
    {
      "epoch": 0.4128,
      "grad_norm": 3.8790831565856934,
      "learning_rate": 6.182614555256066e-05,
      "loss": 0.3339,
      "step": 1290
    },
    {
      "epoch": 0.416,
      "grad_norm": 4.695404052734375,
      "learning_rate": 6.148921832884098e-05,
      "loss": 0.3089,
      "step": 1300
    },
    {
      "epoch": 0.4192,
      "grad_norm": 3.2050020694732666,
      "learning_rate": 6.115229110512129e-05,
      "loss": 0.3289,
      "step": 1310
    },
    {
      "epoch": 0.4224,
      "grad_norm": 3.027517557144165,
      "learning_rate": 6.081536388140162e-05,
      "loss": 0.3361,
      "step": 1320
    },
    {
      "epoch": 0.4256,
      "grad_norm": 3.260744571685791,
      "learning_rate": 6.047843665768195e-05,
      "loss": 0.3012,
      "step": 1330
    },
    {
      "epoch": 0.4288,
      "grad_norm": 2.5760984420776367,
      "learning_rate": 6.0141509433962265e-05,
      "loss": 0.2947,
      "step": 1340
    },
    {
      "epoch": 0.432,
      "grad_norm": 3.7423956394195557,
      "learning_rate": 5.980458221024259e-05,
      "loss": 0.2798,
      "step": 1350
    },
    {
      "epoch": 0.4352,
      "grad_norm": 2.8880527019500732,
      "learning_rate": 5.9467654986522916e-05,
      "loss": 0.2573,
      "step": 1360
    },
    {
      "epoch": 0.4384,
      "grad_norm": 3.848341464996338,
      "learning_rate": 5.913072776280324e-05,
      "loss": 0.2884,
      "step": 1370
    },
    {
      "epoch": 0.4416,
      "grad_norm": 2.9584686756134033,
      "learning_rate": 5.879380053908357e-05,
      "loss": 0.26,
      "step": 1380
    },
    {
      "epoch": 0.4448,
      "grad_norm": 3.3850250244140625,
      "learning_rate": 5.8456873315363884e-05,
      "loss": 0.2591,
      "step": 1390
    },
    {
      "epoch": 0.448,
      "grad_norm": 4.0127387046813965,
      "learning_rate": 5.8119946091644206e-05,
      "loss": 0.275,
      "step": 1400
    },
    {
      "epoch": 0.4512,
      "grad_norm": 3.7733118534088135,
      "learning_rate": 5.7783018867924535e-05,
      "loss": 0.2892,
      "step": 1410
    },
    {
      "epoch": 0.4544,
      "grad_norm": 2.6666953563690186,
      "learning_rate": 5.744609164420486e-05,
      "loss": 0.2691,
      "step": 1420
    },
    {
      "epoch": 0.4576,
      "grad_norm": 3.1808416843414307,
      "learning_rate": 5.710916442048517e-05,
      "loss": 0.2745,
      "step": 1430
    },
    {
      "epoch": 0.4608,
      "grad_norm": 4.617330074310303,
      "learning_rate": 5.6772237196765496e-05,
      "loss": 0.2663,
      "step": 1440
    },
    {
      "epoch": 0.464,
      "grad_norm": 3.740236520767212,
      "learning_rate": 5.6435309973045825e-05,
      "loss": 0.2528,
      "step": 1450
    },
    {
      "epoch": 0.4672,
      "grad_norm": 3.2529256343841553,
      "learning_rate": 5.609838274932615e-05,
      "loss": 0.2554,
      "step": 1460
    },
    {
      "epoch": 0.4704,
      "grad_norm": 3.2568247318267822,
      "learning_rate": 5.5761455525606476e-05,
      "loss": 0.2588,
      "step": 1470
    },
    {
      "epoch": 0.4736,
      "grad_norm": 5.33850622177124,
      "learning_rate": 5.542452830188679e-05,
      "loss": 0.262,
      "step": 1480
    },
    {
      "epoch": 0.4768,
      "grad_norm": 5.042779922485352,
      "learning_rate": 5.5087601078167114e-05,
      "loss": 0.2468,
      "step": 1490
    },
    {
      "epoch": 0.48,
      "grad_norm": 2.649498224258423,
      "learning_rate": 5.4750673854447444e-05,
      "loss": 0.2623,
      "step": 1500
    },
    {
      "epoch": 0.4832,
      "grad_norm": 3.46178936958313,
      "learning_rate": 5.4413746630727766e-05,
      "loss": 0.2409,
      "step": 1510
    },
    {
      "epoch": 0.4864,
      "grad_norm": 4.172382831573486,
      "learning_rate": 5.407681940700808e-05,
      "loss": 0.2404,
      "step": 1520
    },
    {
      "epoch": 0.4896,
      "grad_norm": 2.367910385131836,
      "learning_rate": 5.373989218328841e-05,
      "loss": 0.2308,
      "step": 1530
    },
    {
      "epoch": 0.4928,
      "grad_norm": 2.469956398010254,
      "learning_rate": 5.340296495956873e-05,
      "loss": 0.2362,
      "step": 1540
    },
    {
      "epoch": 0.496,
      "grad_norm": 2.4397411346435547,
      "learning_rate": 5.306603773584906e-05,
      "loss": 0.2457,
      "step": 1550
    },
    {
      "epoch": 0.4992,
      "grad_norm": 2.6383445262908936,
      "learning_rate": 5.2729110512129385e-05,
      "loss": 0.214,
      "step": 1560
    },
    {
      "epoch": 0.5024,
      "grad_norm": 2.170884609222412,
      "learning_rate": 5.23921832884097e-05,
      "loss": 0.2055,
      "step": 1570
    },
    {
      "epoch": 0.5056,
      "grad_norm": 2.846895217895508,
      "learning_rate": 5.205525606469003e-05,
      "loss": 0.2212,
      "step": 1580
    },
    {
      "epoch": 0.5088,
      "grad_norm": 2.712366819381714,
      "learning_rate": 5.171832884097035e-05,
      "loss": 0.204,
      "step": 1590
    },
    {
      "epoch": 0.512,
      "grad_norm": 2.447471857070923,
      "learning_rate": 5.138140161725068e-05,
      "loss": 0.2047,
      "step": 1600
    },
    {
      "epoch": 0.5152,
      "grad_norm": 2.5376291275024414,
      "learning_rate": 5.1044474393531e-05,
      "loss": 0.2323,
      "step": 1610
    },
    {
      "epoch": 0.5184,
      "grad_norm": 3.2458057403564453,
      "learning_rate": 5.070754716981132e-05,
      "loss": 0.2043,
      "step": 1620
    },
    {
      "epoch": 0.5216,
      "grad_norm": 2.34938645362854,
      "learning_rate": 5.037061994609165e-05,
      "loss": 0.2185,
      "step": 1630
    },
    {
      "epoch": 0.5248,
      "grad_norm": 3.2154054641723633,
      "learning_rate": 5.003369272237197e-05,
      "loss": 0.1977,
      "step": 1640
    },
    {
      "epoch": 0.528,
      "grad_norm": 2.5128464698791504,
      "learning_rate": 4.969676549865229e-05,
      "loss": 0.1998,
      "step": 1650
    },
    {
      "epoch": 0.5312,
      "grad_norm": 3.6363720893859863,
      "learning_rate": 4.9359838274932616e-05,
      "loss": 0.1978,
      "step": 1660
    },
    {
      "epoch": 0.5344,
      "grad_norm": 1.650388479232788,
      "learning_rate": 4.902291105121294e-05,
      "loss": 0.1849,
      "step": 1670
    },
    {
      "epoch": 0.5376,
      "grad_norm": 2.6597306728363037,
      "learning_rate": 4.868598382749327e-05,
      "loss": 0.1973,
      "step": 1680
    },
    {
      "epoch": 0.5408,
      "grad_norm": 2.410196304321289,
      "learning_rate": 4.834905660377358e-05,
      "loss": 0.2012,
      "step": 1690
    },
    {
      "epoch": 0.544,
      "grad_norm": 2.417191743850708,
      "learning_rate": 4.801212938005391e-05,
      "loss": 0.1887,
      "step": 1700
    },
    {
      "epoch": 0.5472,
      "grad_norm": 1.77193021774292,
      "learning_rate": 4.7675202156334234e-05,
      "loss": 0.1812,
      "step": 1710
    },
    {
      "epoch": 0.5504,
      "grad_norm": 2.4603259563446045,
      "learning_rate": 4.733827493261456e-05,
      "loss": 0.1729,
      "step": 1720
    },
    {
      "epoch": 0.5536,
      "grad_norm": 2.3087384700775146,
      "learning_rate": 4.7001347708894886e-05,
      "loss": 0.1799,
      "step": 1730
    },
    {
      "epoch": 0.5568,
      "grad_norm": 3.3030126094818115,
      "learning_rate": 4.66644204851752e-05,
      "loss": 0.1971,
      "step": 1740
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.3733720779418945,
      "learning_rate": 4.632749326145553e-05,
      "loss": 0.1855,
      "step": 1750
    },
    {
      "epoch": 0.5632,
      "grad_norm": 1.909800410270691,
      "learning_rate": 4.5990566037735846e-05,
      "loss": 0.1619,
      "step": 1760
    },
    {
      "epoch": 0.5664,
      "grad_norm": 3.0450034141540527,
      "learning_rate": 4.5653638814016176e-05,
      "loss": 0.1987,
      "step": 1770
    },
    {
      "epoch": 0.5696,
      "grad_norm": 7.974597454071045,
      "learning_rate": 4.53167115902965e-05,
      "loss": 0.1902,
      "step": 1780
    },
    {
      "epoch": 0.5728,
      "grad_norm": 3.595928907394409,
      "learning_rate": 4.497978436657682e-05,
      "loss": 0.1881,
      "step": 1790
    },
    {
      "epoch": 0.576,
      "grad_norm": 1.6789438724517822,
      "learning_rate": 4.464285714285715e-05,
      "loss": 0.1781,
      "step": 1800
    },
    {
      "epoch": 0.5792,
      "grad_norm": 2.4755332469940186,
      "learning_rate": 4.4305929919137465e-05,
      "loss": 0.1682,
      "step": 1810
    },
    {
      "epoch": 0.5824,
      "grad_norm": 2.7560534477233887,
      "learning_rate": 4.3969002695417794e-05,
      "loss": 0.1677,
      "step": 1820
    },
    {
      "epoch": 0.5856,
      "grad_norm": 2.2446439266204834,
      "learning_rate": 4.363207547169812e-05,
      "loss": 0.1755,
      "step": 1830
    },
    {
      "epoch": 0.5888,
      "grad_norm": 2.204357624053955,
      "learning_rate": 4.329514824797844e-05,
      "loss": 0.1704,
      "step": 1840
    },
    {
      "epoch": 0.592,
      "grad_norm": 2.05080509185791,
      "learning_rate": 4.295822102425876e-05,
      "loss": 0.1628,
      "step": 1850
    },
    {
      "epoch": 0.5952,
      "grad_norm": 1.5275906324386597,
      "learning_rate": 4.2621293800539084e-05,
      "loss": 0.1726,
      "step": 1860
    },
    {
      "epoch": 0.5984,
      "grad_norm": 2.030513048171997,
      "learning_rate": 4.2284366576819406e-05,
      "loss": 0.1674,
      "step": 1870
    },
    {
      "epoch": 0.6016,
      "grad_norm": 2.6978938579559326,
      "learning_rate": 4.1947439353099736e-05,
      "loss": 0.1648,
      "step": 1880
    },
    {
      "epoch": 0.6048,
      "grad_norm": 1.7440263032913208,
      "learning_rate": 4.161051212938006e-05,
      "loss": 0.1557,
      "step": 1890
    },
    {
      "epoch": 0.608,
      "grad_norm": 2.075267791748047,
      "learning_rate": 4.127358490566038e-05,
      "loss": 0.1636,
      "step": 1900
    },
    {
      "epoch": 0.6112,
      "grad_norm": 2.1358184814453125,
      "learning_rate": 4.09366576819407e-05,
      "loss": 0.1657,
      "step": 1910
    },
    {
      "epoch": 0.6144,
      "grad_norm": 1.5332633256912231,
      "learning_rate": 4.0599730458221025e-05,
      "loss": 0.1564,
      "step": 1920
    },
    {
      "epoch": 0.6176,
      "grad_norm": 1.9167696237564087,
      "learning_rate": 4.026280323450135e-05,
      "loss": 0.1505,
      "step": 1930
    },
    {
      "epoch": 0.6208,
      "grad_norm": 3.0243899822235107,
      "learning_rate": 3.992587601078167e-05,
      "loss": 0.15,
      "step": 1940
    },
    {
      "epoch": 0.624,
      "grad_norm": 2.9686858654022217,
      "learning_rate": 3.9588948787062e-05,
      "loss": 0.1503,
      "step": 1950
    },
    {
      "epoch": 0.6272,
      "grad_norm": 2.0411455631256104,
      "learning_rate": 3.9252021563342315e-05,
      "loss": 0.1537,
      "step": 1960
    },
    {
      "epoch": 0.6304,
      "grad_norm": 2.8846511840820312,
      "learning_rate": 3.8915094339622644e-05,
      "loss": 0.1447,
      "step": 1970
    },
    {
      "epoch": 0.6336,
      "grad_norm": 2.098325490951538,
      "learning_rate": 3.8578167115902966e-05,
      "loss": 0.1507,
      "step": 1980
    },
    {
      "epoch": 0.6368,
      "grad_norm": 2.713503360748291,
      "learning_rate": 3.824123989218329e-05,
      "loss": 0.1505,
      "step": 1990
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.709930658340454,
      "learning_rate": 3.790431266846362e-05,
      "loss": 0.1551,
      "step": 2000
    },
    {
      "epoch": 0.6432,
      "grad_norm": 3.630295991897583,
      "learning_rate": 3.7567385444743934e-05,
      "loss": 0.1526,
      "step": 2010
    },
    {
      "epoch": 0.6464,
      "grad_norm": 1.6541911363601685,
      "learning_rate": 3.723045822102426e-05,
      "loss": 0.1369,
      "step": 2020
    },
    {
      "epoch": 0.6496,
      "grad_norm": 2.212904214859009,
      "learning_rate": 3.689353099730458e-05,
      "loss": 0.1445,
      "step": 2030
    },
    {
      "epoch": 0.6528,
      "grad_norm": 2.0813424587249756,
      "learning_rate": 3.655660377358491e-05,
      "loss": 0.1463,
      "step": 2040
    },
    {
      "epoch": 0.656,
      "grad_norm": 3.0553150177001953,
      "learning_rate": 3.621967654986524e-05,
      "loss": 0.1441,
      "step": 2050
    },
    {
      "epoch": 0.6592,
      "grad_norm": 2.165588140487671,
      "learning_rate": 3.588274932614555e-05,
      "loss": 0.1386,
      "step": 2060
    },
    {
      "epoch": 0.6624,
      "grad_norm": 1.6694422960281372,
      "learning_rate": 3.554582210242588e-05,
      "loss": 0.1401,
      "step": 2070
    },
    {
      "epoch": 0.6656,
      "grad_norm": 1.8330750465393066,
      "learning_rate": 3.52088948787062e-05,
      "loss": 0.1426,
      "step": 2080
    },
    {
      "epoch": 0.6688,
      "grad_norm": 1.9006876945495605,
      "learning_rate": 3.4871967654986526e-05,
      "loss": 0.1325,
      "step": 2090
    },
    {
      "epoch": 0.672,
      "grad_norm": 2.488478660583496,
      "learning_rate": 3.453504043126685e-05,
      "loss": 0.1477,
      "step": 2100
    },
    {
      "epoch": 0.6752,
      "grad_norm": 1.8582606315612793,
      "learning_rate": 3.419811320754717e-05,
      "loss": 0.1551,
      "step": 2110
    },
    {
      "epoch": 0.6784,
      "grad_norm": 1.7370256185531616,
      "learning_rate": 3.3861185983827494e-05,
      "loss": 0.1527,
      "step": 2120
    },
    {
      "epoch": 0.6816,
      "grad_norm": 1.926459550857544,
      "learning_rate": 3.3524258760107816e-05,
      "loss": 0.1459,
      "step": 2130
    },
    {
      "epoch": 0.6848,
      "grad_norm": 1.9653120040893555,
      "learning_rate": 3.3187331536388145e-05,
      "loss": 0.1399,
      "step": 2140
    },
    {
      "epoch": 0.688,
      "grad_norm": 2.491975784301758,
      "learning_rate": 3.285040431266847e-05,
      "loss": 0.1409,
      "step": 2150
    },
    {
      "epoch": 0.6912,
      "grad_norm": 2.253796339035034,
      "learning_rate": 3.251347708894879e-05,
      "loss": 0.1386,
      "step": 2160
    },
    {
      "epoch": 0.6944,
      "grad_norm": 2.142688512802124,
      "learning_rate": 3.217654986522911e-05,
      "loss": 0.1254,
      "step": 2170
    },
    {
      "epoch": 0.6976,
      "grad_norm": 2.1504247188568115,
      "learning_rate": 3.1839622641509435e-05,
      "loss": 0.1182,
      "step": 2180
    },
    {
      "epoch": 0.7008,
      "grad_norm": 2.1450459957122803,
      "learning_rate": 3.150269541778976e-05,
      "loss": 0.1517,
      "step": 2190
    },
    {
      "epoch": 0.704,
      "grad_norm": 2.323805570602417,
      "learning_rate": 3.1165768194070086e-05,
      "loss": 0.1485,
      "step": 2200
    },
    {
      "epoch": 0.7072,
      "grad_norm": 2.709195613861084,
      "learning_rate": 3.08288409703504e-05,
      "loss": 0.1369,
      "step": 2210
    },
    {
      "epoch": 0.7104,
      "grad_norm": 1.8090829849243164,
      "learning_rate": 3.0491913746630728e-05,
      "loss": 0.1317,
      "step": 2220
    },
    {
      "epoch": 0.7136,
      "grad_norm": 2.5093414783477783,
      "learning_rate": 3.0154986522911054e-05,
      "loss": 0.1217,
      "step": 2230
    },
    {
      "epoch": 0.7168,
      "grad_norm": 1.427565336227417,
      "learning_rate": 2.9818059299191376e-05,
      "loss": 0.1335,
      "step": 2240
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.4537243843078613,
      "learning_rate": 2.9481132075471702e-05,
      "loss": 0.1264,
      "step": 2250
    },
    {
      "epoch": 0.7232,
      "grad_norm": 1.7222257852554321,
      "learning_rate": 2.914420485175202e-05,
      "loss": 0.1357,
      "step": 2260
    },
    {
      "epoch": 0.7264,
      "grad_norm": 1.5699914693832397,
      "learning_rate": 2.8807277628032347e-05,
      "loss": 0.1282,
      "step": 2270
    },
    {
      "epoch": 0.7296,
      "grad_norm": 1.597873330116272,
      "learning_rate": 2.847035040431267e-05,
      "loss": 0.1346,
      "step": 2280
    },
    {
      "epoch": 0.7328,
      "grad_norm": 1.831920862197876,
      "learning_rate": 2.8133423180592995e-05,
      "loss": 0.1236,
      "step": 2290
    },
    {
      "epoch": 0.736,
      "grad_norm": 1.5934253931045532,
      "learning_rate": 2.7796495956873314e-05,
      "loss": 0.1307,
      "step": 2300
    },
    {
      "epoch": 0.7392,
      "grad_norm": 1.5124200582504272,
      "learning_rate": 2.745956873315364e-05,
      "loss": 0.1345,
      "step": 2310
    },
    {
      "epoch": 0.7424,
      "grad_norm": 2.2041866779327393,
      "learning_rate": 2.7122641509433965e-05,
      "loss": 0.1336,
      "step": 2320
    },
    {
      "epoch": 0.7456,
      "grad_norm": 1.9419604539871216,
      "learning_rate": 2.6785714285714288e-05,
      "loss": 0.1355,
      "step": 2330
    },
    {
      "epoch": 0.7488,
      "grad_norm": 2.395643949508667,
      "learning_rate": 2.6448787061994614e-05,
      "loss": 0.128,
      "step": 2340
    },
    {
      "epoch": 0.752,
      "grad_norm": 1.6096221208572388,
      "learning_rate": 2.6111859838274933e-05,
      "loss": 0.1278,
      "step": 2350
    },
    {
      "epoch": 0.7552,
      "grad_norm": 1.5739423036575317,
      "learning_rate": 2.577493261455526e-05,
      "loss": 0.1153,
      "step": 2360
    },
    {
      "epoch": 0.7584,
      "grad_norm": 2.158097982406616,
      "learning_rate": 2.5438005390835577e-05,
      "loss": 0.1172,
      "step": 2370
    },
    {
      "epoch": 0.7616,
      "grad_norm": 1.5097861289978027,
      "learning_rate": 2.5101078167115903e-05,
      "loss": 0.1262,
      "step": 2380
    },
    {
      "epoch": 0.7648,
      "grad_norm": 1.9915529489517212,
      "learning_rate": 2.476415094339623e-05,
      "loss": 0.1316,
      "step": 2390
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.4936026334762573,
      "learning_rate": 2.442722371967655e-05,
      "loss": 0.1271,
      "step": 2400
    },
    {
      "epoch": 0.7712,
      "grad_norm": 1.2583588361740112,
      "learning_rate": 2.4090296495956874e-05,
      "loss": 0.1237,
      "step": 2410
    },
    {
      "epoch": 0.7744,
      "grad_norm": 1.693161964416504,
      "learning_rate": 2.3753369272237196e-05,
      "loss": 0.1276,
      "step": 2420
    },
    {
      "epoch": 0.7776,
      "grad_norm": 1.8149479627609253,
      "learning_rate": 2.341644204851752e-05,
      "loss": 0.1126,
      "step": 2430
    },
    {
      "epoch": 0.7808,
      "grad_norm": 1.5526076555252075,
      "learning_rate": 2.3079514824797844e-05,
      "loss": 0.1128,
      "step": 2440
    },
    {
      "epoch": 0.784,
      "grad_norm": 8.050654411315918,
      "learning_rate": 2.274258760107817e-05,
      "loss": 0.3024,
      "step": 2450
    },
    {
      "epoch": 0.7872,
      "grad_norm": 1.6099882125854492,
      "learning_rate": 2.2405660377358493e-05,
      "loss": 0.1187,
      "step": 2460
    },
    {
      "epoch": 0.7904,
      "grad_norm": 1.8025749921798706,
      "learning_rate": 2.2068733153638815e-05,
      "loss": 0.1098,
      "step": 2470
    },
    {
      "epoch": 0.7936,
      "grad_norm": 1.7324920892715454,
      "learning_rate": 2.1731805929919137e-05,
      "loss": 0.1292,
      "step": 2480
    },
    {
      "epoch": 0.7968,
      "grad_norm": 2.0919933319091797,
      "learning_rate": 2.1394878706199463e-05,
      "loss": 0.114,
      "step": 2490
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.464938998222351,
      "learning_rate": 2.1057951482479785e-05,
      "loss": 0.1258,
      "step": 2500
    },
    {
      "epoch": 0.8032,
      "grad_norm": 1.8821276426315308,
      "learning_rate": 2.0721024258760108e-05,
      "loss": 0.118,
      "step": 2510
    },
    {
      "epoch": 0.8064,
      "grad_norm": 1.4615644216537476,
      "learning_rate": 2.038409703504043e-05,
      "loss": 0.1147,
      "step": 2520
    },
    {
      "epoch": 0.8096,
      "grad_norm": 1.6426907777786255,
      "learning_rate": 2.0047169811320756e-05,
      "loss": 0.1143,
      "step": 2530
    },
    {
      "epoch": 0.8128,
      "grad_norm": 2.4523229598999023,
      "learning_rate": 1.971024258760108e-05,
      "loss": 0.1135,
      "step": 2540
    },
    {
      "epoch": 0.816,
      "grad_norm": 1.6785757541656494,
      "learning_rate": 1.9373315363881404e-05,
      "loss": 0.1245,
      "step": 2550
    },
    {
      "epoch": 0.8192,
      "grad_norm": 1.503734827041626,
      "learning_rate": 1.9036388140161727e-05,
      "loss": 0.1156,
      "step": 2560
    },
    {
      "epoch": 0.8224,
      "grad_norm": 1.5107955932617188,
      "learning_rate": 1.869946091644205e-05,
      "loss": 0.1136,
      "step": 2570
    },
    {
      "epoch": 0.8256,
      "grad_norm": 2.7832164764404297,
      "learning_rate": 1.836253369272237e-05,
      "loss": 0.1151,
      "step": 2580
    },
    {
      "epoch": 0.8288,
      "grad_norm": 1.5118536949157715,
      "learning_rate": 1.8025606469002694e-05,
      "loss": 0.1147,
      "step": 2590
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.9129774570465088,
      "learning_rate": 1.768867924528302e-05,
      "loss": 0.1062,
      "step": 2600
    },
    {
      "epoch": 0.8352,
      "grad_norm": 1.3641362190246582,
      "learning_rate": 1.7351752021563345e-05,
      "loss": 0.1119,
      "step": 2610
    },
    {
      "epoch": 0.8384,
      "grad_norm": 1.5150868892669678,
      "learning_rate": 1.7014824797843668e-05,
      "loss": 0.1047,
      "step": 2620
    },
    {
      "epoch": 0.8416,
      "grad_norm": 1.4528048038482666,
      "learning_rate": 1.667789757412399e-05,
      "loss": 0.1165,
      "step": 2630
    },
    {
      "epoch": 0.8448,
      "grad_norm": 1.6915483474731445,
      "learning_rate": 1.6340970350404313e-05,
      "loss": 0.1238,
      "step": 2640
    },
    {
      "epoch": 0.848,
      "grad_norm": 2.2077555656433105,
      "learning_rate": 1.600404312668464e-05,
      "loss": 0.1022,
      "step": 2650
    },
    {
      "epoch": 0.8512,
      "grad_norm": 3.6118428707122803,
      "learning_rate": 1.566711590296496e-05,
      "loss": 0.1087,
      "step": 2660
    },
    {
      "epoch": 0.8544,
      "grad_norm": 2.0867350101470947,
      "learning_rate": 1.5330188679245283e-05,
      "loss": 0.1102,
      "step": 2670
    },
    {
      "epoch": 0.8576,
      "grad_norm": 2.128396511077881,
      "learning_rate": 1.4993261455525606e-05,
      "loss": 0.1058,
      "step": 2680
    },
    {
      "epoch": 0.8608,
      "grad_norm": 1.7146245241165161,
      "learning_rate": 1.465633423180593e-05,
      "loss": 0.1063,
      "step": 2690
    },
    {
      "epoch": 0.864,
      "grad_norm": 2.5430219173431396,
      "learning_rate": 1.4319407008086256e-05,
      "loss": 0.118,
      "step": 2700
    },
    {
      "epoch": 0.8672,
      "grad_norm": 1.9589694738388062,
      "learning_rate": 1.3982479784366578e-05,
      "loss": 0.1072,
      "step": 2710
    },
    {
      "epoch": 0.8704,
      "grad_norm": 2.741550922393799,
      "learning_rate": 1.3645552560646902e-05,
      "loss": 0.1298,
      "step": 2720
    },
    {
      "epoch": 0.8736,
      "grad_norm": 2.003413677215576,
      "learning_rate": 1.3308625336927224e-05,
      "loss": 0.1033,
      "step": 2730
    },
    {
      "epoch": 0.8768,
      "grad_norm": 1.9745001792907715,
      "learning_rate": 1.2971698113207547e-05,
      "loss": 0.1115,
      "step": 2740
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.7120094299316406,
      "learning_rate": 1.2634770889487871e-05,
      "loss": 0.1042,
      "step": 2750
    },
    {
      "epoch": 0.8832,
      "grad_norm": 2.5961086750030518,
      "learning_rate": 1.2297843665768195e-05,
      "loss": 0.1105,
      "step": 2760
    },
    {
      "epoch": 0.8864,
      "grad_norm": 1.581310510635376,
      "learning_rate": 1.1960916442048519e-05,
      "loss": 0.1019,
      "step": 2770
    },
    {
      "epoch": 0.8896,
      "grad_norm": 1.422912836074829,
      "learning_rate": 1.1623989218328842e-05,
      "loss": 0.0936,
      "step": 2780
    },
    {
      "epoch": 0.8928,
      "grad_norm": 1.8189693689346313,
      "learning_rate": 1.1287061994609164e-05,
      "loss": 0.1019,
      "step": 2790
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.2943332195281982,
      "learning_rate": 1.0950134770889488e-05,
      "loss": 0.1014,
      "step": 2800
    },
    {
      "epoch": 0.8992,
      "grad_norm": 1.7528398036956787,
      "learning_rate": 1.0613207547169812e-05,
      "loss": 0.1075,
      "step": 2810
    },
    {
      "epoch": 0.9024,
      "grad_norm": 2.2377705574035645,
      "learning_rate": 1.0276280323450135e-05,
      "loss": 0.1206,
      "step": 2820
    },
    {
      "epoch": 0.9056,
      "grad_norm": 1.8247408866882324,
      "learning_rate": 9.939353099730459e-06,
      "loss": 0.1096,
      "step": 2830
    },
    {
      "epoch": 0.9088,
      "grad_norm": 1.8777835369110107,
      "learning_rate": 9.602425876010781e-06,
      "loss": 0.1104,
      "step": 2840
    },
    {
      "epoch": 0.912,
      "grad_norm": 1.9983779191970825,
      "learning_rate": 9.265498652291107e-06,
      "loss": 0.1041,
      "step": 2850
    },
    {
      "epoch": 0.9152,
      "grad_norm": 1.6120233535766602,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.1067,
      "step": 2860
    },
    {
      "epoch": 0.9184,
      "grad_norm": 1.5283420085906982,
      "learning_rate": 8.591644204851752e-06,
      "loss": 0.1083,
      "step": 2870
    },
    {
      "epoch": 0.9216,
      "grad_norm": 1.3929494619369507,
      "learning_rate": 8.254716981132076e-06,
      "loss": 0.1004,
      "step": 2880
    },
    {
      "epoch": 0.9248,
      "grad_norm": 1.3750373125076294,
      "learning_rate": 7.9177897574124e-06,
      "loss": 0.101,
      "step": 2890
    },
    {
      "epoch": 0.928,
      "grad_norm": 1.5251433849334717,
      "learning_rate": 7.580862533692723e-06,
      "loss": 0.1053,
      "step": 2900
    },
    {
      "epoch": 0.9312,
      "grad_norm": 1.520762324333191,
      "learning_rate": 7.243935309973046e-06,
      "loss": 0.0976,
      "step": 2910
    },
    {
      "epoch": 0.9344,
      "grad_norm": 1.3639968633651733,
      "learning_rate": 6.90700808625337e-06,
      "loss": 0.0997,
      "step": 2920
    },
    {
      "epoch": 0.9376,
      "grad_norm": 1.41064453125,
      "learning_rate": 6.570080862533692e-06,
      "loss": 0.0947,
      "step": 2930
    },
    {
      "epoch": 0.9408,
      "grad_norm": 1.3565398454666138,
      "learning_rate": 6.233153638814016e-06,
      "loss": 0.1059,
      "step": 2940
    },
    {
      "epoch": 0.944,
      "grad_norm": 3.0992274284362793,
      "learning_rate": 5.89622641509434e-06,
      "loss": 0.1064,
      "step": 2950
    },
    {
      "epoch": 0.9472,
      "grad_norm": 2.12271785736084,
      "learning_rate": 5.5592991913746634e-06,
      "loss": 0.0936,
      "step": 2960
    },
    {
      "epoch": 0.9504,
      "grad_norm": 2.1983823776245117,
      "learning_rate": 5.222371967654987e-06,
      "loss": 0.1025,
      "step": 2970
    },
    {
      "epoch": 0.9536,
      "grad_norm": 2.5740675926208496,
      "learning_rate": 4.88544474393531e-06,
      "loss": 0.1032,
      "step": 2980
    },
    {
      "epoch": 0.9568,
      "grad_norm": 2.195711851119995,
      "learning_rate": 4.548517520215634e-06,
      "loss": 0.0982,
      "step": 2990
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.1637061834335327,
      "learning_rate": 4.211590296495957e-06,
      "loss": 0.0955,
      "step": 3000
    },
    {
      "epoch": 0.9632,
      "grad_norm": 2.551534414291382,
      "learning_rate": 3.8746630727762805e-06,
      "loss": 0.1055,
      "step": 3010
    },
    {
      "epoch": 0.9664,
      "grad_norm": 1.514855980873108,
      "learning_rate": 3.5377358490566038e-06,
      "loss": 0.1045,
      "step": 3020
    },
    {
      "epoch": 0.9696,
      "grad_norm": 1.692115068435669,
      "learning_rate": 3.200808625336928e-06,
      "loss": 0.098,
      "step": 3030
    },
    {
      "epoch": 0.9728,
      "grad_norm": 2.3038151264190674,
      "learning_rate": 2.8638814016172507e-06,
      "loss": 0.104,
      "step": 3040
    },
    {
      "epoch": 0.976,
      "grad_norm": 2.007957935333252,
      "learning_rate": 2.5269541778975744e-06,
      "loss": 0.0956,
      "step": 3050
    },
    {
      "epoch": 0.9792,
      "grad_norm": 1.778769612312317,
      "learning_rate": 2.1900269541778976e-06,
      "loss": 0.1019,
      "step": 3060
    },
    {
      "epoch": 0.9824,
      "grad_norm": 1.8261606693267822,
      "learning_rate": 1.853099730458221e-06,
      "loss": 0.1024,
      "step": 3070
    },
    {
      "epoch": 0.9856,
      "grad_norm": 1.3561267852783203,
      "learning_rate": 1.5161725067385445e-06,
      "loss": 0.0995,
      "step": 3080
    },
    {
      "epoch": 0.9888,
      "grad_norm": 1.6872135400772095,
      "learning_rate": 1.179245283018868e-06,
      "loss": 0.0984,
      "step": 3090
    },
    {
      "epoch": 0.992,
      "grad_norm": 1.2371549606323242,
      "learning_rate": 8.423180592991913e-07,
      "loss": 0.098,
      "step": 3100
    },
    {
      "epoch": 0.9952,
      "grad_norm": 2.050222635269165,
      "learning_rate": 5.053908355795148e-07,
      "loss": 0.1094,
      "step": 3110
    },
    {
      "epoch": 0.9984,
      "grad_norm": 1.5597504377365112,
      "learning_rate": 1.684636118598383e-07,
      "loss": 0.0958,
      "step": 3120
    }
  ],
  "logging_steps": 10,
  "max_steps": 3125,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}