{
  "best_metric": 0.7931236608249025,
  "best_model_checkpoint": "./models/checkpoint-1500",
  "epoch": 1.0053619302949062,
  "eval_steps": 500,
  "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003351206434316354,
      "grad_norm": 248.0,
      "learning_rate": 5e-06,
      "loss": 12.9737,
      "step": 5
    },
    {
      "epoch": 0.006702412868632708,
      "grad_norm": 205.0,
      "learning_rate": 1e-05,
      "loss": 12.2905,
      "step": 10
    },
    {
      "epoch": 0.010053619302949061,
      "grad_norm": 78.5,
      "learning_rate": 1.5e-05,
      "loss": 10.8747,
      "step": 15
    },
    {
      "epoch": 0.013404825737265416,
      "grad_norm": 45.25,
      "learning_rate": 2e-05,
      "loss": 9.7048,
      "step": 20
    },
    {
      "epoch": 0.01675603217158177,
      "grad_norm": 35.0,
      "learning_rate": 2.5e-05,
      "loss": 8.8715,
      "step": 25
    },
    {
      "epoch": 0.020107238605898123,
      "grad_norm": 30.0,
      "learning_rate": 3e-05,
      "loss": 8.1936,
      "step": 30
    },
    {
      "epoch": 0.023458445040214475,
      "grad_norm": 29.5,
      "learning_rate": 3.5e-05,
      "loss": 7.5312,
      "step": 35
    },
    {
      "epoch": 0.02680965147453083,
      "grad_norm": 35.75,
      "learning_rate": 4e-05,
      "loss": 6.9812,
      "step": 40
    },
    {
      "epoch": 0.030160857908847184,
      "grad_norm": 47.75,
      "learning_rate": 4.5e-05,
      "loss": 6.2436,
      "step": 45
    },
    {
      "epoch": 0.03351206434316354,
      "grad_norm": 25.125,
      "learning_rate": 5e-05,
      "loss": 5.7614,
      "step": 50
    },
    {
      "epoch": 0.03686327077747989,
      "grad_norm": 19.625,
      "learning_rate": 5.500000000000001e-05,
      "loss": 5.6177,
      "step": 55
    },
    {
      "epoch": 0.040214477211796246,
      "grad_norm": 13.5625,
      "learning_rate": 6e-05,
      "loss": 5.3004,
      "step": 60
    },
    {
      "epoch": 0.0435656836461126,
      "grad_norm": 21.375,
      "learning_rate": 6.500000000000001e-05,
      "loss": 4.9812,
      "step": 65
    },
    {
      "epoch": 0.04691689008042895,
      "grad_norm": 14.6875,
      "learning_rate": 7e-05,
      "loss": 4.827,
      "step": 70
    },
    {
      "epoch": 0.05026809651474531,
      "grad_norm": 15.6875,
      "learning_rate": 7.500000000000001e-05,
      "loss": 4.4671,
      "step": 75
    },
    {
      "epoch": 0.05361930294906166,
      "grad_norm": 13.4375,
      "learning_rate": 8e-05,
      "loss": 4.5079,
      "step": 80
    },
    {
      "epoch": 0.05697050938337802,
      "grad_norm": 15.8125,
      "learning_rate": 8.5e-05,
      "loss": 4.2763,
      "step": 85
    },
    {
      "epoch": 0.06032171581769437,
      "grad_norm": 18.625,
      "learning_rate": 9e-05,
      "loss": 4.1637,
      "step": 90
    },
    {
      "epoch": 0.06367292225201072,
      "grad_norm": 15.125,
      "learning_rate": 9.5e-05,
      "loss": 4.0087,
      "step": 95
    },
    {
      "epoch": 0.06702412868632708,
      "grad_norm": 12.875,
      "learning_rate": 0.0001,
      "loss": 3.9387,
      "step": 100
    },
    {
      "epoch": 0.07037533512064344,
      "grad_norm": 14.0,
      "learning_rate": 0.000105,
      "loss": 3.9187,
      "step": 105
    },
    {
      "epoch": 0.07372654155495978,
      "grad_norm": 12.0,
      "learning_rate": 0.00011000000000000002,
      "loss": 3.808,
      "step": 110
    },
    {
      "epoch": 0.07707774798927614,
      "grad_norm": 12.1875,
      "learning_rate": 0.00011499999999999999,
      "loss": 3.6024,
      "step": 115
    },
    {
      "epoch": 0.08042895442359249,
      "grad_norm": 11.4375,
      "learning_rate": 0.00012,
      "loss": 3.6723,
      "step": 120
    },
    {
      "epoch": 0.08378016085790885,
      "grad_norm": 11.5625,
      "learning_rate": 0.000125,
      "loss": 3.5476,
      "step": 125
    },
    {
      "epoch": 0.0871313672922252,
      "grad_norm": 11.0,
      "learning_rate": 0.00013000000000000002,
      "loss": 3.4838,
      "step": 130
    },
    {
      "epoch": 0.09048257372654156,
      "grad_norm": 10.375,
      "learning_rate": 0.00013500000000000003,
      "loss": 3.5531,
      "step": 135
    },
    {
      "epoch": 0.0938337801608579,
      "grad_norm": 11.75,
      "learning_rate": 0.00014,
      "loss": 3.4721,
      "step": 140
    },
    {
      "epoch": 0.09718498659517426,
      "grad_norm": 11.75,
      "learning_rate": 0.000145,
      "loss": 3.4215,
      "step": 145
    },
    {
      "epoch": 0.10053619302949061,
      "grad_norm": 11.5,
      "learning_rate": 0.00015000000000000001,
      "loss": 3.5449,
      "step": 150
    },
    {
      "epoch": 0.10388739946380697,
      "grad_norm": 19.0,
      "learning_rate": 0.000155,
      "loss": 3.3309,
      "step": 155
    },
    {
      "epoch": 0.10723860589812333,
      "grad_norm": 10.0,
      "learning_rate": 0.00016,
      "loss": 3.3066,
      "step": 160
    },
    {
      "epoch": 0.11058981233243968,
      "grad_norm": 13.25,
      "learning_rate": 0.000165,
      "loss": 3.3844,
      "step": 165
    },
    {
      "epoch": 0.11394101876675604,
      "grad_norm": 10.5625,
      "learning_rate": 0.00017,
      "loss": 3.2758,
      "step": 170
    },
    {
      "epoch": 0.11729222520107238,
      "grad_norm": 10.75,
      "learning_rate": 0.000175,
      "loss": 3.4125,
      "step": 175
    },
    {
      "epoch": 0.12064343163538874,
      "grad_norm": 10.0,
      "learning_rate": 0.00018,
      "loss": 3.0847,
      "step": 180
    },
    {
      "epoch": 0.1239946380697051,
      "grad_norm": 23.5,
      "learning_rate": 0.00018500000000000002,
      "loss": 3.1467,
      "step": 185
    },
    {
      "epoch": 0.12734584450402145,
      "grad_norm": 8.0625,
      "learning_rate": 0.00019,
      "loss": 3.1672,
      "step": 190
    },
    {
      "epoch": 0.1306970509383378,
      "grad_norm": 7.21875,
      "learning_rate": 0.000195,
      "loss": 3.2039,
      "step": 195
    },
    {
      "epoch": 0.13404825737265416,
      "grad_norm": 6.71875,
      "learning_rate": 0.0002,
      "loss": 3.2238,
      "step": 200
    },
    {
      "epoch": 0.13739946380697052,
      "grad_norm": 8.0625,
      "learning_rate": 0.00019999619230641713,
      "loss": 3.1889,
      "step": 205
    },
    {
      "epoch": 0.14075067024128687,
      "grad_norm": 13.4375,
      "learning_rate": 0.00019998476951563915,
      "loss": 3.1551,
      "step": 210
    },
    {
      "epoch": 0.14410187667560323,
      "grad_norm": 11.0,
      "learning_rate": 0.00019996573249755572,
      "loss": 3.1229,
      "step": 215
    },
    {
      "epoch": 0.14745308310991956,
      "grad_norm": 8.375,
      "learning_rate": 0.0001999390827019096,
      "loss": 3.0822,
      "step": 220
    },
    {
      "epoch": 0.15080428954423591,
      "grad_norm": 7.875,
      "learning_rate": 0.0001999048221581858,
      "loss": 2.9755,
      "step": 225
    },
    {
      "epoch": 0.15415549597855227,
      "grad_norm": 8.5,
      "learning_rate": 0.0001998629534754574,
      "loss": 3.0043,
      "step": 230
    },
    {
      "epoch": 0.15750670241286863,
      "grad_norm": 9.0625,
      "learning_rate": 0.0001998134798421867,
      "loss": 2.9992,
      "step": 235
    },
    {
      "epoch": 0.16085790884718498,
      "grad_norm": 13.4375,
      "learning_rate": 0.00019975640502598244,
      "loss": 3.0183,
      "step": 240
    },
    {
      "epoch": 0.16420911528150134,
      "grad_norm": 10.875,
      "learning_rate": 0.0001996917333733128,
      "loss": 3.004,
      "step": 245
    },
    {
      "epoch": 0.1675603217158177,
      "grad_norm": 7.75,
      "learning_rate": 0.00019961946980917456,
      "loss": 3.0863,
      "step": 250
    },
    {
      "epoch": 0.17091152815013405,
      "grad_norm": 7.71875,
      "learning_rate": 0.00019953961983671788,
      "loss": 2.7425,
      "step": 255
    },
    {
      "epoch": 0.1742627345844504,
      "grad_norm": 9.5625,
      "learning_rate": 0.00019945218953682734,
      "loss": 3.0072,
      "step": 260
    },
    {
      "epoch": 0.17761394101876676,
      "grad_norm": 7.96875,
      "learning_rate": 0.00019935718556765876,
      "loss": 2.9687,
      "step": 265
    },
    {
      "epoch": 0.18096514745308312,
      "grad_norm": 8.8125,
      "learning_rate": 0.00019925461516413223,
      "loss": 2.9539,
      "step": 270
    },
    {
      "epoch": 0.18431635388739948,
      "grad_norm": 7.46875,
      "learning_rate": 0.00019914448613738106,
      "loss": 2.8877,
      "step": 275
    },
    {
      "epoch": 0.1876675603217158,
      "grad_norm": 5.65625,
      "learning_rate": 0.00019902680687415705,
      "loss": 2.8264,
      "step": 280
    },
    {
      "epoch": 0.19101876675603216,
      "grad_norm": 6.375,
      "learning_rate": 0.0001989015863361917,
      "loss": 2.8201,
      "step": 285
    },
    {
      "epoch": 0.19436997319034852,
      "grad_norm": 6.09375,
      "learning_rate": 0.00019876883405951377,
      "loss": 2.8141,
      "step": 290
    },
    {
      "epoch": 0.19772117962466487,
      "grad_norm": 7.0,
      "learning_rate": 0.00019862856015372317,
      "loss": 2.9018,
      "step": 295
    },
    {
      "epoch": 0.20107238605898123,
      "grad_norm": 6.34375,
      "learning_rate": 0.00019848077530122083,
      "loss": 2.8597,
      "step": 300
    },
    {
      "epoch": 0.20442359249329758,
      "grad_norm": 8.9375,
      "learning_rate": 0.0001983254907563955,
      "loss": 2.8288,
      "step": 305
    },
    {
      "epoch": 0.20777479892761394,
      "grad_norm": 21.375,
      "learning_rate": 0.00019816271834476642,
      "loss": 2.8927,
      "step": 310
    },
    {
      "epoch": 0.2111260053619303,
      "grad_norm": 9.9375,
      "learning_rate": 0.00019799247046208297,
      "loss": 2.6922,
      "step": 315
    },
    {
      "epoch": 0.21447721179624665,
      "grad_norm": 6.21875,
      "learning_rate": 0.00019781476007338058,
      "loss": 2.9288,
      "step": 320
    },
    {
      "epoch": 0.217828418230563,
      "grad_norm": 4.84375,
      "learning_rate": 0.00019762960071199333,
      "loss": 2.7749,
      "step": 325
    },
    {
      "epoch": 0.22117962466487937,
      "grad_norm": 6.28125,
      "learning_rate": 0.00019743700647852354,
      "loss": 2.725,
      "step": 330
    },
    {
      "epoch": 0.22453083109919572,
      "grad_norm": 11.8125,
      "learning_rate": 0.00019723699203976766,
      "loss": 2.9425,
      "step": 335
    },
    {
      "epoch": 0.22788203753351208,
      "grad_norm": 6.9375,
      "learning_rate": 0.00019702957262759965,
      "loss": 2.744,
      "step": 340
    },
    {
      "epoch": 0.2312332439678284,
      "grad_norm": 5.0625,
      "learning_rate": 0.0001968147640378108,
      "loss": 2.6849,
      "step": 345
    },
    {
      "epoch": 0.23458445040214476,
      "grad_norm": 5.9375,
      "learning_rate": 0.00019659258262890683,
      "loss": 2.9359,
      "step": 350
    },
    {
      "epoch": 0.23793565683646112,
      "grad_norm": 8.0,
      "learning_rate": 0.0001963630453208623,
      "loss": 2.7998,
      "step": 355
    },
    {
      "epoch": 0.24128686327077747,
      "grad_norm": 4.4375,
      "learning_rate": 0.0001961261695938319,
      "loss": 2.6601,
      "step": 360
    },
    {
      "epoch": 0.24463806970509383,
      "grad_norm": 5.1875,
      "learning_rate": 0.0001958819734868193,
      "loss": 2.6936,
      "step": 365
    },
    {
      "epoch": 0.2479892761394102,
      "grad_norm": 5.25,
      "learning_rate": 0.00019563047559630357,
      "loss": 2.7705,
      "step": 370
    },
    {
      "epoch": 0.25134048257372654,
      "grad_norm": 5.96875,
      "learning_rate": 0.0001953716950748227,
      "loss": 2.8504,
      "step": 375
    },
    {
      "epoch": 0.2546916890080429,
      "grad_norm": 5.78125,
      "learning_rate": 0.00019510565162951537,
      "loss": 2.5577,
      "step": 380
    },
    {
      "epoch": 0.25804289544235925,
      "grad_norm": 5.5,
      "learning_rate": 0.00019483236552061994,
      "loss": 2.7717,
      "step": 385
    },
    {
      "epoch": 0.2613941018766756,
      "grad_norm": 4.8125,
      "learning_rate": 0.0001945518575599317,
      "loss": 2.6139,
      "step": 390
    },
    {
      "epoch": 0.26474530831099197,
      "grad_norm": 60.75,
      "learning_rate": 0.00019426414910921787,
      "loss": 2.7684,
      "step": 395
    },
    {
      "epoch": 0.2680965147453083,
      "grad_norm": 4.53125,
      "learning_rate": 0.00019396926207859084,
      "loss": 2.6009,
      "step": 400
    },
    {
      "epoch": 0.2714477211796247,
      "grad_norm": 5.28125,
      "learning_rate": 0.00019366721892483978,
      "loss": 2.6487,
      "step": 405
    },
    {
      "epoch": 0.27479892761394104,
      "grad_norm": 4.625,
      "learning_rate": 0.00019335804264972018,
      "loss": 2.5787,
      "step": 410
    },
    {
      "epoch": 0.2781501340482574,
      "grad_norm": 4.78125,
      "learning_rate": 0.00019304175679820247,
      "loss": 2.5201,
      "step": 415
    },
    {
      "epoch": 0.28150134048257375,
      "grad_norm": 4.5,
      "learning_rate": 0.00019271838545667876,
      "loss": 2.575,
      "step": 420
    },
    {
      "epoch": 0.2848525469168901,
      "grad_norm": 10.4375,
      "learning_rate": 0.0001923879532511287,
      "loss": 2.4937,
      "step": 425
    },
    {
      "epoch": 0.28820375335120646,
      "grad_norm": 14.5625,
      "learning_rate": 0.00019205048534524406,
      "loss": 2.6559,
      "step": 430
    },
    {
      "epoch": 0.2915549597855228,
      "grad_norm": 4.28125,
      "learning_rate": 0.0001917060074385124,
      "loss": 2.529,
      "step": 435
    },
    {
      "epoch": 0.2949061662198391,
      "grad_norm": 4.125,
      "learning_rate": 0.0001913545457642601,
      "loss": 2.6327,
      "step": 440
    },
    {
      "epoch": 0.2982573726541555,
      "grad_norm": 4.96875,
      "learning_rate": 0.00019099612708765434,
      "loss": 2.6676,
      "step": 445
    },
    {
      "epoch": 0.30160857908847183,
      "grad_norm": 4.84375,
      "learning_rate": 0.000190630778703665,
      "loss": 2.531,
      "step": 450
    },
    {
      "epoch": 0.3049597855227882,
      "grad_norm": 4.40625,
      "learning_rate": 0.00019025852843498607,
      "loss": 2.586,
      "step": 455
    },
    {
      "epoch": 0.30831099195710454,
      "grad_norm": 4.1875,
      "learning_rate": 0.0001898794046299167,
      "loss": 2.6193,
      "step": 460
    },
    {
      "epoch": 0.3116621983914209,
      "grad_norm": 4.375,
      "learning_rate": 0.00018949343616020252,
      "loss": 2.5805,
      "step": 465
    },
    {
      "epoch": 0.31501340482573725,
      "grad_norm": 4.1875,
      "learning_rate": 0.0001891006524188368,
      "loss": 2.6252,
      "step": 470
    },
    {
      "epoch": 0.3183646112600536,
      "grad_norm": 3.9375,
      "learning_rate": 0.00018870108331782217,
      "loss": 2.5582,
      "step": 475
    },
    {
      "epoch": 0.32171581769436997,
      "grad_norm": 4.90625,
      "learning_rate": 0.00018829475928589271,
      "loss": 2.6244,
      "step": 480
    },
    {
      "epoch": 0.3250670241286863,
      "grad_norm": 31.375,
      "learning_rate": 0.00018788171126619653,
      "loss": 2.651,
      "step": 485
    },
    {
      "epoch": 0.3284182305630027,
      "grad_norm": 4.5,
      "learning_rate": 0.00018746197071393958,
      "loss": 2.5153,
      "step": 490
    },
    {
      "epoch": 0.33176943699731903,
      "grad_norm": 4.15625,
      "learning_rate": 0.00018703556959398998,
      "loss": 2.6416,
      "step": 495
    },
    {
      "epoch": 0.3351206434316354,
      "grad_norm": 4.90625,
      "learning_rate": 0.00018660254037844388,
      "loss": 2.5965,
      "step": 500
    },
    {
      "epoch": 0.3351206434316354,
      "eval_128_ap": 0.6946773245581791,
      "eval_128_auc": 0.9680845837539295,
      "eval_128_loss": 2.086740016937256,
      "eval_128_runtime": 19.4102,
      "eval_128_samples_per_second": 20.608,
      "eval_128_steps_per_second": 5.152,
      "step": 500
    },
    {
      "epoch": 0.33847184986595175,
      "grad_norm": 4.4375,
      "learning_rate": 0.00018616291604415258,
      "loss": 2.435,
      "step": 505
    },
    {
      "epoch": 0.3418230563002681,
      "grad_norm": 4.875,
      "learning_rate": 0.00018571673007021123,
      "loss": 2.5466,
      "step": 510
    },
    {
      "epoch": 0.34517426273458446,
      "grad_norm": 3.875,
      "learning_rate": 0.00018526401643540922,
      "loss": 2.6,
      "step": 515
    },
    {
      "epoch": 0.3485254691689008,
      "grad_norm": 6.34375,
      "learning_rate": 0.0001848048096156426,
      "loss": 2.4788,
      "step": 520
    },
    {
      "epoch": 0.35187667560321717,
      "grad_norm": 4.0625,
      "learning_rate": 0.0001843391445812886,
      "loss": 2.4897,
      "step": 525
    },
    {
      "epoch": 0.3552278820375335,
      "grad_norm": 3.84375,
      "learning_rate": 0.00018386705679454242,
      "loss": 2.3744,
      "step": 530
    },
    {
      "epoch": 0.3585790884718499,
      "grad_norm": 3.5625,
      "learning_rate": 0.00018338858220671682,
      "loss": 2.5395,
      "step": 535
    },
    {
      "epoch": 0.36193029490616624,
      "grad_norm": 4.28125,
      "learning_rate": 0.00018290375725550417,
      "loss": 2.4539,
      "step": 540
    },
    {
      "epoch": 0.3652815013404826,
      "grad_norm": 4.03125,
      "learning_rate": 0.00018241261886220154,
      "loss": 2.4642,
      "step": 545
    },
    {
      "epoch": 0.36863270777479895,
      "grad_norm": 5.03125,
      "learning_rate": 0.0001819152044288992,
      "loss": 2.5463,
      "step": 550
    },
    {
      "epoch": 0.3719839142091153,
      "grad_norm": 3.921875,
      "learning_rate": 0.00018141155183563193,
      "loss": 2.4316,
      "step": 555
    },
    {
      "epoch": 0.3753351206434316,
      "grad_norm": 6.3125,
      "learning_rate": 0.00018090169943749476,
      "loss": 2.4111,
      "step": 560
    },
    {
      "epoch": 0.37868632707774796,
      "grad_norm": 4.84375,
      "learning_rate": 0.00018038568606172173,
      "loss": 2.5699,
      "step": 565
    },
    {
      "epoch": 0.3820375335120643,
      "grad_norm": 4.40625,
      "learning_rate": 0.00017986355100472928,
      "loss": 2.3959,
      "step": 570
    },
    {
      "epoch": 0.3853887399463807,
      "grad_norm": 4.34375,
      "learning_rate": 0.00017933533402912354,
      "loss": 2.4525,
      "step": 575
    },
    {
      "epoch": 0.38873994638069703,
      "grad_norm": 4.375,
      "learning_rate": 0.00017880107536067218,
      "loss": 2.5208,
      "step": 580
    },
    {
      "epoch": 0.3920911528150134,
      "grad_norm": 4.5625,
      "learning_rate": 0.0001782608156852414,
      "loss": 2.482,
      "step": 585
    },
    {
      "epoch": 0.39544235924932974,
      "grad_norm": 4.375,
      "learning_rate": 0.0001777145961456971,
      "loss": 2.4212,
      "step": 590
    },
    {
      "epoch": 0.3987935656836461,
      "grad_norm": 4.4375,
      "learning_rate": 0.00017716245833877201,
      "loss": 2.4238,
      "step": 595
    },
    {
      "epoch": 0.40214477211796246,
      "grad_norm": 4.90625,
      "learning_rate": 0.0001766044443118978,
      "loss": 2.4752,
      "step": 600
    },
    {
      "epoch": 0.4054959785522788,
      "grad_norm": 4.15625,
      "learning_rate": 0.0001760405965600031,
      "loss": 2.4592,
      "step": 605
    },
    {
      "epoch": 0.40884718498659517,
      "grad_norm": 4.0625,
      "learning_rate": 0.00017547095802227723,
      "loss": 2.3419,
      "step": 610
    },
    {
      "epoch": 0.4121983914209115,
      "grad_norm": 3.75,
      "learning_rate": 0.00017489557207890023,
      "loss": 2.4173,
      "step": 615
    },
    {
      "epoch": 0.4155495978552279,
      "grad_norm": 4.3125,
      "learning_rate": 0.00017431448254773944,
      "loss": 2.4782,
      "step": 620
    },
    {
      "epoch": 0.41890080428954424,
      "grad_norm": 4.96875,
      "learning_rate": 0.0001737277336810124,
      "loss": 2.4608,
      "step": 625
    },
    {
      "epoch": 0.4222520107238606,
      "grad_norm": 3.453125,
      "learning_rate": 0.00017313537016191706,
      "loss": 2.3687,
      "step": 630
    },
    {
      "epoch": 0.42560321715817695,
      "grad_norm": 4.0,
      "learning_rate": 0.00017253743710122878,
      "loss": 2.5249,
      "step": 635
    },
    {
      "epoch": 0.4289544235924933,
      "grad_norm": 3.65625,
      "learning_rate": 0.0001719339800338651,
      "loss": 2.3771,
      "step": 640
    },
    {
      "epoch": 0.43230563002680966,
      "grad_norm": 3.90625,
      "learning_rate": 0.00017132504491541818,
      "loss": 2.5041,
      "step": 645
    },
    {
      "epoch": 0.435656836461126,
      "grad_norm": 3.53125,
      "learning_rate": 0.00017071067811865476,
      "loss": 2.2476,
      "step": 650
    },
    {
      "epoch": 0.4390080428954424,
      "grad_norm": 4.3125,
      "learning_rate": 0.0001700909264299851,
      "loss": 2.4838,
      "step": 655
    },
    {
      "epoch": 0.44235924932975873,
      "grad_norm": 3.421875,
      "learning_rate": 0.00016946583704589973,
      "loss": 2.3025,
      "step": 660
    },
    {
      "epoch": 0.4457104557640751,
      "grad_norm": 6.125,
      "learning_rate": 0.0001688354575693754,
      "loss": 2.3728,
      "step": 665
    },
    {
      "epoch": 0.44906166219839144,
      "grad_norm": 3.578125,
      "learning_rate": 0.00016819983600624986,
      "loss": 2.3742,
      "step": 670
    },
    {
      "epoch": 0.4524128686327078,
      "grad_norm": 3.8125,
      "learning_rate": 0.00016755902076156604,
      "loss": 2.5019,
      "step": 675
    },
    {
      "epoch": 0.45576407506702415,
      "grad_norm": 6.09375,
      "learning_rate": 0.00016691306063588583,
      "loss": 2.4063,
      "step": 680
    },
    {
      "epoch": 0.45911528150134046,
      "grad_norm": 3.84375,
      "learning_rate": 0.00016626200482157378,
      "loss": 2.3548,
      "step": 685
    },
    {
      "epoch": 0.4624664879356568,
      "grad_norm": 4.375,
      "learning_rate": 0.00016560590289905073,
      "loss": 2.3674,
      "step": 690
    },
    {
      "epoch": 0.46581769436997317,
      "grad_norm": 3.859375,
      "learning_rate": 0.00016494480483301836,
      "loss": 2.3071,
      "step": 695
    },
    {
      "epoch": 0.4691689008042895,
      "grad_norm": 3.71875,
      "learning_rate": 0.00016427876096865394,
      "loss": 2.3699,
      "step": 700
    },
    {
      "epoch": 0.4725201072386059,
      "grad_norm": 3.859375,
      "learning_rate": 0.0001636078220277764,
      "loss": 2.2788,
      "step": 705
    },
    {
      "epoch": 0.47587131367292224,
      "grad_norm": 3.546875,
      "learning_rate": 0.00016293203910498376,
      "loss": 2.2637,
      "step": 710
    },
    {
      "epoch": 0.4792225201072386,
      "grad_norm": 3.65625,
      "learning_rate": 0.00016225146366376198,
      "loss": 2.3791,
      "step": 715
    },
    {
      "epoch": 0.48257372654155495,
      "grad_norm": 3.46875,
      "learning_rate": 0.00016156614753256584,
      "loss": 2.2704,
      "step": 720
    },
    {
      "epoch": 0.4859249329758713,
      "grad_norm": 6.21875,
      "learning_rate": 0.00016087614290087208,
      "loss": 2.3598,
      "step": 725
    },
    {
      "epoch": 0.48927613941018766,
      "grad_norm": 3.671875,
      "learning_rate": 0.00016018150231520483,
      "loss": 2.4334,
      "step": 730
    },
    {
      "epoch": 0.492627345844504,
      "grad_norm": 3.65625,
      "learning_rate": 0.00015948227867513415,
      "loss": 2.434,
      "step": 735
    },
    {
      "epoch": 0.4959785522788204,
      "grad_norm": 3.640625,
      "learning_rate": 0.00015877852522924732,
      "loss": 2.3569,
      "step": 740
    },
    {
      "epoch": 0.49932975871313673,
      "grad_norm": 3.6875,
      "learning_rate": 0.00015807029557109398,
      "loss": 2.3909,
      "step": 745
    },
    {
      "epoch": 0.5026809651474531,
      "grad_norm": 3.765625,
      "learning_rate": 0.0001573576436351046,
      "loss": 2.3255,
      "step": 750
    },
    {
      "epoch": 0.5060321715817694,
      "grad_norm": 3.421875,
      "learning_rate": 0.00015664062369248328,
      "loss": 2.2162,
      "step": 755
    },
    {
      "epoch": 0.5093833780160858,
      "grad_norm": 4.4375,
      "learning_rate": 0.0001559192903470747,
      "loss": 2.2311,
      "step": 760
    },
    {
      "epoch": 0.5127345844504021,
      "grad_norm": 12.0625,
      "learning_rate": 0.0001551936985312058,
      "loss": 2.2553,
      "step": 765
    },
    {
      "epoch": 0.5160857908847185,
      "grad_norm": 3.90625,
      "learning_rate": 0.0001544639035015027,
      "loss": 2.3287,
      "step": 770
    },
    {
      "epoch": 0.5194369973190348,
      "grad_norm": 3.484375,
      "learning_rate": 0.0001537299608346824,
      "loss": 2.3142,
      "step": 775
    },
    {
      "epoch": 0.5227882037533512,
      "grad_norm": 3.375,
      "learning_rate": 0.0001529919264233205,
      "loss": 2.3647,
      "step": 780
    },
    {
      "epoch": 0.5261394101876675,
      "grad_norm": 4.1875,
      "learning_rate": 0.0001522498564715949,
      "loss": 2.2755,
      "step": 785
    },
    {
      "epoch": 0.5294906166219839,
      "grad_norm": 3.84375,
      "learning_rate": 0.00015150380749100545,
      "loss": 2.1969,
      "step": 790
    },
    {
      "epoch": 0.5328418230563002,
      "grad_norm": 3.46875,
      "learning_rate": 0.00015075383629607042,
      "loss": 2.3177,
      "step": 795
    },
    {
      "epoch": 0.5361930294906166,
      "grad_norm": 3.875,
      "learning_rate": 0.00015000000000000001,
      "loss": 2.2585,
      "step": 800
    },
    {
      "epoch": 0.539544235924933,
      "grad_norm": 3.8125,
      "learning_rate": 0.00014924235601034672,
      "loss": 2.3202,
      "step": 805
    },
    {
      "epoch": 0.5428954423592494,
      "grad_norm": 3.65625,
      "learning_rate": 0.00014848096202463372,
      "loss": 2.2391,
      "step": 810
    },
    {
      "epoch": 0.5462466487935657,
      "grad_norm": 4.15625,
      "learning_rate": 0.00014771587602596084,
      "loss": 2.1343,
      "step": 815
    },
    {
      "epoch": 0.5495978552278821,
      "grad_norm": 3.203125,
      "learning_rate": 0.00014694715627858908,
      "loss": 2.2725,
      "step": 820
    },
    {
      "epoch": 0.5529490616621984,
      "grad_norm": 3.4375,
      "learning_rate": 0.00014617486132350343,
      "loss": 2.1359,
      "step": 825
    },
    {
      "epoch": 0.5563002680965148,
      "grad_norm": 3.4375,
      "learning_rate": 0.00014539904997395468,
      "loss": 2.081,
      "step": 830
    },
    {
      "epoch": 0.5596514745308311,
      "grad_norm": 3.15625,
      "learning_rate": 0.00014461978131098088,
      "loss": 2.1051,
      "step": 835
    },
    {
      "epoch": 0.5630026809651475,
      "grad_norm": 3.328125,
      "learning_rate": 0.00014383711467890774,
      "loss": 2.1229,
      "step": 840
    },
    {
      "epoch": 0.5663538873994638,
      "grad_norm": 4.09375,
      "learning_rate": 0.00014305110968082952,
      "loss": 2.2683,
      "step": 845
    },
    {
      "epoch": 0.5697050938337802,
      "grad_norm": 3.484375,
      "learning_rate": 0.00014226182617406996,
      "loss": 2.1467,
      "step": 850
    },
    {
      "epoch": 0.5730563002680965,
      "grad_norm": 3.640625,
      "learning_rate": 0.00014146932426562392,
      "loss": 2.1038,
      "step": 855
    },
    {
      "epoch": 0.5764075067024129,
      "grad_norm": 3.390625,
      "learning_rate": 0.00014067366430758004,
      "loss": 2.3725,
      "step": 860
    },
    {
      "epoch": 0.5797587131367292,
      "grad_norm": 5.125,
      "learning_rate": 0.00013987490689252463,
      "loss": 2.1452,
      "step": 865
    },
    {
      "epoch": 0.5831099195710456,
      "grad_norm": 3.421875,
      "learning_rate": 0.00013907311284892736,
      "loss": 2.1279,
      "step": 870
    },
    {
      "epoch": 0.5864611260053619,
      "grad_norm": 3.265625,
      "learning_rate": 0.000138268343236509,
      "loss": 2.2028,
      "step": 875
    },
    {
      "epoch": 0.5898123324396782,
      "grad_norm": 3.546875,
      "learning_rate": 0.00013746065934159123,
      "loss": 2.2621,
      "step": 880
    },
    {
      "epoch": 0.5931635388739946,
      "grad_norm": 3.40625,
      "learning_rate": 0.00013665012267242974,
      "loss": 2.3162,
      "step": 885
    },
    {
      "epoch": 0.596514745308311,
      "grad_norm": 3.3125,
      "learning_rate": 0.00013583679495453,
      "loss": 2.2635,
      "step": 890
    },
    {
      "epoch": 0.5998659517426274,
      "grad_norm": 3.65625,
      "learning_rate": 0.00013502073812594675,
      "loss": 2.2185,
      "step": 895
    },
    {
      "epoch": 0.6032171581769437,
      "grad_norm": 3.40625,
      "learning_rate": 0.00013420201433256689,
      "loss": 2.1357,
      "step": 900
    },
    {
      "epoch": 0.6065683646112601,
      "grad_norm": 3.875,
      "learning_rate": 0.0001333806859233771,
      "loss": 2.1906,
      "step": 905
    },
    {
      "epoch": 0.6099195710455764,
      "grad_norm": 3.578125,
      "learning_rate": 0.00013255681544571568,
      "loss": 2.1371,
      "step": 910
    },
    {
      "epoch": 0.6132707774798928,
      "grad_norm": 3.328125,
      "learning_rate": 0.00013173046564050924,
      "loss": 2.1377,
      "step": 915
    },
    {
      "epoch": 0.6166219839142091,
      "grad_norm": 3.078125,
      "learning_rate": 0.00013090169943749476,
      "loss": 2.1863,
      "step": 920
    },
    {
      "epoch": 0.6199731903485255,
      "grad_norm": 3.5625,
      "learning_rate": 0.00013007057995042732,
      "loss": 2.1153,
      "step": 925
    },
    {
      "epoch": 0.6233243967828418,
      "grad_norm": 3.125,
      "learning_rate": 0.00012923717047227368,
      "loss": 2.1994,
      "step": 930
    },
    {
      "epoch": 0.6266756032171582,
      "grad_norm": 3.203125,
      "learning_rate": 0.00012840153447039228,
      "loss": 2.205,
      "step": 935
    },
    {
      "epoch": 0.6300268096514745,
      "grad_norm": 3.46875,
      "learning_rate": 0.0001275637355816999,
      "loss": 2.1964,
      "step": 940
    },
    {
      "epoch": 0.6333780160857909,
      "grad_norm": 3.625,
      "learning_rate": 0.00012672383760782568,
      "loss": 2.1978,
      "step": 945
    },
    {
      "epoch": 0.6367292225201072,
      "grad_norm": 3.171875,
      "learning_rate": 0.00012588190451025207,
      "loss": 2.1862,
      "step": 950
    },
    {
      "epoch": 0.6400804289544236,
      "grad_norm": 3.296875,
      "learning_rate": 0.00012503800040544416,
      "loss": 2.1544,
      "step": 955
    },
    {
      "epoch": 0.6434316353887399,
      "grad_norm": 3.46875,
      "learning_rate": 0.00012419218955996676,
      "loss": 2.1247,
      "step": 960
    },
    {
      "epoch": 0.6467828418230563,
      "grad_norm": 3.0625,
      "learning_rate": 0.00012334453638559057,
      "loss": 2.132,
      "step": 965
    },
    {
      "epoch": 0.6501340482573726,
      "grad_norm": 3.5,
      "learning_rate": 0.0001224951054343865,
      "loss": 2.0192,
      "step": 970
    },
    {
      "epoch": 0.653485254691689,
      "grad_norm": 3.359375,
      "learning_rate": 0.00012164396139381029,
      "loss": 2.0863,
      "step": 975
    },
    {
      "epoch": 0.6568364611260054,
      "grad_norm": 3.234375,
      "learning_rate": 0.00012079116908177593,
      "loss": 2.162,
      "step": 980
    },
    {
      "epoch": 0.6601876675603218,
      "grad_norm": 3.4375,
      "learning_rate": 0.00011993679344171973,
      "loss": 2.2546,
      "step": 985
    },
    {
      "epoch": 0.6635388739946381,
      "grad_norm": 3.640625,
      "learning_rate": 0.00011908089953765449,
      "loss": 2.1244,
      "step": 990
    },
    {
      "epoch": 0.6668900804289544,
      "grad_norm": 3.515625,
      "learning_rate": 0.00011822355254921478,
      "loss": 2.1339,
      "step": 995
    },
    {
      "epoch": 0.6702412868632708,
      "grad_norm": 5.46875,
      "learning_rate": 0.00011736481776669306,
      "loss": 2.1744,
      "step": 1000
    },
    {
      "epoch": 0.6702412868632708,
      "eval_128_ap": 0.7636418814117207,
      "eval_128_auc": 0.9774185615872945,
      "eval_128_loss": 1.7800103425979614,
      "eval_128_runtime": 21.8273,
      "eval_128_samples_per_second": 18.326,
      "eval_128_steps_per_second": 4.581,
      "step": 1000
    },
    {
      "epoch": 0.6735924932975871,
      "grad_norm": 3.359375,
      "learning_rate": 0.00011650476058606777,
      "loss": 1.9784,
      "step": 1005
    },
    {
      "epoch": 0.6769436997319035,
      "grad_norm": 3.0625,
      "learning_rate": 0.0001156434465040231,
      "loss": 1.9891,
      "step": 1010
    },
    {
      "epoch": 0.6802949061662198,
      "grad_norm": 3.8125,
      "learning_rate": 0.00011478094111296109,
      "loss": 2.0137,
      "step": 1015
    },
    {
      "epoch": 0.6836461126005362,
      "grad_norm": 3.5625,
      "learning_rate": 0.00011391731009600654,
      "loss": 2.0556,
      "step": 1020
    },
    {
      "epoch": 0.6869973190348525,
      "grad_norm": 3.140625,
      "learning_rate": 0.00011305261922200519,
      "loss": 2.0577,
      "step": 1025
    },
    {
      "epoch": 0.6903485254691689,
      "grad_norm": 3.1875,
      "learning_rate": 0.00011218693434051475,
      "loss": 2.0269,
      "step": 1030
    },
    {
      "epoch": 0.6936997319034852,
      "grad_norm": 3.28125,
      "learning_rate": 0.0001113203213767907,
      "loss": 2.0982,
      "step": 1035
    },
    {
      "epoch": 0.6970509383378016,
      "grad_norm": 3.28125,
      "learning_rate": 0.00011045284632676536,
      "loss": 2.1156,
      "step": 1040
    },
    {
      "epoch": 0.7004021447721179,
      "grad_norm": 3.140625,
      "learning_rate": 0.00010958457525202241,
      "loss": 2.0988,
      "step": 1045
    },
    {
      "epoch": 0.7037533512064343,
      "grad_norm": 3.40625,
      "learning_rate": 0.00010871557427476583,
      "loss": 2.0687,
      "step": 1050
    },
    {
      "epoch": 0.7071045576407506,
      "grad_norm": 3.359375,
      "learning_rate": 0.0001078459095727845,
      "loss": 2.0488,
      "step": 1055
    },
    {
      "epoch": 0.710455764075067,
      "grad_norm": 3.390625,
      "learning_rate": 0.00010697564737441252,
      "loss": 2.1562,
      "step": 1060
    },
    {
      "epoch": 0.7138069705093834,
      "grad_norm": 2.96875,
      "learning_rate": 0.00010610485395348571,
      "loss": 2.0119,
      "step": 1065
    },
    {
      "epoch": 0.7171581769436998,
      "grad_norm": 3.328125,
      "learning_rate": 0.0001052335956242944,
      "loss": 2.0686,
      "step": 1070
    },
    {
      "epoch": 0.7205093833780161,
      "grad_norm": 3.65625,
      "learning_rate": 0.00010436193873653361,
      "loss": 2.0053,
      "step": 1075
    },
    {
      "epoch": 0.7238605898123325,
      "grad_norm": 3.15625,
      "learning_rate": 0.00010348994967025012,
      "loss": 2.0279,
      "step": 1080
    },
    {
      "epoch": 0.7272117962466488,
      "grad_norm": 3.3125,
      "learning_rate": 0.00010261769483078733,
      "loss": 2.2239,
      "step": 1085
    },
    {
      "epoch": 0.7305630026809652,
      "grad_norm": 3.1875,
      "learning_rate": 0.00010174524064372837,
      "loss": 2.0663,
      "step": 1090
    },
    {
      "epoch": 0.7339142091152815,
      "grad_norm": 3.25,
      "learning_rate": 0.0001008726535498374,
      "loss": 2.1292,
      "step": 1095
    },
    {
      "epoch": 0.7372654155495979,
      "grad_norm": 2.984375,
      "learning_rate": 0.0001,
      "loss": 2.0677,
      "step": 1100
    },
    {
      "epoch": 0.7406166219839142,
      "grad_norm": 3.109375,
      "learning_rate": 9.912734645016263e-05,
      "loss": 1.9551,
      "step": 1105
    },
    {
      "epoch": 0.7439678284182306,
      "grad_norm": 4.34375,
      "learning_rate": 9.825475935627165e-05,
      "loss": 2.0803,
      "step": 1110
    },
    {
      "epoch": 0.7473190348525469,
      "grad_norm": 3.25,
      "learning_rate": 9.73823051692127e-05,
      "loss": 2.114,
      "step": 1115
    },
    {
      "epoch": 0.7506702412868632,
      "grad_norm": 3.0625,
      "learning_rate": 9.651005032974994e-05,
      "loss": 2.0298,
      "step": 1120
    },
    {
      "epoch": 0.7540214477211796,
      "grad_norm": 3.203125,
      "learning_rate": 9.563806126346642e-05,
      "loss": 2.15,
      "step": 1125
    },
    {
      "epoch": 0.7573726541554959,
      "grad_norm": 3.359375,
      "learning_rate": 9.476640437570562e-05,
      "loss": 2.0435,
      "step": 1130
    },
    {
      "epoch": 0.7607238605898123,
      "grad_norm": 3.140625,
      "learning_rate": 9.38951460465143e-05,
      "loss": 2.0872,
      "step": 1135
    },
    {
      "epoch": 0.7640750670241286,
      "grad_norm": 3.421875,
      "learning_rate": 9.302435262558747e-05,
      "loss": 1.981,
      "step": 1140
    },
    {
      "epoch": 0.767426273458445,
      "grad_norm": 2.796875,
      "learning_rate": 9.215409042721552e-05,
      "loss": 1.9386,
      "step": 1145
    },
    {
      "epoch": 0.7707774798927614,
      "grad_norm": 3.34375,
      "learning_rate": 9.128442572523417e-05,
      "loss": 2.0754,
      "step": 1150
    },
    {
      "epoch": 0.7741286863270778,
      "grad_norm": 3.234375,
      "learning_rate": 9.04154247479776e-05,
      "loss": 2.0129,
      "step": 1155
    },
    {
      "epoch": 0.7774798927613941,
      "grad_norm": 3.328125,
      "learning_rate": 8.954715367323468e-05,
      "loss": 2.099,
      "step": 1160
    },
    {
      "epoch": 0.7808310991957105,
      "grad_norm": 3.171875,
      "learning_rate": 8.867967862320934e-05,
      "loss": 2.055,
      "step": 1165
    },
    {
      "epoch": 0.7841823056300268,
      "grad_norm": 3.109375,
      "learning_rate": 8.781306565948528e-05,
      "loss": 1.9264,
      "step": 1170
    },
    {
      "epoch": 0.7875335120643432,
      "grad_norm": 3.359375,
      "learning_rate": 8.694738077799488e-05,
      "loss": 1.9382,
      "step": 1175
    },
    {
      "epoch": 0.7908847184986595,
      "grad_norm": 3.5625,
      "learning_rate": 8.608268990399349e-05,
      "loss": 1.9913,
      "step": 1180
    },
    {
      "epoch": 0.7942359249329759,
      "grad_norm": 3.390625,
      "learning_rate": 8.521905888703893e-05,
      "loss": 2.2202,
      "step": 1185
    },
    {
      "epoch": 0.7975871313672922,
      "grad_norm": 3.296875,
      "learning_rate": 8.435655349597689e-05,
      "loss": 1.9226,
      "step": 1190
    },
    {
      "epoch": 0.8009383378016086,
      "grad_norm": 3.375,
      "learning_rate": 8.349523941393224e-05,
      "loss": 1.9842,
      "step": 1195
    },
    {
      "epoch": 0.8042895442359249,
      "grad_norm": 3.109375,
      "learning_rate": 8.263518223330697e-05,
      "loss": 2.0414,
      "step": 1200
    },
    {
      "epoch": 0.8076407506702413,
      "grad_norm": 3.375,
      "learning_rate": 8.177644745078526e-05,
      "loss": 1.9747,
      "step": 1205
    },
    {
      "epoch": 0.8109919571045576,
      "grad_norm": 3.1875,
      "learning_rate": 8.091910046234552e-05,
      "loss": 2.1483,
      "step": 1210
    },
    {
      "epoch": 0.814343163538874,
      "grad_norm": 3.046875,
      "learning_rate": 8.00632065582803e-05,
      "loss": 2.003,
      "step": 1215
    },
    {
      "epoch": 0.8176943699731903,
      "grad_norm": 3.4375,
      "learning_rate": 7.920883091822408e-05,
      "loss": 1.9795,
      "step": 1220
    },
    {
      "epoch": 0.8210455764075067,
      "grad_norm": 4.46875,
      "learning_rate": 7.835603860618972e-05,
      "loss": 1.9079,
      "step": 1225
    },
    {
      "epoch": 0.824396782841823,
      "grad_norm": 3.046875,
      "learning_rate": 7.750489456561352e-05,
      "loss": 1.9086,
      "step": 1230
    },
    {
      "epoch": 0.8277479892761395,
      "grad_norm": 3.1875,
      "learning_rate": 7.66554636144095e-05,
      "loss": 2.138,
      "step": 1235
    },
    {
      "epoch": 0.8310991957104558,
      "grad_norm": 3.125,
      "learning_rate": 7.580781044003324e-05,
      "loss": 1.9724,
      "step": 1240
    },
    {
      "epoch": 0.8344504021447721,
      "grad_norm": 3.265625,
      "learning_rate": 7.496199959455584e-05,
      "loss": 2.0067,
      "step": 1245
    },
    {
      "epoch": 0.8378016085790885,
      "grad_norm": 3.359375,
      "learning_rate": 7.411809548974792e-05,
      "loss": 1.9761,
      "step": 1250
    },
    {
      "epoch": 0.8411528150134048,
      "grad_norm": 3.265625,
      "learning_rate": 7.327616239217431e-05,
      "loss": 1.9118,
      "step": 1255
    },
    {
      "epoch": 0.8445040214477212,
      "grad_norm": 2.96875,
      "learning_rate": 7.243626441830009e-05,
      "loss": 1.992,
      "step": 1260
    },
    {
      "epoch": 0.8478552278820375,
      "grad_norm": 3.109375,
      "learning_rate": 7.159846552960774e-05,
      "loss": 1.9095,
      "step": 1265
    },
    {
      "epoch": 0.8512064343163539,
      "grad_norm": 3.0625,
      "learning_rate": 7.076282952772633e-05,
      "loss": 1.9637,
      "step": 1270
    },
    {
      "epoch": 0.8545576407506702,
      "grad_norm": 3.265625,
      "learning_rate": 6.992942004957271e-05,
      "loss": 1.9976,
      "step": 1275
    },
    {
      "epoch": 0.8579088471849866,
      "grad_norm": 3.203125,
      "learning_rate": 6.909830056250527e-05,
      "loss": 2.0146,
      "step": 1280
    },
    {
      "epoch": 0.8612600536193029,
      "grad_norm": 4.03125,
      "learning_rate": 6.826953435949081e-05,
      "loss": 1.9493,
      "step": 1285
    },
    {
      "epoch": 0.8646112600536193,
      "grad_norm": 3.25,
      "learning_rate": 6.744318455428436e-05,
      "loss": 2.0348,
      "step": 1290
    },
    {
      "epoch": 0.8679624664879356,
      "grad_norm": 3.09375,
      "learning_rate": 6.661931407662292e-05,
      "loss": 1.9574,
      "step": 1295
    },
    {
      "epoch": 0.871313672922252,
      "grad_norm": 3.234375,
      "learning_rate": 6.579798566743314e-05,
      "loss": 2.023,
      "step": 1300
    },
    {
      "epoch": 0.8746648793565683,
      "grad_norm": 3.109375,
      "learning_rate": 6.497926187405326e-05,
      "loss": 1.9266,
      "step": 1305
    },
    {
      "epoch": 0.8780160857908847,
      "grad_norm": 3.078125,
      "learning_rate": 6.416320504546997e-05,
      "loss": 1.9451,
      "step": 1310
    },
    {
      "epoch": 0.881367292225201,
      "grad_norm": 3.296875,
      "learning_rate": 6.334987732757029e-05,
      "loss": 2.0478,
      "step": 1315
    },
    {
      "epoch": 0.8847184986595175,
      "grad_norm": 3.15625,
      "learning_rate": 6.25393406584088e-05,
      "loss": 2.0049,
      "step": 1320
    },
    {
      "epoch": 0.8880697050938338,
      "grad_norm": 3.15625,
      "learning_rate": 6.173165676349103e-05,
      "loss": 1.9374,
      "step": 1325
    },
    {
      "epoch": 0.8914209115281502,
      "grad_norm": 3.140625,
      "learning_rate": 6.092688715107264e-05,
      "loss": 1.9352,
      "step": 1330
    },
    {
      "epoch": 0.8947721179624665,
      "grad_norm": 3.203125,
      "learning_rate": 6.012509310747538e-05,
      "loss": 1.8953,
      "step": 1335
    },
    {
      "epoch": 0.8981233243967829,
      "grad_norm": 3.25,
      "learning_rate": 5.9326335692419995e-05,
      "loss": 2.0176,
      "step": 1340
    },
    {
      "epoch": 0.9014745308310992,
      "grad_norm": 3.203125,
      "learning_rate": 5.853067573437612e-05,
      "loss": 1.9532,
      "step": 1345
    },
    {
      "epoch": 0.9048257372654156,
      "grad_norm": 3.03125,
      "learning_rate": 5.773817382593008e-05,
      "loss": 1.9968,
      "step": 1350
    },
    {
      "epoch": 0.9081769436997319,
      "grad_norm": 3.0625,
      "learning_rate": 5.694889031917047e-05,
      "loss": 2.0223,
      "step": 1355
    },
    {
      "epoch": 0.9115281501340483,
      "grad_norm": 3.0625,
      "learning_rate": 5.616288532109225e-05,
      "loss": 1.8986,
      "step": 1360
    },
    {
      "epoch": 0.9148793565683646,
      "grad_norm": 3.109375,
      "learning_rate": 5.5380218689019125e-05,
      "loss": 1.8947,
      "step": 1365
    },
    {
      "epoch": 0.9182305630026809,
      "grad_norm": 2.9375,
      "learning_rate": 5.4600950026045326e-05,
      "loss": 2.0307,
      "step": 1370
    },
    {
      "epoch": 0.9215817694369973,
      "grad_norm": 2.859375,
      "learning_rate": 5.3825138676496624e-05,
      "loss": 1.8391,
      "step": 1375
    },
    {
      "epoch": 0.9249329758713136,
      "grad_norm": 3.078125,
      "learning_rate": 5.305284372141095e-05,
      "loss": 1.9281,
      "step": 1380
    },
    {
      "epoch": 0.92828418230563,
      "grad_norm": 3.03125,
      "learning_rate": 5.2284123974039154e-05,
      "loss": 2.0512,
      "step": 1385
    },
    {
      "epoch": 0.9316353887399463,
      "grad_norm": 3.171875,
      "learning_rate": 5.15190379753663e-05,
      "loss": 1.9567,
      "step": 1390
    },
    {
      "epoch": 0.9349865951742627,
      "grad_norm": 3.125,
      "learning_rate": 5.07576439896533e-05,
      "loss": 1.864,
      "step": 1395
    },
    {
      "epoch": 0.938337801608579,
      "grad_norm": 3.296875,
      "learning_rate": 5.000000000000002e-05,
      "loss": 1.9016,
      "step": 1400
    },
    {
      "epoch": 0.9416890080428955,
      "grad_norm": 3.28125,
      "learning_rate": 4.924616370392961e-05,
      "loss": 2.0128,
      "step": 1405
    },
    {
      "epoch": 0.9450402144772118,
      "grad_norm": 2.8125,
      "learning_rate": 4.8496192508994576e-05,
      "loss": 2.0419,
      "step": 1410
    },
    {
      "epoch": 0.9483914209115282,
      "grad_norm": 3.03125,
      "learning_rate": 4.7750143528405126e-05,
      "loss": 1.964,
      "step": 1415
    },
    {
      "epoch": 0.9517426273458445,
      "grad_norm": 3.0625,
      "learning_rate": 4.700807357667952e-05,
      "loss": 1.9684,
      "step": 1420
    },
    {
      "epoch": 0.9550938337801609,
      "grad_norm": 3.265625,
      "learning_rate": 4.6270039165317605e-05,
      "loss": 1.9713,
      "step": 1425
    },
    {
      "epoch": 0.9584450402144772,
      "grad_norm": 3.015625,
      "learning_rate": 4.553609649849728e-05,
      "loss": 1.8728,
      "step": 1430
    },
    {
      "epoch": 0.9617962466487936,
      "grad_norm": 3.28125,
      "learning_rate": 4.480630146879419e-05,
      "loss": 1.8794,
      "step": 1435
    },
    {
      "epoch": 0.9651474530831099,
      "grad_norm": 3.296875,
      "learning_rate": 4.4080709652925336e-05,
      "loss": 2.017,
      "step": 1440
    },
    {
      "epoch": 0.9684986595174263,
      "grad_norm": 3.140625,
      "learning_rate": 4.335937630751674e-05,
      "loss": 1.9458,
      "step": 1445
    },
    {
      "epoch": 0.9718498659517426,
      "grad_norm": 3.09375,
      "learning_rate": 4.264235636489542e-05,
      "loss": 2.0505,
      "step": 1450
    },
    {
      "epoch": 0.975201072386059,
      "grad_norm": 3.21875,
      "learning_rate": 4.1929704428906026e-05,
      "loss": 1.9101,
      "step": 1455
    },
    {
      "epoch": 0.9785522788203753,
      "grad_norm": 3.078125,
      "learning_rate": 4.12214747707527e-05,
      "loss": 1.8192,
      "step": 1460
    },
    {
      "epoch": 0.9819034852546917,
      "grad_norm": 3.0,
      "learning_rate": 4.0517721324865884e-05,
      "loss": 1.9514,
      "step": 1465
    },
    {
      "epoch": 0.985254691689008,
      "grad_norm": 3.3125,
      "learning_rate": 3.981849768479517e-05,
      "loss": 1.9101,
      "step": 1470
    },
    {
      "epoch": 0.9886058981233244,
      "grad_norm": 2.96875,
      "learning_rate": 3.9123857099127936e-05,
      "loss": 1.8583,
      "step": 1475
    },
    {
      "epoch": 0.9919571045576407,
      "grad_norm": 3.078125,
      "learning_rate": 3.843385246743417e-05,
      "loss": 1.8693,
      "step": 1480
    },
    {
      "epoch": 0.9953083109919572,
      "grad_norm": 3.109375,
      "learning_rate": 3.774853633623806e-05,
      "loss": 1.952,
      "step": 1485
    },
    {
      "epoch": 0.9986595174262735,
      "grad_norm": 2.9375,
      "learning_rate": 3.7067960895016275e-05,
      "loss": 1.9502,
      "step": 1490
    },
    {
      "epoch": 1.0020107238605898,
      "grad_norm": 4.1875,
      "learning_rate": 3.6392177972223594e-05,
      "loss": 1.7212,
      "step": 1495
    },
    {
      "epoch": 1.0053619302949062,
      "grad_norm": 2.96875,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 1.6225,
      "step": 1500
    },
    {
      "epoch": 1.0053619302949062,
      "eval_128_ap": 0.7931236608249025,
      "eval_128_auc": 0.9804633885794023,
      "eval_128_loss": 1.6620492935180664,
      "eval_128_runtime": 20.588,
      "eval_128_samples_per_second": 19.429,
      "eval_128_steps_per_second": 4.857,
      "step": 1500
    }
  ],
  "logging_steps": 5,
  "max_steps": 2000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4517613999095808.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}