diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,43895 +10,43895 @@ "log_history": [ { "epoch": 0.00013679890560875513, - "grad_norm": 18.780212578470213, + "grad_norm": 64.35742316131605, "learning_rate": 9.999999538252033e-06, - "loss": 0.6737, + "loss": 0.6891, "step": 1 }, { "epoch": 0.00027359781121751026, - "grad_norm": 4.933946740734121, + "grad_norm": 88.77215376741684, "learning_rate": 9.999998153008213e-06, - "loss": 0.3965, + "loss": 0.5571, "step": 2 }, { "epoch": 0.0004103967168262654, - "grad_norm": 5.5108243551274185, + "grad_norm": 8.498605817949812, "learning_rate": 9.999995844268795e-06, - "loss": 0.3706, + "loss": 0.4738, "step": 3 }, { "epoch": 0.0005471956224350205, - "grad_norm": 4.643449646241658, + "grad_norm": 7.291242273042836, "learning_rate": 9.999992612034209e-06, - "loss": 0.3731, + "loss": 0.4904, "step": 4 }, { "epoch": 0.0006839945280437756, - "grad_norm": 4.652258987833469, + "grad_norm": 13.581706569776655, "learning_rate": 9.999988456305051e-06, - "loss": 0.3299, + "loss": 0.4409, "step": 5 }, { "epoch": 0.0008207934336525308, - "grad_norm": 4.298444297225067, + "grad_norm": 6.378645347846411, "learning_rate": 9.999983377082088e-06, - "loss": 0.3335, + "loss": 0.4547, "step": 6 }, { "epoch": 0.0009575923392612859, - "grad_norm": 4.023973160137789, + "grad_norm": 5.517258232607703, "learning_rate": 9.999977374366256e-06, - "loss": 0.3193, + "loss": 0.4234, "step": 7 }, { "epoch": 0.001094391244870041, - "grad_norm": 2.9046086426483764, + "grad_norm": 4.773385939983008, "learning_rate": 9.999970448158668e-06, - "loss": 0.2518, + "loss": 0.3526, "step": 8 }, { "epoch": 0.0012311901504787962, - "grad_norm": 3.55084799861329, + "grad_norm": 8.028344366318544, "learning_rate": 9.9999625984606e-06, - "loss": 0.2917, + "loss": 0.3963, "step": 9 }, { "epoch": 0.0013679890560875513, - "grad_norm": 3.628908898238197, + "grad_norm": 6.284350100208066, "learning_rate": 9.999953825273503e-06, - "loss": 0.3446, + "loss": 0.4534, "step": 10 }, { "epoch": 0.0015047879616963064, - "grad_norm": 4.066285846170224, + "grad_norm": 5.874251355125282, "learning_rate": 9.999944128599e-06, - "loss": 0.3262, + "loss": 0.4316, "step": 11 }, { "epoch": 0.0016415868673050615, - "grad_norm": 2.9719471603789445, + "grad_norm": 4.931046637113994, "learning_rate": 9.999933508438875e-06, - "loss": 0.2772, + "loss": 0.3715, "step": 12 }, { "epoch": 0.0017783857729138167, - "grad_norm": 3.0897077334808762, + "grad_norm": 5.258852061937538, "learning_rate": 9.999921964795096e-06, - "loss": 0.2727, + "loss": 0.3587, "step": 13 }, { "epoch": 0.0019151846785225718, - "grad_norm": 3.524005650255922, + "grad_norm": 5.346063832005752, "learning_rate": 9.999909497669794e-06, - "loss": 0.267, + "loss": 0.361, "step": 14 }, { "epoch": 0.002051983584131327, - "grad_norm": 3.8840911577628945, + "grad_norm": 6.526930618798398, "learning_rate": 9.999896107065267e-06, - "loss": 0.296, + "loss": 0.3896, "step": 15 }, { "epoch": 0.002188782489740082, - "grad_norm": 4.028856594363156, + "grad_norm": 5.955961418616586, "learning_rate": 9.999881792983995e-06, - "loss": 0.3648, + "loss": 0.4606, "step": 16 }, { "epoch": 0.002325581395348837, - "grad_norm": 3.0285722721402686, + "grad_norm": 5.7223844661765675, "learning_rate": 9.999866555428619e-06, - "loss": 0.2831, + "loss": 0.3622, "step": 17 }, { "epoch": 0.0024623803009575923, - "grad_norm": 4.181472308438933, + "grad_norm": 5.752294179631819, "learning_rate": 9.999850394401951e-06, - "loss": 0.2974, + "loss": 0.3724, "step": 18 }, { "epoch": 0.0025991792065663474, - "grad_norm": 3.5683510440106434, + "grad_norm": 5.373142104381707, "learning_rate": 9.999833309906979e-06, - "loss": 0.303, + "loss": 0.3781, "step": 19 }, { "epoch": 0.0027359781121751026, - "grad_norm": 3.697074117372759, + "grad_norm": 5.5727961137514415, "learning_rate": 9.999815301946856e-06, - "loss": 0.3352, + "loss": 0.3996, "step": 20 }, { "epoch": 0.0028727770177838577, - "grad_norm": 3.2589563697387693, + "grad_norm": 5.179949693019264, "learning_rate": 9.99979637052491e-06, - "loss": 0.2573, + "loss": 0.3188, "step": 21 }, { "epoch": 0.003009575923392613, - "grad_norm": 3.574257895829901, + "grad_norm": 5.209059955125869, "learning_rate": 9.999776515644639e-06, - "loss": 0.2608, + "loss": 0.3228, "step": 22 }, { "epoch": 0.003146374829001368, - "grad_norm": 3.282417785526341, + "grad_norm": 5.38045909736623, "learning_rate": 9.999755737309706e-06, - "loss": 0.2782, + "loss": 0.345, "step": 23 }, { "epoch": 0.003283173734610123, - "grad_norm": 4.080733381738784, + "grad_norm": 5.081438621393957, "learning_rate": 9.999734035523952e-06, - "loss": 0.3108, + "loss": 0.3642, "step": 24 }, { "epoch": 0.003419972640218878, - "grad_norm": 3.6881822027070394, + "grad_norm": 5.400180585001365, "learning_rate": 9.999711410291384e-06, - "loss": 0.2703, + "loss": 0.3252, "step": 25 }, { "epoch": 0.0035567715458276333, - "grad_norm": 3.4432723625423605, + "grad_norm": 5.082854677058324, "learning_rate": 9.999687861616181e-06, - "loss": 0.2884, + "loss": 0.3343, "step": 26 }, { "epoch": 0.0036935704514363885, - "grad_norm": 3.4309485316484345, + "grad_norm": 4.817076949610238, "learning_rate": 9.999663389502692e-06, - "loss": 0.2658, + "loss": 0.3137, "step": 27 }, { "epoch": 0.0038303693570451436, - "grad_norm": 3.6190566499460606, + "grad_norm": 5.0649527639104495, "learning_rate": 9.99963799395544e-06, - "loss": 0.319, + "loss": 0.3631, "step": 28 }, { "epoch": 0.003967168262653899, - "grad_norm": 3.1335088227376318, + "grad_norm": 29.384301278708318, "learning_rate": 9.99961167497911e-06, - "loss": 0.2561, + "loss": 0.295, "step": 29 }, { "epoch": 0.004103967168262654, - "grad_norm": 3.33913495349125, + "grad_norm": 4.892867438218258, "learning_rate": 9.999584432578569e-06, - "loss": 0.2803, + "loss": 0.3201, "step": 30 }, { "epoch": 0.004240766073871409, - "grad_norm": 2.3787095793836697, + "grad_norm": 3.832429044151258, "learning_rate": 9.999556266758844e-06, - "loss": 0.2253, + "loss": 0.255, "step": 31 }, { "epoch": 0.004377564979480164, - "grad_norm": 2.986031892312619, + "grad_norm": 3.935417149237897, "learning_rate": 9.99952717752514e-06, - "loss": 0.3019, + "loss": 0.3248, "step": 32 }, { "epoch": 0.004514363885088919, - "grad_norm": 2.828786893794623, + "grad_norm": 3.7148509578161906, "learning_rate": 9.999497164882827e-06, - "loss": 0.2936, + "loss": 0.3181, "step": 33 }, { "epoch": 0.004651162790697674, - "grad_norm": 3.557536307953988, + "grad_norm": 4.449727584229228, "learning_rate": 9.999466228837452e-06, - "loss": 0.2734, + "loss": 0.2896, "step": 34 }, { "epoch": 0.0047879616963064295, - "grad_norm": 3.6223881347266826, + "grad_norm": 4.200154006588455, "learning_rate": 9.999434369394725e-06, - "loss": 0.3275, + "loss": 0.3472, "step": 35 }, { "epoch": 0.004924760601915185, - "grad_norm": 3.402735729067309, + "grad_norm": 3.8301689195978317, "learning_rate": 9.999401586560535e-06, - "loss": 0.304, + "loss": 0.3219, "step": 36 }, { "epoch": 0.00506155950752394, - "grad_norm": 3.275865910684739, + "grad_norm": 3.630731103435099, "learning_rate": 9.999367880340931e-06, - "loss": 0.2679, + "loss": 0.2834, "step": 37 }, { "epoch": 0.005198358413132695, - "grad_norm": 3.431840163088559, + "grad_norm": 3.6207605934832787, "learning_rate": 9.999333250742145e-06, - "loss": 0.3324, + "loss": 0.3444, "step": 38 }, { "epoch": 0.00533515731874145, - "grad_norm": 2.9230252101954863, + "grad_norm": 3.013310698691736, "learning_rate": 9.99929769777057e-06, - "loss": 0.2784, + "loss": 0.2864, "step": 39 }, { "epoch": 0.005471956224350205, - "grad_norm": 2.6850108401796327, + "grad_norm": 2.749467990631861, "learning_rate": 9.99926122143277e-06, - "loss": 0.2439, + "loss": 0.2482, "step": 40 }, { "epoch": 0.00560875512995896, - "grad_norm": 2.96945153007254, + "grad_norm": 3.2276388427934215, "learning_rate": 9.999223821735487e-06, - "loss": 0.2299, + "loss": 0.2393, "step": 41 }, { "epoch": 0.005745554035567715, - "grad_norm": 2.5769807478805635, + "grad_norm": 2.712567919745759, "learning_rate": 9.999185498685624e-06, - "loss": 0.2487, + "loss": 0.2561, "step": 42 }, { "epoch": 0.0058823529411764705, - "grad_norm": 2.5260839314011934, + "grad_norm": 2.6085970545944344, "learning_rate": 9.999146252290264e-06, - "loss": 0.2272, + "loss": 0.2381, "step": 43 }, { "epoch": 0.006019151846785226, - "grad_norm": 2.6289439879306222, + "grad_norm": 2.5809383989704475, "learning_rate": 9.999106082556653e-06, - "loss": 0.211, + "loss": 0.2151, "step": 44 }, { "epoch": 0.006155950752393981, - "grad_norm": 2.8137367424014283, + "grad_norm": 2.770812965740556, "learning_rate": 9.99906498949221e-06, - "loss": 0.2154, + "loss": 0.2238, "step": 45 }, { "epoch": 0.006292749658002736, - "grad_norm": 2.9859209449716517, + "grad_norm": 2.8917308896511864, "learning_rate": 9.999022973104525e-06, - "loss": 0.317, + "loss": 0.3172, "step": 46 }, { "epoch": 0.006429548563611491, - "grad_norm": 3.063876351134364, + "grad_norm": 2.9932028517451457, "learning_rate": 9.998980033401359e-06, - "loss": 0.3397, + "loss": 0.3451, "step": 47 }, { "epoch": 0.006566347469220246, - "grad_norm": 2.5093122362904854, + "grad_norm": 2.436865913801942, "learning_rate": 9.998936170390645e-06, - "loss": 0.2484, + "loss": 0.2601, "step": 48 }, { "epoch": 0.006703146374829001, - "grad_norm": 2.834326228346881, + "grad_norm": 2.8130309035213, "learning_rate": 9.99889138408048e-06, - "loss": 0.249, + "loss": 0.2586, "step": 49 }, { "epoch": 0.006839945280437756, - "grad_norm": 2.9941640890123935, + "grad_norm": 3.039677309928192, "learning_rate": 9.998845674479139e-06, - "loss": 0.2222, + "loss": 0.2228, "step": 50 }, { "epoch": 0.0069767441860465115, - "grad_norm": 3.0881660114785987, + "grad_norm": 3.1282021263572313, "learning_rate": 9.998799041595064e-06, - "loss": 0.2786, + "loss": 0.2841, "step": 51 }, { "epoch": 0.007113543091655267, - "grad_norm": 2.8190744813330464, + "grad_norm": 2.9870287434228557, "learning_rate": 9.998751485436868e-06, - "loss": 0.2555, + "loss": 0.2604, "step": 52 }, { "epoch": 0.007250341997264022, - "grad_norm": 3.470019851896128, + "grad_norm": 3.4768504590502856, "learning_rate": 9.998703006013335e-06, - "loss": 0.2795, + "loss": 0.2842, "step": 53 }, { "epoch": 0.007387140902872777, - "grad_norm": 3.177677483624265, + "grad_norm": 3.118494485511974, "learning_rate": 9.998653603333418e-06, - "loss": 0.3732, + "loss": 0.3816, "step": 54 }, { "epoch": 0.007523939808481532, - "grad_norm": 3.1746329822097406, + "grad_norm": 3.1479203454149167, "learning_rate": 9.998603277406242e-06, - "loss": 0.2694, + "loss": 0.2783, "step": 55 }, { "epoch": 0.007660738714090287, - "grad_norm": 2.8931725193285174, + "grad_norm": 2.8503371087150993, "learning_rate": 9.998552028241103e-06, - "loss": 0.2513, + "loss": 0.2603, "step": 56 }, { "epoch": 0.007797537619699042, - "grad_norm": 1.7585746526348025, + "grad_norm": 1.8816672493388364, "learning_rate": 9.998499855847465e-06, - "loss": 0.1796, + "loss": 0.1912, "step": 57 }, { "epoch": 0.007934336525307797, - "grad_norm": 3.014556969451939, + "grad_norm": 3.174418717177392, "learning_rate": 9.998446760234967e-06, - "loss": 0.2671, + "loss": 0.2837, "step": 58 }, { "epoch": 0.008071135430916553, - "grad_norm": 3.000812082388844, + "grad_norm": 2.878837042023672, "learning_rate": 9.998392741413413e-06, - "loss": 0.2369, + "loss": 0.2421, "step": 59 }, { "epoch": 0.008207934336525308, - "grad_norm": 2.692662753961972, + "grad_norm": 2.7556109495818983, "learning_rate": 9.99833779939278e-06, - "loss": 0.2447, + "loss": 0.2512, "step": 60 }, { "epoch": 0.008344733242134063, - "grad_norm": 3.0077560741793543, + "grad_norm": 3.0246534795087787, "learning_rate": 9.998281934183219e-06, - "loss": 0.2837, + "loss": 0.2943, "step": 61 }, { "epoch": 0.008481532147742818, - "grad_norm": 2.7558796751883756, + "grad_norm": 2.776970825479079, "learning_rate": 9.998225145795047e-06, - "loss": 0.2331, + "loss": 0.2428, "step": 62 }, { "epoch": 0.008618331053351573, - "grad_norm": 2.4619844966866578, + "grad_norm": 2.477522943925521, "learning_rate": 9.998167434238749e-06, - "loss": 0.2483, + "loss": 0.2526, "step": 63 }, { "epoch": 0.008755129958960328, - "grad_norm": 2.583857917339382, + "grad_norm": 2.5253286315873558, "learning_rate": 9.998108799524989e-06, - "loss": 0.2652, + "loss": 0.2697, "step": 64 }, { "epoch": 0.008891928864569083, - "grad_norm": 3.0491347206808976, + "grad_norm": 2.975327668938459, "learning_rate": 9.998049241664596e-06, - "loss": 0.296, + "loss": 0.309, "step": 65 }, { "epoch": 0.009028727770177838, - "grad_norm": 2.5984784812301083, + "grad_norm": 2.5987039882705174, "learning_rate": 9.997988760668567e-06, - "loss": 0.2654, + "loss": 0.2812, "step": 66 }, { "epoch": 0.009165526675786594, - "grad_norm": 2.4765792134290714, + "grad_norm": 2.5275555865483232, "learning_rate": 9.997927356548075e-06, - "loss": 0.2569, + "loss": 0.2743, "step": 67 }, { "epoch": 0.009302325581395349, - "grad_norm": 3.063362496249664, + "grad_norm": 3.0651997746839013, "learning_rate": 9.997865029314464e-06, - "loss": 0.3008, + "loss": 0.3171, "step": 68 }, { "epoch": 0.009439124487004104, - "grad_norm": 2.1969243157989995, + "grad_norm": 2.0942553450903834, "learning_rate": 9.997801778979243e-06, - "loss": 0.2433, + "loss": 0.2497, "step": 69 }, { "epoch": 0.009575923392612859, - "grad_norm": 2.8660661480556904, + "grad_norm": 2.789162255966833, "learning_rate": 9.997737605554092e-06, - "loss": 0.2769, + "loss": 0.2758, "step": 70 }, { "epoch": 0.009712722298221614, - "grad_norm": 2.925357872096915, + "grad_norm": 2.716554961793272, "learning_rate": 9.997672509050868e-06, - "loss": 0.2618, + "loss": 0.2712, "step": 71 }, { "epoch": 0.00984952120383037, - "grad_norm": 2.8123874928569017, + "grad_norm": 2.7497020204203557, "learning_rate": 9.997606489481592e-06, - "loss": 0.2756, + "loss": 0.2812, "step": 72 }, { "epoch": 0.009986320109439124, - "grad_norm": 2.663423989591573, + "grad_norm": 2.385119295280193, "learning_rate": 9.997539546858459e-06, - "loss": 0.2577, + "loss": 0.2578, "step": 73 }, { "epoch": 0.01012311901504788, - "grad_norm": 1.7648875478036978, + "grad_norm": 1.76680408983791, "learning_rate": 9.997471681193833e-06, - "loss": 0.191, + "loss": 0.1964, "step": 74 }, { "epoch": 0.010259917920656635, - "grad_norm": 2.5875904662534146, + "grad_norm": 2.4594219472267493, "learning_rate": 9.997402892500247e-06, - "loss": 0.1966, + "loss": 0.2028, "step": 75 }, { "epoch": 0.01039671682626539, - "grad_norm": 3.2406879231891437, + "grad_norm": 3.1884819306350654, "learning_rate": 9.997333180790408e-06, - "loss": 0.3214, + "loss": 0.3253, "step": 76 }, { "epoch": 0.010533515731874145, - "grad_norm": 2.8911348394266185, + "grad_norm": 2.8379113346693856, "learning_rate": 9.99726254607719e-06, - "loss": 0.2841, + "loss": 0.2966, "step": 77 }, { "epoch": 0.0106703146374829, - "grad_norm": 3.094401054828441, + "grad_norm": 2.9944641019059937, "learning_rate": 9.997190988373645e-06, - "loss": 0.263, + "loss": 0.264, "step": 78 }, { "epoch": 0.010807113543091655, - "grad_norm": 2.4198612619092428, + "grad_norm": 2.351233833621617, "learning_rate": 9.99711850769298e-06, - "loss": 0.2447, + "loss": 0.252, "step": 79 }, { "epoch": 0.01094391244870041, - "grad_norm": 2.7811410563434027, + "grad_norm": 2.5926666701851, "learning_rate": 9.997045104048588e-06, - "loss": 0.2738, + "loss": 0.278, "step": 80 }, { "epoch": 0.011080711354309165, - "grad_norm": 2.6952461156542302, + "grad_norm": 2.591665767974439, "learning_rate": 9.996970777454028e-06, - "loss": 0.2412, + "loss": 0.2462, "step": 81 }, { "epoch": 0.01121751025991792, - "grad_norm": 2.4272227929148187, + "grad_norm": 2.317971636984408, "learning_rate": 9.996895527923024e-06, - "loss": 0.2499, + "loss": 0.2522, "step": 82 }, { "epoch": 0.011354309165526676, - "grad_norm": 2.8838153597570777, + "grad_norm": 2.7901616085163883, "learning_rate": 9.996819355469476e-06, - "loss": 0.2702, + "loss": 0.2788, "step": 83 }, { "epoch": 0.01149110807113543, - "grad_norm": 2.2104674149589787, + "grad_norm": 2.129472340498544, "learning_rate": 9.996742260107455e-06, - "loss": 0.236, + "loss": 0.2398, "step": 84 }, { "epoch": 0.011627906976744186, - "grad_norm": 2.2803114089982515, + "grad_norm": 2.2607653320610166, "learning_rate": 9.996664241851197e-06, - "loss": 0.2397, + "loss": 0.2447, "step": 85 }, { "epoch": 0.011764705882352941, - "grad_norm": 2.9244440957851863, + "grad_norm": 2.7944118145394197, "learning_rate": 9.996585300715117e-06, - "loss": 0.2716, + "loss": 0.2776, "step": 86 }, { "epoch": 0.011901504787961696, - "grad_norm": 2.600131455796667, + "grad_norm": 2.5842702141539724, "learning_rate": 9.996505436713788e-06, - "loss": 0.2932, + "loss": 0.3038, "step": 87 }, { "epoch": 0.012038303693570451, - "grad_norm": 2.6793186235855235, + "grad_norm": 2.5189803392621886, "learning_rate": 9.996424649861967e-06, - "loss": 0.2875, + "loss": 0.2933, "step": 88 }, { "epoch": 0.012175102599179206, - "grad_norm": 2.209443069840069, + "grad_norm": 2.128400563859252, "learning_rate": 9.996342940174573e-06, - "loss": 0.2442, + "loss": 0.2471, "step": 89 }, { "epoch": 0.012311901504787962, - "grad_norm": 2.761614976891791, + "grad_norm": 2.6738648172673742, "learning_rate": 9.996260307666697e-06, - "loss": 0.2801, + "loss": 0.2846, "step": 90 }, { "epoch": 0.012448700410396717, - "grad_norm": 2.4124652873177523, + "grad_norm": 2.321924507116477, "learning_rate": 9.996176752353602e-06, - "loss": 0.2654, + "loss": 0.2665, "step": 91 }, { "epoch": 0.012585499316005472, - "grad_norm": 2.412813618937782, + "grad_norm": 2.4254962976673395, "learning_rate": 9.996092274250722e-06, - "loss": 0.2093, + "loss": 0.2118, "step": 92 }, { "epoch": 0.012722298221614227, - "grad_norm": 2.608548104179741, + "grad_norm": 2.526341879206316, "learning_rate": 9.996006873373656e-06, - "loss": 0.267, + "loss": 0.2668, "step": 93 }, { "epoch": 0.012859097127222982, - "grad_norm": 2.7944954337209458, + "grad_norm": 2.655770204879157, "learning_rate": 9.995920549738183e-06, - "loss": 0.2396, + "loss": 0.2426, "step": 94 }, { "epoch": 0.012995896032831737, - "grad_norm": 2.3192263599972867, + "grad_norm": 2.156565434796261, "learning_rate": 9.995833303360243e-06, - "loss": 0.252, + "loss": 0.2541, "step": 95 }, { "epoch": 0.013132694938440492, - "grad_norm": 1.8404250429186053, + "grad_norm": 1.8111398880211096, "learning_rate": 9.995745134255952e-06, - "loss": 0.1854, + "loss": 0.1866, "step": 96 }, { "epoch": 0.013269493844049247, - "grad_norm": 2.500791735940774, + "grad_norm": 2.5365253306448046, "learning_rate": 9.995656042441593e-06, - "loss": 0.2862, + "loss": 0.2945, "step": 97 }, { "epoch": 0.013406292749658003, - "grad_norm": 2.3227773143427166, + "grad_norm": 2.3036572942531484, "learning_rate": 9.995566027933622e-06, - "loss": 0.2921, + "loss": 0.3005, "step": 98 }, { "epoch": 0.013543091655266758, - "grad_norm": 2.601008761138818, + "grad_norm": 2.466485295008702, "learning_rate": 9.995475090748665e-06, - "loss": 0.2938, + "loss": 0.2992, "step": 99 }, { "epoch": 0.013679890560875513, - "grad_norm": 2.5864554349527165, + "grad_norm": 2.5181314240427723, "learning_rate": 9.995383230903519e-06, - "loss": 0.2684, + "loss": 0.2769, "step": 100 }, { "epoch": 0.013679890560875513, - "eval_loss": 0.25674256682395935, - "eval_runtime": 5.9445, - "eval_samples_per_second": 5.047, - "eval_steps_per_second": 1.346, + "eval_loss": 0.2640659809112549, + "eval_runtime": 5.9116, + "eval_samples_per_second": 5.075, + "eval_steps_per_second": 1.353, "step": 100 }, { "epoch": 0.013816689466484268, - "grad_norm": 2.2128871250034505, + "grad_norm": 2.339866607979537, "learning_rate": 9.99529044841515e-06, - "loss": 0.2564, + "loss": 0.2657, "step": 101 }, { "epoch": 0.013953488372093023, - "grad_norm": 2.3795677827505792, + "grad_norm": 2.2674468468327156, "learning_rate": 9.995196743300693e-06, - "loss": 0.2599, + "loss": 0.259, "step": 102 }, { "epoch": 0.014090287277701778, - "grad_norm": 2.1860021017045765, + "grad_norm": 2.1127379079956286, "learning_rate": 9.995102115577455e-06, - "loss": 0.2442, + "loss": 0.2474, "step": 103 }, { "epoch": 0.014227086183310533, - "grad_norm": 2.2857991982144994, + "grad_norm": 2.335729114154643, "learning_rate": 9.995006565262917e-06, - "loss": 0.2661, + "loss": 0.2776, "step": 104 }, { "epoch": 0.014363885088919288, - "grad_norm": 1.9780559531341884, + "grad_norm": 1.9797662502405808, "learning_rate": 9.994910092374725e-06, - "loss": 0.2247, + "loss": 0.229, "step": 105 }, { "epoch": 0.014500683994528044, - "grad_norm": 2.8489450323931638, + "grad_norm": 2.7175762732113844, "learning_rate": 9.994812696930699e-06, - "loss": 0.3296, + "loss": 0.3327, "step": 106 }, { "epoch": 0.014637482900136799, - "grad_norm": 2.553942780543018, + "grad_norm": 2.4423270746464056, "learning_rate": 9.994714378948825e-06, - "loss": 0.2369, + "loss": 0.24, "step": 107 }, { "epoch": 0.014774281805745554, - "grad_norm": 2.194518249952541, + "grad_norm": 2.146877789171703, "learning_rate": 9.994615138447263e-06, - "loss": 0.2508, + "loss": 0.2517, "step": 108 }, { "epoch": 0.014911080711354309, - "grad_norm": 2.263114131990096, + "grad_norm": 2.103199050216466, "learning_rate": 9.994514975444345e-06, - "loss": 0.2398, + "loss": 0.2383, "step": 109 }, { "epoch": 0.015047879616963064, - "grad_norm": 2.6753935605275627, + "grad_norm": 2.597800578709196, "learning_rate": 9.994413889958569e-06, - "loss": 0.2799, + "loss": 0.2924, "step": 110 }, { "epoch": 0.01518467852257182, - "grad_norm": 2.1760356556673663, + "grad_norm": 2.131593456604441, "learning_rate": 9.994311882008605e-06, - "loss": 0.2533, + "loss": 0.2597, "step": 111 }, { "epoch": 0.015321477428180574, - "grad_norm": 1.9277309224127335, + "grad_norm": 1.7877002307429302, "learning_rate": 9.994208951613295e-06, - "loss": 0.2061, + "loss": 0.2096, "step": 112 }, { "epoch": 0.01545827633378933, - "grad_norm": 2.5740167167316543, + "grad_norm": 2.4595048386766747, "learning_rate": 9.994105098791652e-06, - "loss": 0.2195, + "loss": 0.2258, "step": 113 }, { "epoch": 0.015595075239398085, - "grad_norm": 2.5694570140756747, + "grad_norm": 2.461836298854796, "learning_rate": 9.994000323562852e-06, - "loss": 0.2824, + "loss": 0.2789, "step": 114 }, { "epoch": 0.01573187414500684, - "grad_norm": 1.7040112014574713, + "grad_norm": 1.7826378478160474, "learning_rate": 9.993894625946251e-06, - "loss": 0.2025, + "loss": 0.2037, "step": 115 }, { "epoch": 0.015868673050615595, - "grad_norm": 2.3938123882830173, + "grad_norm": 2.3428436563488466, "learning_rate": 9.993788005961372e-06, - "loss": 0.245, + "loss": 0.2461, "step": 116 }, { "epoch": 0.01600547195622435, - "grad_norm": 2.5362527548983738, + "grad_norm": 2.407356033663139, "learning_rate": 9.993680463627906e-06, - "loss": 0.2779, + "loss": 0.2751, "step": 117 }, { "epoch": 0.016142270861833105, - "grad_norm": 2.525641416622258, + "grad_norm": 2.4822153729619734, "learning_rate": 9.993571998965714e-06, - "loss": 0.2406, + "loss": 0.2379, "step": 118 }, { "epoch": 0.01627906976744186, - "grad_norm": 2.7848726885580177, + "grad_norm": 2.548831749534978, "learning_rate": 9.993462611994833e-06, - "loss": 0.2777, + "loss": 0.2852, "step": 119 }, { "epoch": 0.016415868673050615, - "grad_norm": 2.5039281569459915, + "grad_norm": 2.423799772642147, "learning_rate": 9.993352302735466e-06, - "loss": 0.259, + "loss": 0.2674, "step": 120 }, { "epoch": 0.01655266757865937, - "grad_norm": 2.750017338403424, + "grad_norm": 2.7031467849768176, "learning_rate": 9.993241071207985e-06, - "loss": 0.2592, + "loss": 0.2604, "step": 121 }, { "epoch": 0.016689466484268126, - "grad_norm": 2.46254934551054, + "grad_norm": 2.4381450310697623, "learning_rate": 9.993128917432934e-06, - "loss": 0.2517, + "loss": 0.2529, "step": 122 }, { "epoch": 0.01682626538987688, - "grad_norm": 2.442341481630629, + "grad_norm": 2.450240559097444, "learning_rate": 9.993015841431032e-06, - "loss": 0.2721, + "loss": 0.2726, "step": 123 }, { "epoch": 0.016963064295485636, - "grad_norm": 2.341904939659719, + "grad_norm": 2.256239808986202, "learning_rate": 9.99290184322316e-06, - "loss": 0.2665, + "loss": 0.2709, "step": 124 }, { "epoch": 0.01709986320109439, - "grad_norm": 2.640905210869918, + "grad_norm": 2.559263879520779, "learning_rate": 9.992786922830375e-06, - "loss": 0.2427, + "loss": 0.2474, "step": 125 }, { "epoch": 0.017236662106703146, - "grad_norm": 2.529069209902686, + "grad_norm": 2.324505593806553, "learning_rate": 9.992671080273904e-06, - "loss": 0.2584, + "loss": 0.2613, "step": 126 }, { "epoch": 0.0173734610123119, - "grad_norm": 2.423618942838277, + "grad_norm": 2.336341718434703, "learning_rate": 9.992554315575137e-06, - "loss": 0.2378, + "loss": 0.2443, "step": 127 }, { "epoch": 0.017510259917920656, - "grad_norm": 2.062624934617024, + "grad_norm": 1.9917515053453865, "learning_rate": 9.992436628755648e-06, - "loss": 0.2332, + "loss": 0.2393, "step": 128 }, { "epoch": 0.01764705882352941, - "grad_norm": 2.723432902554609, + "grad_norm": 2.7047167668541165, "learning_rate": 9.992318019837171e-06, - "loss": 0.2683, + "loss": 0.2747, "step": 129 }, { "epoch": 0.017783857729138167, - "grad_norm": 2.086766174109336, + "grad_norm": 2.12993314494685, "learning_rate": 9.992198488841611e-06, - "loss": 0.2253, + "loss": 0.2304, "step": 130 }, { "epoch": 0.017920656634746922, - "grad_norm": 2.43774248402695, + "grad_norm": 2.510770339046147, "learning_rate": 9.992078035791047e-06, - "loss": 0.2175, + "loss": 0.2166, "step": 131 }, { "epoch": 0.018057455540355677, - "grad_norm": 2.476739813642867, + "grad_norm": 2.5340936311587132, "learning_rate": 9.991956660707725e-06, - "loss": 0.215, + "loss": 0.2203, "step": 132 }, { "epoch": 0.018194254445964432, - "grad_norm": 2.4418357317812727, + "grad_norm": 2.3673719101335893, "learning_rate": 9.991834363614066e-06, - "loss": 0.2358, + "loss": 0.2367, "step": 133 }, { "epoch": 0.018331053351573187, - "grad_norm": 2.031281947288201, + "grad_norm": 1.9217422633221752, "learning_rate": 9.991711144532655e-06, - "loss": 0.2276, + "loss": 0.2316, "step": 134 }, { "epoch": 0.018467852257181942, - "grad_norm": 2.830079275443484, + "grad_norm": 2.7429433374382843, "learning_rate": 9.991587003486251e-06, - "loss": 0.2666, + "loss": 0.2748, "step": 135 }, { "epoch": 0.018604651162790697, - "grad_norm": 2.661439973292819, + "grad_norm": 2.6763820635119755, "learning_rate": 9.991461940497786e-06, - "loss": 0.2508, + "loss": 0.2633, "step": 136 }, { "epoch": 0.018741450068399453, - "grad_norm": 2.4886362718590322, + "grad_norm": 2.4610420252326324, "learning_rate": 9.991335955590356e-06, - "loss": 0.2165, + "loss": 0.2296, "step": 137 }, { "epoch": 0.018878248974008208, - "grad_norm": 2.901653077292871, + "grad_norm": 2.920708176257565, "learning_rate": 9.991209048787229e-06, - "loss": 0.3212, + "loss": 0.3242, "step": 138 }, { "epoch": 0.019015047879616963, - "grad_norm": 2.091473880161094, + "grad_norm": 2.0740544852658074, "learning_rate": 9.991081220111846e-06, - "loss": 0.2182, + "loss": 0.2251, "step": 139 }, { "epoch": 0.019151846785225718, - "grad_norm": 2.6371048342459433, + "grad_norm": 2.664419263671648, "learning_rate": 9.99095246958782e-06, - "loss": 0.2433, + "loss": 0.2463, "step": 140 }, { "epoch": 0.019288645690834473, - "grad_norm": 1.8795046933455046, + "grad_norm": 1.7893710474262998, "learning_rate": 9.990822797238927e-06, - "loss": 0.199, + "loss": 0.2002, "step": 141 }, { "epoch": 0.019425444596443228, - "grad_norm": 2.156683660062134, + "grad_norm": 2.1130976045729235, "learning_rate": 9.990692203089118e-06, - "loss": 0.2499, + "loss": 0.2521, "step": 142 }, { "epoch": 0.019562243502051983, - "grad_norm": 1.5649348584514025, + "grad_norm": 1.531469266197404, "learning_rate": 9.990560687162517e-06, - "loss": 0.1694, + "loss": 0.1698, "step": 143 }, { "epoch": 0.01969904240766074, - "grad_norm": 2.3423482590245572, + "grad_norm": 2.2545335712583543, "learning_rate": 9.99042824948341e-06, - "loss": 0.2185, + "loss": 0.2201, "step": 144 }, { "epoch": 0.019835841313269494, - "grad_norm": 2.210734963878387, + "grad_norm": 2.108410949339787, "learning_rate": 9.990294890076262e-06, - "loss": 0.2154, + "loss": 0.211, "step": 145 }, { "epoch": 0.01997264021887825, - "grad_norm": 2.178594584448071, + "grad_norm": 2.066704266129631, "learning_rate": 9.9901606089657e-06, - "loss": 0.2099, + "loss": 0.2134, "step": 146 }, { "epoch": 0.020109439124487004, - "grad_norm": 2.5164217171170558, + "grad_norm": 2.529409769254975, "learning_rate": 9.990025406176531e-06, - "loss": 0.2541, + "loss": 0.2618, "step": 147 }, { "epoch": 0.02024623803009576, - "grad_norm": 2.1196515484327216, + "grad_norm": 2.0494086537006035, "learning_rate": 9.989889281733723e-06, - "loss": 0.2353, + "loss": 0.2381, "step": 148 }, { "epoch": 0.020383036935704514, - "grad_norm": 2.6421723099711927, + "grad_norm": 2.493730450771067, "learning_rate": 9.98975223566242e-06, - "loss": 0.304, + "loss": 0.3052, "step": 149 }, { "epoch": 0.02051983584131327, - "grad_norm": 2.232690891031541, + "grad_norm": 2.172871011869194, "learning_rate": 9.989614267987933e-06, - "loss": 0.2616, + "loss": 0.2639, "step": 150 }, { "epoch": 0.020656634746922024, - "grad_norm": 2.6083048407133447, + "grad_norm": 2.5100386992047357, "learning_rate": 9.989475378735746e-06, - "loss": 0.265, + "loss": 0.2689, "step": 151 }, { "epoch": 0.02079343365253078, - "grad_norm": 2.2929715024210826, + "grad_norm": 2.32965056444658, "learning_rate": 9.98933556793151e-06, - "loss": 0.2712, + "loss": 0.2792, "step": 152 }, { "epoch": 0.020930232558139535, - "grad_norm": 2.459853860723307, + "grad_norm": 2.4060377593880213, "learning_rate": 9.989194835601048e-06, - "loss": 0.2345, + "loss": 0.2353, "step": 153 }, { "epoch": 0.02106703146374829, - "grad_norm": 2.262871999041409, + "grad_norm": 2.200109939090264, "learning_rate": 9.989053181770355e-06, - "loss": 0.2291, + "loss": 0.2324, "step": 154 }, { "epoch": 0.021203830369357045, - "grad_norm": 2.333519412935425, + "grad_norm": 2.072160730331679, "learning_rate": 9.988910606465594e-06, - "loss": 0.2302, + "loss": 0.2283, "step": 155 }, { "epoch": 0.0213406292749658, - "grad_norm": 2.5027911242598906, + "grad_norm": 2.3768130411323996, "learning_rate": 9.988767109713098e-06, - "loss": 0.2644, + "loss": 0.2633, "step": 156 }, { "epoch": 0.021477428180574555, - "grad_norm": 2.269940653373457, + "grad_norm": 2.2206615115409667, "learning_rate": 9.988622691539368e-06, - "loss": 0.2145, + "loss": 0.2167, "step": 157 }, { "epoch": 0.02161422708618331, - "grad_norm": 3.0314833223381163, + "grad_norm": 2.8183012701653203, "learning_rate": 9.988477351971085e-06, - "loss": 0.2825, + "loss": 0.2746, "step": 158 }, { "epoch": 0.021751025991792065, - "grad_norm": 2.514785738421203, + "grad_norm": 2.443610560456794, "learning_rate": 9.988331091035083e-06, - "loss": 0.3211, + "loss": 0.3266, "step": 159 }, { "epoch": 0.02188782489740082, - "grad_norm": 2.523000070026238, + "grad_norm": 2.52199377552239, "learning_rate": 9.988183908758387e-06, - "loss": 0.3041, + "loss": 0.3086, "step": 160 }, { "epoch": 0.022024623803009576, - "grad_norm": 2.442662258355344, + "grad_norm": 2.8507484009941497, "learning_rate": 9.988035805168173e-06, - "loss": 0.2133, + "loss": 0.2181, "step": 161 }, { "epoch": 0.02216142270861833, - "grad_norm": 2.4502320770182036, + "grad_norm": 2.5113367740347865, "learning_rate": 9.9878867802918e-06, - "loss": 0.2724, + "loss": 0.2749, "step": 162 }, { "epoch": 0.022298221614227086, - "grad_norm": 1.9768310865631893, + "grad_norm": 1.8350013097183633, "learning_rate": 9.987736834156792e-06, - "loss": 0.2035, + "loss": 0.2019, "step": 163 }, { "epoch": 0.02243502051983584, - "grad_norm": 2.275178184706036, + "grad_norm": 2.2356890756504395, "learning_rate": 9.987585966790844e-06, - "loss": 0.2588, + "loss": 0.2619, "step": 164 }, { "epoch": 0.022571819425444596, - "grad_norm": 2.0690074354492913, + "grad_norm": 1.98730420410389, "learning_rate": 9.98743417822182e-06, - "loss": 0.2284, + "loss": 0.2305, "step": 165 }, { "epoch": 0.02270861833105335, - "grad_norm": 2.6844644230011196, + "grad_norm": 2.4773529726808814, "learning_rate": 9.987281468477756e-06, - "loss": 0.2871, + "loss": 0.29, "step": 166 }, { "epoch": 0.022845417236662106, - "grad_norm": 2.19620275949746, + "grad_norm": 2.1285152161100602, "learning_rate": 9.987127837586858e-06, - "loss": 0.2204, + "loss": 0.2253, "step": 167 }, { "epoch": 0.02298221614227086, - "grad_norm": 1.754556611391337, + "grad_norm": 1.638275663352354, "learning_rate": 9.9869732855775e-06, - "loss": 0.1956, + "loss": 0.1957, "step": 168 }, { "epoch": 0.023119015047879617, - "grad_norm": 2.016412738480527, + "grad_norm": 1.951337579323693, "learning_rate": 9.986817812478229e-06, - "loss": 0.2529, + "loss": 0.2589, "step": 169 }, { "epoch": 0.023255813953488372, - "grad_norm": 2.4102613500444714, + "grad_norm": 2.3208323896619754, "learning_rate": 9.986661418317759e-06, - "loss": 0.2875, + "loss": 0.2958, "step": 170 }, { "epoch": 0.023392612859097127, - "grad_norm": 2.0513770356168695, + "grad_norm": 2.032185420374012, "learning_rate": 9.986504103124978e-06, - "loss": 0.2017, + "loss": 0.2077, "step": 171 }, { "epoch": 0.023529411764705882, - "grad_norm": 2.490155396601029, + "grad_norm": 2.4200416339948823, "learning_rate": 9.98634586692894e-06, - "loss": 0.2489, + "loss": 0.2508, "step": 172 }, { "epoch": 0.023666210670314637, - "grad_norm": 2.3360663562498933, + "grad_norm": 2.2998523288449624, "learning_rate": 9.986186709758874e-06, - "loss": 0.2601, + "loss": 0.2641, "step": 173 }, { "epoch": 0.023803009575923392, - "grad_norm": 2.1733421578596177, + "grad_norm": 1.9924776377540276, "learning_rate": 9.986026631644173e-06, - "loss": 0.2386, + "loss": 0.2345, "step": 174 }, { "epoch": 0.023939808481532147, - "grad_norm": 2.259259687019056, + "grad_norm": 2.120415885162381, "learning_rate": 9.985865632614407e-06, - "loss": 0.2656, + "loss": 0.2677, "step": 175 }, { "epoch": 0.024076607387140903, - "grad_norm": 2.293407824841788, + "grad_norm": 2.2045022795076643, "learning_rate": 9.985703712699307e-06, - "loss": 0.2483, + "loss": 0.2491, "step": 176 }, { "epoch": 0.024213406292749658, - "grad_norm": 2.3497507735601464, + "grad_norm": 2.3454504176501687, "learning_rate": 9.985540871928784e-06, - "loss": 0.2311, + "loss": 0.2414, "step": 177 }, { "epoch": 0.024350205198358413, - "grad_norm": 2.1681287623053658, + "grad_norm": 2.0281124030117192, "learning_rate": 9.985377110332912e-06, - "loss": 0.2306, + "loss": 0.2301, "step": 178 }, { "epoch": 0.024487004103967168, - "grad_norm": 1.9071236665518898, + "grad_norm": 1.8008414767222318, "learning_rate": 9.98521242794194e-06, - "loss": 0.2225, + "loss": 0.2232, "step": 179 }, { "epoch": 0.024623803009575923, - "grad_norm": 2.0265535601952434, + "grad_norm": 1.9181180647777205, "learning_rate": 9.985046824786283e-06, - "loss": 0.2203, + "loss": 0.2242, "step": 180 }, { "epoch": 0.024760601915184678, - "grad_norm": 2.3145806269533886, + "grad_norm": 2.18030043152172, "learning_rate": 9.984880300896528e-06, - "loss": 0.2473, + "loss": 0.2464, "step": 181 }, { "epoch": 0.024897400820793433, - "grad_norm": 2.6709451702621436, + "grad_norm": 2.609906424811861, "learning_rate": 9.984712856303432e-06, - "loss": 0.2864, + "loss": 0.2829, "step": 182 }, { "epoch": 0.02503419972640219, - "grad_norm": 2.450515793531054, + "grad_norm": 2.320537752104847, "learning_rate": 9.984544491037921e-06, - "loss": 0.2708, + "loss": 0.2717, "step": 183 }, { "epoch": 0.025170998632010944, - "grad_norm": 2.1939218953970134, + "grad_norm": 1.995383448979424, "learning_rate": 9.984375205131096e-06, - "loss": 0.2261, + "loss": 0.2282, "step": 184 }, { "epoch": 0.0253077975376197, - "grad_norm": 2.5027176781595086, + "grad_norm": 2.3603901425049596, "learning_rate": 9.984204998614217e-06, - "loss": 0.2224, + "loss": 0.22, "step": 185 }, { "epoch": 0.025444596443228454, - "grad_norm": 2.3759156908375267, + "grad_norm": 2.2783414912212017, "learning_rate": 9.984033871518727e-06, - "loss": 0.2539, + "loss": 0.2555, "step": 186 }, { "epoch": 0.02558139534883721, - "grad_norm": 2.2457031744851075, + "grad_norm": 2.1663586764648493, "learning_rate": 9.983861823876231e-06, - "loss": 0.2415, + "loss": 0.2386, "step": 187 }, { "epoch": 0.025718194254445964, - "grad_norm": 2.1666328725630137, + "grad_norm": 2.034743157083004, "learning_rate": 9.983688855718504e-06, - "loss": 0.2359, + "loss": 0.2374, "step": 188 }, { "epoch": 0.02585499316005472, - "grad_norm": 2.406514948252065, + "grad_norm": 2.4283690614707347, "learning_rate": 9.983514967077496e-06, - "loss": 0.2726, + "loss": 0.2656, "step": 189 }, { "epoch": 0.025991792065663474, - "grad_norm": 2.0758255115247093, + "grad_norm": 2.045174310313461, "learning_rate": 9.983340157985323e-06, - "loss": 0.2611, + "loss": 0.2663, "step": 190 }, { "epoch": 0.02612859097127223, - "grad_norm": 2.102461984017628, + "grad_norm": 2.056409277447373, "learning_rate": 9.983164428474272e-06, - "loss": 0.257, + "loss": 0.2557, "step": 191 }, { "epoch": 0.026265389876880985, - "grad_norm": 1.9439455943320123, + "grad_norm": 1.8977446682643555, "learning_rate": 9.982987778576802e-06, - "loss": 0.2234, + "loss": 0.2326, "step": 192 }, { "epoch": 0.02640218878248974, - "grad_norm": 1.9036283161213776, + "grad_norm": 1.836444529259727, "learning_rate": 9.982810208325537e-06, - "loss": 0.1869, + "loss": 0.188, "step": 193 }, { "epoch": 0.026538987688098495, - "grad_norm": 2.454261201340711, + "grad_norm": 2.3280151607237345, "learning_rate": 9.982631717753275e-06, - "loss": 0.2231, + "loss": 0.2216, "step": 194 }, { "epoch": 0.02667578659370725, - "grad_norm": 1.9586890034064712, + "grad_norm": 1.8664975809196978, "learning_rate": 9.982452306892983e-06, - "loss": 0.2278, + "loss": 0.2291, "step": 195 }, { "epoch": 0.026812585499316005, - "grad_norm": 2.184542514598025, + "grad_norm": 2.1096617672145723, "learning_rate": 9.9822719757778e-06, - "loss": 0.2471, + "loss": 0.2528, "step": 196 }, { "epoch": 0.02694938440492476, - "grad_norm": 2.6379103160112978, + "grad_norm": 2.006104080141036, "learning_rate": 9.982090724441032e-06, - "loss": 0.2499, + "loss": 0.2514, "step": 197 }, { "epoch": 0.027086183310533515, - "grad_norm": 2.0092329423235875, + "grad_norm": 1.950161073678438, "learning_rate": 9.981908552916152e-06, - "loss": 0.2577, + "loss": 0.2594, "step": 198 }, { "epoch": 0.02722298221614227, - "grad_norm": 1.980445180251724, + "grad_norm": 1.875091544694521, "learning_rate": 9.981725461236814e-06, - "loss": 0.2235, + "loss": 0.2202, "step": 199 }, { "epoch": 0.027359781121751026, - "grad_norm": 2.5549224296354223, + "grad_norm": 2.479094130531841, "learning_rate": 9.98154144943683e-06, - "loss": 0.2402, + "loss": 0.2417, "step": 200 }, { "epoch": 0.027359781121751026, - "eval_loss": 0.24215397238731384, - "eval_runtime": 5.9159, - "eval_samples_per_second": 5.071, - "eval_steps_per_second": 1.352, + "eval_loss": 0.24399584531784058, + "eval_runtime": 5.9083, + "eval_samples_per_second": 5.078, + "eval_steps_per_second": 1.354, "step": 200 }, { "epoch": 0.02749658002735978, - "grad_norm": 2.121152922270143, + "grad_norm": 2.0288665456044144, "learning_rate": 9.981356517550189e-06, - "loss": 0.2066, + "loss": 0.2062, "step": 201 }, { "epoch": 0.027633378932968536, - "grad_norm": 2.059953439881523, + "grad_norm": 2.0225133961919215, "learning_rate": 9.981170665611046e-06, - "loss": 0.2046, + "loss": 0.2094, "step": 202 }, { "epoch": 0.02777017783857729, - "grad_norm": 2.2888508315223044, + "grad_norm": 2.1364836806551253, "learning_rate": 9.980983893653729e-06, - "loss": 0.2409, + "loss": 0.246, "step": 203 }, { "epoch": 0.027906976744186046, - "grad_norm": 2.3109063315151257, + "grad_norm": 2.2375632852514102, "learning_rate": 9.980796201712734e-06, - "loss": 0.2575, + "loss": 0.2636, "step": 204 }, { "epoch": 0.0280437756497948, - "grad_norm": 2.205637051660748, + "grad_norm": 2.125254332115355, "learning_rate": 9.980607589822729e-06, - "loss": 0.258, + "loss": 0.2565, "step": 205 }, { "epoch": 0.028180574555403556, - "grad_norm": 1.957406232418839, + "grad_norm": 1.8868048534471666, "learning_rate": 9.980418058018548e-06, - "loss": 0.213, + "loss": 0.2119, "step": 206 }, { "epoch": 0.02831737346101231, - "grad_norm": 3.544643141781388, + "grad_norm": 1.995263778040028, "learning_rate": 9.980227606335198e-06, - "loss": 0.2363, + "loss": 0.2285, "step": 207 }, { "epoch": 0.028454172366621067, - "grad_norm": 2.8164153656264994, + "grad_norm": 2.7073981241149383, "learning_rate": 9.980036234807858e-06, - "loss": 0.3095, + "loss": 0.3164, "step": 208 }, { "epoch": 0.028590971272229822, - "grad_norm": 2.097614953802278, + "grad_norm": 1.782357641285576, "learning_rate": 9.97984394347187e-06, - "loss": 0.2158, + "loss": 0.2187, "step": 209 }, { "epoch": 0.028727770177838577, - "grad_norm": 1.9669891140376192, + "grad_norm": 1.8248859660503127, "learning_rate": 9.979650732362754e-06, - "loss": 0.2202, + "loss": 0.2216, "step": 210 }, { "epoch": 0.028864569083447332, - "grad_norm": 2.325468978765222, + "grad_norm": 2.1917275892222725, "learning_rate": 9.979456601516192e-06, - "loss": 0.2315, + "loss": 0.2284, "step": 211 }, { "epoch": 0.029001367989056087, - "grad_norm": 2.3111784976419196, + "grad_norm": 2.18417304852076, "learning_rate": 9.979261550968042e-06, - "loss": 0.2668, + "loss": 0.2734, "step": 212 }, { "epoch": 0.029138166894664842, - "grad_norm": 2.4040399140282727, + "grad_norm": 2.252847446593963, "learning_rate": 9.979065580754332e-06, - "loss": 0.2675, + "loss": 0.2708, "step": 213 }, { "epoch": 0.029274965800273597, - "grad_norm": 2.3735710881327745, + "grad_norm": 2.2734618003243114, "learning_rate": 9.978868690911252e-06, - "loss": 0.2743, + "loss": 0.2722, "step": 214 }, { "epoch": 0.029411764705882353, - "grad_norm": 1.6560366710919767, + "grad_norm": 1.6002029698082898, "learning_rate": 9.978670881475173e-06, - "loss": 0.2023, + "loss": 0.204, "step": 215 }, { "epoch": 0.029548563611491108, - "grad_norm": 2.1882671992444354, + "grad_norm": 2.0325983102553873, "learning_rate": 9.978472152482628e-06, - "loss": 0.2003, + "loss": 0.2008, "step": 216 }, { "epoch": 0.029685362517099863, - "grad_norm": 1.7824876543980512, + "grad_norm": 1.737509960275409, "learning_rate": 9.97827250397032e-06, - "loss": 0.2175, + "loss": 0.2199, "step": 217 }, { "epoch": 0.029822161422708618, - "grad_norm": 1.7048892440915757, + "grad_norm": 1.5636157609805281, "learning_rate": 9.978071935975126e-06, - "loss": 0.1928, + "loss": 0.1896, "step": 218 }, { "epoch": 0.029958960328317373, - "grad_norm": 2.0020691101719708, + "grad_norm": 1.887910053768012, "learning_rate": 9.977870448534091e-06, - "loss": 0.2636, + "loss": 0.2604, "step": 219 }, { "epoch": 0.030095759233926128, - "grad_norm": 1.8282613131252383, + "grad_norm": 1.8062126872507862, "learning_rate": 9.97766804168443e-06, - "loss": 0.2357, + "loss": 0.2433, "step": 220 }, { "epoch": 0.030232558139534883, - "grad_norm": 2.0422177444630036, + "grad_norm": 1.990217747747273, "learning_rate": 9.977464715463525e-06, - "loss": 0.233, + "loss": 0.237, "step": 221 }, { "epoch": 0.03036935704514364, - "grad_norm": 2.0230741925118054, + "grad_norm": 1.987346747166886, "learning_rate": 9.977260469908931e-06, - "loss": 0.2427, + "loss": 0.2492, "step": 222 }, { "epoch": 0.030506155950752394, - "grad_norm": 1.7082512424882599, + "grad_norm": 1.6214467577194376, "learning_rate": 9.977055305058374e-06, - "loss": 0.1931, + "loss": 0.1988, "step": 223 }, { "epoch": 0.03064295485636115, - "grad_norm": 1.916917277857068, + "grad_norm": 1.7719376804771028, "learning_rate": 9.976849220949747e-06, - "loss": 0.211, + "loss": 0.2036, "step": 224 }, { "epoch": 0.030779753761969904, - "grad_norm": 2.1408414852248634, + "grad_norm": 1.8689976975996654, "learning_rate": 9.976642217621111e-06, - "loss": 0.1935, + "loss": 0.1907, "step": 225 }, { "epoch": 0.03091655266757866, - "grad_norm": 2.394598607151582, + "grad_norm": 2.2064087125436087, "learning_rate": 9.976434295110702e-06, - "loss": 0.2815, + "loss": 0.28, "step": 226 }, { "epoch": 0.031053351573187414, - "grad_norm": 2.2255123871092946, + "grad_norm": 2.079203767699193, "learning_rate": 9.976225453456923e-06, - "loss": 0.218, + "loss": 0.223, "step": 227 }, { "epoch": 0.03119015047879617, - "grad_norm": 2.143566558124061, + "grad_norm": 2.061634317913584, "learning_rate": 9.976015692698347e-06, "loss": 0.2518, "step": 228 }, { "epoch": 0.031326949384404924, - "grad_norm": 2.0393190117599413, + "grad_norm": 1.926652849527281, "learning_rate": 9.975805012873714e-06, - "loss": 0.2223, + "loss": 0.2208, "step": 229 }, { "epoch": 0.03146374829001368, - "grad_norm": 2.141480639679854, + "grad_norm": 2.009544769703427, "learning_rate": 9.97559341402194e-06, - "loss": 0.2577, + "loss": 0.2552, "step": 230 }, { "epoch": 0.031600547195622435, - "grad_norm": 1.8520404238455603, + "grad_norm": 1.7548908755565247, "learning_rate": 9.975380896182105e-06, - "loss": 0.2017, + "loss": 0.2023, "step": 231 }, { "epoch": 0.03173734610123119, - "grad_norm": 1.7798250555143964, + "grad_norm": 1.7168480371662005, "learning_rate": 9.97516745939346e-06, - "loss": 0.2197, + "loss": 0.2206, "step": 232 }, { "epoch": 0.031874145006839945, - "grad_norm": 2.413651839622086, + "grad_norm": 2.3690370835673424, "learning_rate": 9.974953103695428e-06, - "loss": 0.2701, + "loss": 0.2716, "step": 233 }, { "epoch": 0.0320109439124487, - "grad_norm": 2.2654484184591115, + "grad_norm": 2.220235263154576, "learning_rate": 9.974737829127603e-06, - "loss": 0.2785, + "loss": 0.2744, "step": 234 }, { "epoch": 0.032147742818057455, - "grad_norm": 2.034019488241997, + "grad_norm": 2.025918975857597, "learning_rate": 9.974521635729742e-06, - "loss": 0.1904, + "loss": 0.1928, "step": 235 }, { "epoch": 0.03228454172366621, - "grad_norm": 2.4472711530735034, + "grad_norm": 2.4013994518285813, "learning_rate": 9.974304523541776e-06, - "loss": 0.2728, + "loss": 0.2812, "step": 236 }, { "epoch": 0.032421340629274965, - "grad_norm": 1.737916448209716, + "grad_norm": 1.7463092358759762, "learning_rate": 9.974086492603808e-06, - "loss": 0.1836, + "loss": 0.1834, "step": 237 }, { "epoch": 0.03255813953488372, - "grad_norm": 1.7997301475757768, + "grad_norm": 1.7730506128059664, "learning_rate": 9.973867542956104e-06, - "loss": 0.207, + "loss": 0.2101, "step": 238 }, { "epoch": 0.032694938440492476, - "grad_norm": 1.7770889893807775, + "grad_norm": 1.6988900231590633, "learning_rate": 9.973647674639109e-06, - "loss": 0.2513, + "loss": 0.2538, "step": 239 }, { "epoch": 0.03283173734610123, - "grad_norm": 2.095580553290577, + "grad_norm": 2.0686636139319994, "learning_rate": 9.973426887693429e-06, - "loss": 0.2241, + "loss": 0.2303, "step": 240 }, { "epoch": 0.032968536251709986, - "grad_norm": 2.299712063398233, + "grad_norm": 2.3005663633310367, "learning_rate": 9.973205182159844e-06, - "loss": 0.256, + "loss": 0.2583, "step": 241 }, { "epoch": 0.03310533515731874, - "grad_norm": 2.1728792768624072, + "grad_norm": 2.1148965981887073, "learning_rate": 9.972982558079302e-06, - "loss": 0.2556, + "loss": 0.262, "step": 242 }, { "epoch": 0.033242134062927496, - "grad_norm": 1.787734759741163, + "grad_norm": 1.9106656107218816, "learning_rate": 9.972759015492925e-06, - "loss": 0.1876, + "loss": 0.1926, "step": 243 }, { "epoch": 0.03337893296853625, - "grad_norm": 2.008664464294264, + "grad_norm": 1.9334611221547606, "learning_rate": 9.972534554441997e-06, - "loss": 0.2404, + "loss": 0.2378, "step": 244 }, { "epoch": 0.033515731874145006, - "grad_norm": 1.965656179379691, + "grad_norm": 1.9126089833552902, "learning_rate": 9.972309174967978e-06, - "loss": 0.2436, + "loss": 0.2475, "step": 245 }, { "epoch": 0.03365253077975376, - "grad_norm": 2.425451569508513, + "grad_norm": 1.916246757440947, "learning_rate": 9.972082877112495e-06, - "loss": 0.2055, + "loss": 0.2062, "step": 246 }, { "epoch": 0.03378932968536252, - "grad_norm": 2.122955207758004, + "grad_norm": 2.072724114530999, "learning_rate": 9.971855660917344e-06, - "loss": 0.2439, + "loss": 0.2489, "step": 247 }, { "epoch": 0.03392612859097127, - "grad_norm": 2.4885422494971046, + "grad_norm": 2.442311377668877, "learning_rate": 9.971627526424492e-06, - "loss": 0.272, + "loss": 0.2719, "step": 248 }, { "epoch": 0.03406292749658003, - "grad_norm": 2.8949414958080686, + "grad_norm": 2.7962396909818366, "learning_rate": 9.971398473676076e-06, - "loss": 0.3223, + "loss": 0.3259, "step": 249 }, { "epoch": 0.03419972640218878, - "grad_norm": 1.9585315750918433, + "grad_norm": 1.8698384668138541, "learning_rate": 9.971168502714403e-06, - "loss": 0.2428, + "loss": 0.2466, "step": 250 }, { "epoch": 0.03433652530779754, - "grad_norm": 2.0260954982261894, + "grad_norm": 1.9380773604914265, "learning_rate": 9.970937613581945e-06, "loss": 0.2324, "step": 251 }, { "epoch": 0.03447332421340629, - "grad_norm": 1.9940738439323322, + "grad_norm": 1.8845279863526938, "learning_rate": 9.97070580632135e-06, - "loss": 0.2589, + "loss": 0.2564, "step": 252 }, { "epoch": 0.03461012311901505, - "grad_norm": 2.156360986443096, + "grad_norm": 2.1472728299561576, "learning_rate": 9.970473080975432e-06, - "loss": 0.2662, + "loss": 0.2697, "step": 253 }, { "epoch": 0.0347469220246238, - "grad_norm": 2.5412898385981797, + "grad_norm": 2.4368605477949283, "learning_rate": 9.970239437587173e-06, - "loss": 0.2716, + "loss": 0.2733, "step": 254 }, { "epoch": 0.03488372093023256, - "grad_norm": 1.6812948140698074, + "grad_norm": 1.5770666815151881, "learning_rate": 9.970004876199731e-06, - "loss": 0.2311, + "loss": 0.2296, "step": 255 }, { "epoch": 0.03502051983584131, - "grad_norm": 2.2078884060543613, + "grad_norm": 2.0554633214768283, "learning_rate": 9.969769396856426e-06, - "loss": 0.2707, + "loss": 0.2698, "step": 256 }, { "epoch": 0.03515731874145007, - "grad_norm": 1.6852407847433288, + "grad_norm": 1.6537303166470652, "learning_rate": 9.969532999600751e-06, - "loss": 0.2089, + "loss": 0.2129, "step": 257 }, { "epoch": 0.03529411764705882, - "grad_norm": 2.0539092493704088, + "grad_norm": 1.9763709641096114, "learning_rate": 9.96929568447637e-06, - "loss": 0.2448, + "loss": 0.2465, "step": 258 }, { "epoch": 0.03543091655266758, - "grad_norm": 1.774744314775827, + "grad_norm": 1.766599203921637, "learning_rate": 9.969057451527113e-06, - "loss": 0.1669, + "loss": 0.1672, "step": 259 }, { "epoch": 0.03556771545827633, - "grad_norm": 2.4562556577937222, + "grad_norm": 4.645830095467641, "learning_rate": 9.968818300796983e-06, - "loss": 0.2561, + "loss": 0.2634, "step": 260 }, { "epoch": 0.03570451436388509, - "grad_norm": 2.467086486012255, + "grad_norm": 10.415782252007627, "learning_rate": 9.968578232330151e-06, - "loss": 0.3077, + "loss": 0.3203, "step": 261 }, { "epoch": 0.035841313269493844, - "grad_norm": 2.112357755323075, + "grad_norm": 2.321422532091201, "learning_rate": 9.968337246170956e-06, - "loss": 0.2242, + "loss": 0.2238, "step": 262 }, { "epoch": 0.0359781121751026, - "grad_norm": 2.092512513127108, + "grad_norm": 2.4044835231032096, "learning_rate": 9.968095342363909e-06, - "loss": 0.2282, + "loss": 0.2348, "step": 263 }, { "epoch": 0.036114911080711354, - "grad_norm": 2.011629580180064, + "grad_norm": 2.0176384470642748, "learning_rate": 9.967852520953688e-06, - "loss": 0.2235, + "loss": 0.2268, "step": 264 }, { "epoch": 0.03625170998632011, - "grad_norm": 2.0958663750441904, + "grad_norm": 2.0193787692926906, "learning_rate": 9.967608781985145e-06, - "loss": 0.2297, + "loss": 0.2307, "step": 265 }, { "epoch": 0.036388508891928864, - "grad_norm": 1.6979939263460286, + "grad_norm": 1.6613178749978548, "learning_rate": 9.967364125503297e-06, - "loss": 0.195, + "loss": 0.2019, "step": 266 }, { "epoch": 0.03652530779753762, - "grad_norm": 2.1368144133990694, + "grad_norm": 2.2022667181920275, "learning_rate": 9.96711855155333e-06, - "loss": 0.2386, + "loss": 0.2426, "step": 267 }, { "epoch": 0.036662106703146374, - "grad_norm": 1.6736707817270833, + "grad_norm": 1.9445543918391148, "learning_rate": 9.966872060180601e-06, - "loss": 0.2178, + "loss": 0.2162, "step": 268 }, { "epoch": 0.03679890560875513, - "grad_norm": 2.0905698566513338, + "grad_norm": 2.0695951842688185, "learning_rate": 9.96662465143064e-06, - "loss": 0.2376, + "loss": 0.2405, "step": 269 }, { "epoch": 0.036935704514363885, - "grad_norm": 2.124436709392062, + "grad_norm": 2.093688451603031, "learning_rate": 9.966376325349142e-06, - "loss": 0.2667, + "loss": 0.2712, "step": 270 }, { "epoch": 0.03707250341997264, - "grad_norm": 2.3223866558207473, + "grad_norm": 4.112269060498306, "learning_rate": 9.966127081981973e-06, - "loss": 0.305, + "loss": 0.3146, "step": 271 }, { "epoch": 0.037209302325581395, - "grad_norm": 1.9333057422067041, + "grad_norm": 2.2032135111265214, "learning_rate": 9.965876921375165e-06, - "loss": 0.2056, + "loss": 0.2029, "step": 272 }, { "epoch": 0.03734610123119015, - "grad_norm": 2.7010371594027793, + "grad_norm": 2.732110428569243, "learning_rate": 9.965625843574927e-06, - "loss": 0.2917, + "loss": 0.3047, "step": 273 }, { "epoch": 0.037482900136798905, - "grad_norm": 2.359743114261322, + "grad_norm": 3.2150250451260307, "learning_rate": 9.965373848627631e-06, - "loss": 0.2886, + "loss": 0.3013, "step": 274 }, { "epoch": 0.03761969904240766, - "grad_norm": 2.196611686937064, + "grad_norm": 2.069269278139934, "learning_rate": 9.965120936579819e-06, - "loss": 0.2486, + "loss": 0.2527, "step": 275 }, { "epoch": 0.037756497948016415, - "grad_norm": 2.221486253573749, + "grad_norm": 2.1315109710514832, "learning_rate": 9.964867107478205e-06, - "loss": 0.2245, + "loss": 0.2194, "step": 276 }, { "epoch": 0.03789329685362517, - "grad_norm": 143.33951065173977, + "grad_norm": 2.168952961790728, "learning_rate": 9.964612361369669e-06, - "loss": 1.1755, + "loss": 0.2871, "step": 277 }, { "epoch": 0.038030095759233926, - "grad_norm": 130.67274065362903, + "grad_norm": 1.9980524652780165, "learning_rate": 9.964356698301265e-06, - "loss": 0.7085, + "loss": 0.2815, "step": 278 }, { "epoch": 0.03816689466484268, - "grad_norm": 64.32918136171229, + "grad_norm": 1.7073967156719894, "learning_rate": 9.964100118320213e-06, - "loss": 0.4809, + "loss": 0.209, "step": 279 }, { "epoch": 0.038303693570451436, - "grad_norm": 2.6494543411013893, + "grad_norm": 1.949277054281621, "learning_rate": 9.963842621473902e-06, - "loss": 0.1995, + "loss": 0.1817, "step": 280 }, { "epoch": 0.03844049247606019, - "grad_norm": 2.8212837096173287, + "grad_norm": 1.8397918621586407, "learning_rate": 9.963584207809893e-06, - "loss": 0.2074, + "loss": 0.1927, "step": 281 }, { "epoch": 0.038577291381668946, - "grad_norm": 2.4030187459997085, + "grad_norm": 2.3464195748569003, "learning_rate": 9.963324877375913e-06, - "loss": 0.2921, + "loss": 0.2906, "step": 282 }, { "epoch": 0.0387140902872777, - "grad_norm": 2.296243991814418, + "grad_norm": 2.239362729912614, "learning_rate": 9.963064630219863e-06, - "loss": 0.2105, + "loss": 0.2022, "step": 283 }, { "epoch": 0.038850889192886456, - "grad_norm": 2.2720694097646694, + "grad_norm": 1.790427259817598, "learning_rate": 9.962803466389807e-06, - "loss": 0.2095, + "loss": 0.1982, "step": 284 }, { "epoch": 0.03898768809849521, - "grad_norm": 2.579705408238439, + "grad_norm": 1.92466884123947, "learning_rate": 9.962541385933985e-06, - "loss": 0.2661, + "loss": 0.2476, "step": 285 }, { "epoch": 0.03912448700410397, - "grad_norm": 1.7366968259366162, + "grad_norm": 1.5636214879471446, "learning_rate": 9.9622783889008e-06, - "loss": 0.1843, + "loss": 0.1772, "step": 286 }, { "epoch": 0.03926128590971272, - "grad_norm": 1.9090804900655005, + "grad_norm": 1.776621635988747, "learning_rate": 9.96201447533883e-06, - "loss": 0.2208, + "loss": 0.2222, "step": 287 }, { "epoch": 0.03939808481532148, - "grad_norm": 1.878528824650176, + "grad_norm": 1.9283603160284128, "learning_rate": 9.961749645296818e-06, - "loss": 0.2053, + "loss": 0.203, "step": 288 }, { "epoch": 0.03953488372093023, - "grad_norm": 2.1500775177206553, + "grad_norm": 1.9282454124168786, "learning_rate": 9.961483898823679e-06, - "loss": 0.246, + "loss": 0.2368, "step": 289 }, { "epoch": 0.03967168262653899, - "grad_norm": 1.789565833201305, + "grad_norm": 1.822002293207974, "learning_rate": 9.961217235968494e-06, - "loss": 0.2218, + "loss": 0.2236, "step": 290 }, { "epoch": 0.03980848153214774, - "grad_norm": 2.149697506311854, + "grad_norm": 1.9579460627466527, "learning_rate": 9.960949656780517e-06, - "loss": 0.2575, + "loss": 0.2547, "step": 291 }, { "epoch": 0.0399452804377565, - "grad_norm": 2.013842831981254, + "grad_norm": 1.9820345063116875, "learning_rate": 9.960681161309169e-06, - "loss": 0.2138, + "loss": 0.2163, "step": 292 }, { "epoch": 0.04008207934336525, - "grad_norm": 2.158035360781972, + "grad_norm": 1.9988214354429958, "learning_rate": 9.960411749604043e-06, - "loss": 0.2325, + "loss": 0.2312, "step": 293 }, { "epoch": 0.04021887824897401, - "grad_norm": 1.7899173321854651, + "grad_norm": 1.679371406667924, "learning_rate": 9.960141421714897e-06, - "loss": 0.2047, + "loss": 0.2058, "step": 294 }, { "epoch": 0.04035567715458276, - "grad_norm": 1.910098576180444, + "grad_norm": 3.2183885944290744, "learning_rate": 9.959870177691662e-06, - "loss": 0.2577, + "loss": 0.263, "step": 295 }, { "epoch": 0.04049247606019152, - "grad_norm": 1.6563987122728452, + "grad_norm": 1.6443039131048283, "learning_rate": 9.959598017584433e-06, - "loss": 0.1752, + "loss": 0.1803, "step": 296 }, { "epoch": 0.04062927496580027, - "grad_norm": 1.9662184218503622, + "grad_norm": 1.9511444325007314, "learning_rate": 9.959324941443482e-06, - "loss": 0.224, + "loss": 0.2231, "step": 297 }, { "epoch": 0.04076607387140903, - "grad_norm": 1.8241769267013732, + "grad_norm": 1.8488068285074206, "learning_rate": 9.959050949319244e-06, - "loss": 0.2425, + "loss": 0.247, "step": 298 }, { "epoch": 0.04090287277701778, - "grad_norm": 2.0181256694217926, + "grad_norm": 1.9498023610114024, "learning_rate": 9.958776041262325e-06, - "loss": 0.2253, + "loss": 0.2289, "step": 299 }, { "epoch": 0.04103967168262654, - "grad_norm": 2.056023361432569, + "grad_norm": 1.9128205453654543, "learning_rate": 9.9585002173235e-06, - "loss": 0.2016, + "loss": 0.191, "step": 300 }, { "epoch": 0.04103967168262654, - "eval_loss": 0.23443549871444702, - "eval_runtime": 5.9209, - "eval_samples_per_second": 5.067, - "eval_steps_per_second": 1.351, + "eval_loss": 0.23458829522132874, + "eval_runtime": 5.932, + "eval_samples_per_second": 5.057, + "eval_steps_per_second": 1.349, "step": 300 }, { "epoch": 0.041176470588235294, - "grad_norm": 1.844975748763809, + "grad_norm": 1.778750944349959, "learning_rate": 9.958223477553715e-06, - "loss": 0.2044, + "loss": 0.2065, "step": 301 }, { "epoch": 0.04131326949384405, - "grad_norm": 2.0135411079868644, + "grad_norm": 2.054647182306007, "learning_rate": 9.957945822004082e-06, - "loss": 0.2235, + "loss": 0.2266, "step": 302 }, { "epoch": 0.041450068399452804, - "grad_norm": 2.100449373176096, + "grad_norm": 2.009864784206668, "learning_rate": 9.957667250725883e-06, - "loss": 0.2377, + "loss": 0.2418, "step": 303 }, { "epoch": 0.04158686730506156, - "grad_norm": 2.2132922953188743, + "grad_norm": 2.1679551253720346, "learning_rate": 9.957387763770574e-06, - "loss": 0.2553, + "loss": 0.2574, "step": 304 }, { "epoch": 0.041723666210670314, - "grad_norm": 2.2451346321602936, + "grad_norm": 2.0958074503538016, "learning_rate": 9.957107361189772e-06, - "loss": 0.2332, + "loss": 0.2292, "step": 305 }, { "epoch": 0.04186046511627907, - "grad_norm": 2.1842854489853525, + "grad_norm": 2.109933716374929, "learning_rate": 9.956826043035268e-06, - "loss": 0.2433, + "loss": 0.2464, "step": 306 }, { "epoch": 0.041997264021887824, - "grad_norm": 2.0933603159626823, + "grad_norm": 2.046131043478915, "learning_rate": 9.956543809359022e-06, - "loss": 0.2509, + "loss": 0.256, "step": 307 }, { "epoch": 0.04213406292749658, - "grad_norm": 1.759295392675799, + "grad_norm": 1.6909610153425558, "learning_rate": 9.956260660213163e-06, - "loss": 0.1944, + "loss": 0.1939, "step": 308 }, { "epoch": 0.042270861833105335, - "grad_norm": 2.0157758275491284, + "grad_norm": 1.8798225433689015, "learning_rate": 9.955976595649986e-06, - "loss": 0.2534, + "loss": 0.2581, "step": 309 }, { "epoch": 0.04240766073871409, - "grad_norm": 1.83291076511772, + "grad_norm": 1.730264163650525, "learning_rate": 9.95569161572196e-06, - "loss": 0.1988, + "loss": 0.2018, "step": 310 }, { "epoch": 0.042544459644322845, - "grad_norm": 2.0340069296787098, + "grad_norm": 2.0058631205324753, "learning_rate": 9.955405720481719e-06, - "loss": 0.1847, + "loss": 0.1861, "step": 311 }, { "epoch": 0.0426812585499316, - "grad_norm": 1.731485603958649, + "grad_norm": 1.665273581274879, "learning_rate": 9.955118909982067e-06, - "loss": 0.2262, + "loss": 0.2265, "step": 312 }, { "epoch": 0.042818057455540355, - "grad_norm": 2.0965494576326935, + "grad_norm": 2.1246878903139743, "learning_rate": 9.95483118427598e-06, - "loss": 0.2083, + "loss": 0.2109, "step": 313 }, { "epoch": 0.04295485636114911, - "grad_norm": 2.143382182910062, + "grad_norm": 2.0798557710529204, "learning_rate": 9.954542543416599e-06, - "loss": 0.2525, + "loss": 0.2574, "step": 314 }, { "epoch": 0.043091655266757865, - "grad_norm": 1.5752509286270873, + "grad_norm": 1.4716776854117366, "learning_rate": 9.954252987457236e-06, - "loss": 0.1629, + "loss": 0.1599, "step": 315 }, { "epoch": 0.04322845417236662, - "grad_norm": 2.44327796661116, + "grad_norm": 2.2634129712860704, "learning_rate": 9.953962516451373e-06, - "loss": 0.2349, + "loss": 0.2342, "step": 316 }, { "epoch": 0.043365253077975376, - "grad_norm": 2.286750411652927, + "grad_norm": 2.1857595078803254, "learning_rate": 9.953671130452657e-06, - "loss": 0.2237, + "loss": 0.2174, "step": 317 }, { "epoch": 0.04350205198358413, - "grad_norm": 2.24161844806648, + "grad_norm": 2.173999637858282, "learning_rate": 9.953378829514908e-06, - "loss": 0.2817, + "loss": 0.2844, "step": 318 }, { "epoch": 0.043638850889192886, - "grad_norm": 1.885942760755679, + "grad_norm": 1.829541895868393, "learning_rate": 9.953085613692116e-06, - "loss": 0.2418, + "loss": 0.2417, "step": 319 }, { "epoch": 0.04377564979480164, - "grad_norm": 2.001995941469455, + "grad_norm": 1.8241085191100084, "learning_rate": 9.952791483038435e-06, - "loss": 0.228, + "loss": 0.2256, "step": 320 }, { "epoch": 0.043912448700410396, - "grad_norm": 2.0947207864045776, + "grad_norm": 2.0507969460459035, "learning_rate": 9.952496437608192e-06, - "loss": 0.2219, + "loss": 0.2209, "step": 321 }, { "epoch": 0.04404924760601915, - "grad_norm": 2.2271139065862897, + "grad_norm": 2.15669194067522, "learning_rate": 9.952200477455882e-06, - "loss": 0.2333, + "loss": 0.2389, "step": 322 }, { "epoch": 0.044186046511627906, - "grad_norm": 1.9997414872437538, + "grad_norm": 1.9141438867592786, "learning_rate": 9.951903602636166e-06, - "loss": 0.2673, + "loss": 0.2653, "step": 323 }, { "epoch": 0.04432284541723666, - "grad_norm": 1.9102399374134824, + "grad_norm": 1.828622363784387, "learning_rate": 9.95160581320388e-06, - "loss": 0.2614, + "loss": 0.2589, "step": 324 }, { "epoch": 0.04445964432284542, - "grad_norm": 1.8020728593839417, + "grad_norm": 1.754515166171007, "learning_rate": 9.951307109214024e-06, - "loss": 0.1838, + "loss": 0.1853, "step": 325 }, { "epoch": 0.04459644322845417, - "grad_norm": 1.8539073579226673, + "grad_norm": 1.729300501859657, "learning_rate": 9.951007490721766e-06, - "loss": 0.1966, + "loss": 0.198, "step": 326 }, { "epoch": 0.04473324213406293, - "grad_norm": 2.074516445768306, + "grad_norm": 1.9244895839659504, "learning_rate": 9.95070695778245e-06, - "loss": 0.2526, + "loss": 0.2479, "step": 327 }, { "epoch": 0.04487004103967168, - "grad_norm": 1.6216533911647293, + "grad_norm": 1.6079005952646885, "learning_rate": 9.950405510451579e-06, - "loss": 0.2126, + "loss": 0.2183, "step": 328 }, { "epoch": 0.04500683994528044, - "grad_norm": 2.1027931743155035, + "grad_norm": 2.0324732679850426, "learning_rate": 9.950103148784835e-06, - "loss": 0.2115, + "loss": 0.2124, "step": 329 }, { "epoch": 0.04514363885088919, - "grad_norm": 2.1021951074484138, + "grad_norm": 2.025853091507491, "learning_rate": 9.949799872838062e-06, - "loss": 0.2197, + "loss": 0.2161, "step": 330 }, { "epoch": 0.04528043775649795, - "grad_norm": 1.8934179810730225, + "grad_norm": 1.8344090840567413, "learning_rate": 9.949495682667274e-06, - "loss": 0.2618, + "loss": 0.2617, "step": 331 }, { "epoch": 0.0454172366621067, - "grad_norm": 1.81398987540968, + "grad_norm": 1.67927934991538, "learning_rate": 9.949190578328655e-06, - "loss": 0.2173, + "loss": 0.2085, "step": 332 }, { "epoch": 0.04555403556771546, - "grad_norm": 2.498915347282016, + "grad_norm": 2.4273999476971655, "learning_rate": 9.948884559878558e-06, - "loss": 0.2519, + "loss": 0.251, "step": 333 }, { "epoch": 0.04569083447332421, - "grad_norm": 1.668344917940554, + "grad_norm": 1.6107292389254773, "learning_rate": 9.948577627373503e-06, - "loss": 0.1922, + "loss": 0.1903, "step": 334 }, { "epoch": 0.04582763337893297, - "grad_norm": 1.6615844882943134, + "grad_norm": 1.5972724979989572, "learning_rate": 9.948269780870183e-06, "loss": 0.2099, "step": 335 }, { "epoch": 0.04596443228454172, - "grad_norm": 1.9266695871666053, + "grad_norm": 1.811431642605157, "learning_rate": 9.947961020425454e-06, - "loss": 0.2324, + "loss": 0.2344, "step": 336 }, { "epoch": 0.04610123119015048, - "grad_norm": 1.7971176105318436, + "grad_norm": 1.821887800694013, "learning_rate": 9.947651346096347e-06, - "loss": 0.2118, + "loss": 0.2175, "step": 337 }, { "epoch": 0.04623803009575923, - "grad_norm": 2.040936577297317, + "grad_norm": 2.030737191733337, "learning_rate": 9.947340757940053e-06, - "loss": 0.262, + "loss": 0.2582, "step": 338 }, { "epoch": 0.04637482900136799, - "grad_norm": 1.9472150972138946, + "grad_norm": 1.8739031627783413, "learning_rate": 9.947029256013946e-06, - "loss": 0.2601, + "loss": 0.2597, "step": 339 }, { "epoch": 0.046511627906976744, - "grad_norm": 2.2190629950291503, + "grad_norm": 2.1438247290517336, "learning_rate": 9.946716840375552e-06, - "loss": 0.3, + "loss": 0.3035, "step": 340 }, { "epoch": 0.0466484268125855, - "grad_norm": 2.011305223468965, + "grad_norm": 1.9407987937014775, "learning_rate": 9.946403511082579e-06, - "loss": 0.2525, + "loss": 0.2515, "step": 341 }, { "epoch": 0.046785225718194254, - "grad_norm": 2.236356592015844, + "grad_norm": 2.156210626393506, "learning_rate": 9.946089268192894e-06, - "loss": 0.289, + "loss": 0.2884, "step": 342 }, { "epoch": 0.04692202462380301, - "grad_norm": 2.0315394212511224, + "grad_norm": 2.003960521048137, "learning_rate": 9.945774111764542e-06, - "loss": 0.2346, + "loss": 0.2363, "step": 343 }, { "epoch": 0.047058823529411764, - "grad_norm": 2.028003180979977, + "grad_norm": 1.9175224817440062, "learning_rate": 9.945458041855732e-06, - "loss": 0.2224, + "loss": 0.2227, "step": 344 }, { "epoch": 0.04719562243502052, - "grad_norm": 2.0331195782593205, + "grad_norm": 1.997998735589759, "learning_rate": 9.945141058524836e-06, - "loss": 0.2922, + "loss": 0.2933, "step": 345 }, { "epoch": 0.047332421340629274, - "grad_norm": 2.013640438116337, + "grad_norm": 1.9033144241379996, "learning_rate": 9.944823161830408e-06, - "loss": 0.2871, + "loss": 0.2855, "step": 346 }, { "epoch": 0.04746922024623803, - "grad_norm": 1.56843629318218, + "grad_norm": 1.4798239821392603, "learning_rate": 9.944504351831161e-06, - "loss": 0.17, + "loss": 0.1704, "step": 347 }, { "epoch": 0.047606019151846785, - "grad_norm": 1.4921992372332498, + "grad_norm": 1.4590539465227648, "learning_rate": 9.944184628585976e-06, - "loss": 0.2033, + "loss": 0.2057, "step": 348 }, { "epoch": 0.04774281805745554, - "grad_norm": 2.4478503112395233, + "grad_norm": 2.349983883113196, "learning_rate": 9.943863992153906e-06, - "loss": 0.3013, + "loss": 0.3041, "step": 349 }, { "epoch": 0.047879616963064295, - "grad_norm": 2.019040608318348, + "grad_norm": 1.9786102604764306, "learning_rate": 9.943542442594177e-06, - "loss": 0.2295, + "loss": 0.2297, "step": 350 }, { "epoch": 0.04801641586867305, - "grad_norm": 1.5204434710725532, + "grad_norm": 1.4367426965970163, "learning_rate": 9.943219979966175e-06, - "loss": 0.2035, + "loss": 0.2028, "step": 351 }, { "epoch": 0.048153214774281805, - "grad_norm": 2.02047919443974, + "grad_norm": 1.9263489042890374, "learning_rate": 9.94289660432946e-06, - "loss": 0.287, + "loss": 0.2836, "step": 352 }, { "epoch": 0.04829001367989056, - "grad_norm": 1.6359737102188687, + "grad_norm": 1.6097095453406145, "learning_rate": 9.942572315743758e-06, - "loss": 0.2065, + "loss": 0.2026, "step": 353 }, { "epoch": 0.048426812585499315, - "grad_norm": 1.7812270153115595, + "grad_norm": 1.7623912363171326, "learning_rate": 9.942247114268964e-06, - "loss": 0.2427, + "loss": 0.2435, "step": 354 }, { "epoch": 0.04856361149110807, - "grad_norm": 1.888800068768115, + "grad_norm": 1.8464179020786433, "learning_rate": 9.941920999965146e-06, - "loss": 0.2049, + "loss": 0.203, "step": 355 }, { "epoch": 0.048700410396716826, - "grad_norm": 1.6969961049186935, + "grad_norm": 1.663335860127645, "learning_rate": 9.941593972892533e-06, - "loss": 0.2197, + "loss": 0.2222, "step": 356 }, { "epoch": 0.04883720930232558, - "grad_norm": 1.57545297987961, + "grad_norm": 1.538106065334765, "learning_rate": 9.94126603311153e-06, - "loss": 0.2187, + "loss": 0.2184, "step": 357 }, { "epoch": 0.048974008207934336, - "grad_norm": 1.5934061602214038, + "grad_norm": 1.5654815043689074, "learning_rate": 9.940937180682707e-06, - "loss": 0.2409, + "loss": 0.241, "step": 358 }, { "epoch": 0.04911080711354309, - "grad_norm": 2.036139007915374, + "grad_norm": 1.9234804711490059, "learning_rate": 9.9406074156668e-06, - "loss": 0.2081, + "loss": 0.2089, "step": 359 }, { "epoch": 0.049247606019151846, - "grad_norm": 1.7904739985073308, + "grad_norm": 1.6758029165459014, "learning_rate": 9.940276738124718e-06, - "loss": 0.201, + "loss": 0.1986, "step": 360 }, { "epoch": 0.0493844049247606, - "grad_norm": 1.5805354335049175, + "grad_norm": 1.5377715965242809, "learning_rate": 9.939945148117537e-06, - "loss": 0.2128, + "loss": 0.2112, "step": 361 }, { "epoch": 0.049521203830369356, - "grad_norm": 2.101569454164615, + "grad_norm": 2.0712441180500565, "learning_rate": 9.9396126457065e-06, - "loss": 0.3155, + "loss": 0.3172, "step": 362 }, { "epoch": 0.04965800273597811, - "grad_norm": 1.7105452252734583, + "grad_norm": 1.6745144196538269, "learning_rate": 9.939279230953024e-06, - "loss": 0.2272, + "loss": 0.2299, "step": 363 }, { "epoch": 0.04979480164158687, - "grad_norm": 2.1797086684149916, + "grad_norm": 2.09404627800826, "learning_rate": 9.938944903918687e-06, - "loss": 0.258, + "loss": 0.2589, "step": 364 }, { "epoch": 0.04993160054719562, - "grad_norm": 1.817224012058259, + "grad_norm": 1.7846049986373285, "learning_rate": 9.938609664665237e-06, - "loss": 0.2536, + "loss": 0.2588, "step": 365 }, { "epoch": 0.05006839945280438, - "grad_norm": 2.0328919425201315, + "grad_norm": 1.9773403462096837, "learning_rate": 9.938273513254597e-06, - "loss": 0.2124, + "loss": 0.2095, "step": 366 }, { "epoch": 0.05020519835841313, - "grad_norm": 1.759137917352943, + "grad_norm": 1.7097408194897106, "learning_rate": 9.93793644974885e-06, - "loss": 0.2133, + "loss": 0.213, "step": 367 }, { "epoch": 0.05034199726402189, - "grad_norm": 1.932832572519704, + "grad_norm": 1.8262281853435829, "learning_rate": 9.937598474210254e-06, - "loss": 0.2176, + "loss": 0.22, "step": 368 }, { "epoch": 0.05047879616963064, - "grad_norm": 2.0525298798141884, + "grad_norm": 1.9206064570944246, "learning_rate": 9.937259586701233e-06, - "loss": 0.2046, + "loss": 0.2048, "step": 369 }, { "epoch": 0.0506155950752394, - "grad_norm": 1.7836631926424744, + "grad_norm": 1.7685074223217374, "learning_rate": 9.936919787284378e-06, - "loss": 0.2076, + "loss": 0.2085, "step": 370 }, { "epoch": 0.05075239398084815, - "grad_norm": 1.9825893179616614, + "grad_norm": 1.914011994872317, "learning_rate": 9.93657907602245e-06, - "loss": 0.2388, + "loss": 0.244, "step": 371 }, { "epoch": 0.05088919288645691, - "grad_norm": 1.47648132682407, + "grad_norm": 1.4759192222393895, "learning_rate": 9.936237452978376e-06, - "loss": 0.203, + "loss": 0.2079, "step": 372 }, { "epoch": 0.05102599179206566, - "grad_norm": 1.7769671642716924, + "grad_norm": 1.7461634650660636, "learning_rate": 9.93589491821526e-06, - "loss": 0.2102, + "loss": 0.2097, "step": 373 }, { "epoch": 0.05116279069767442, - "grad_norm": 1.7541393042452413, + "grad_norm": 1.6897449224968506, "learning_rate": 9.935551471796358e-06, - "loss": 0.2124, + "loss": 0.2067, "step": 374 }, { "epoch": 0.05129958960328317, - "grad_norm": 1.9385604124562659, + "grad_norm": 1.9089949163019013, "learning_rate": 9.935207113785112e-06, - "loss": 0.2563, + "loss": 0.262, "step": 375 }, { "epoch": 0.05143638850889193, - "grad_norm": 2.13781528126699, + "grad_norm": 2.0119250002751063, "learning_rate": 9.934861844245123e-06, - "loss": 0.2188, + "loss": 0.2161, "step": 376 }, { "epoch": 0.05157318741450068, - "grad_norm": 1.9610837438877449, + "grad_norm": 1.9074782995622095, "learning_rate": 9.934515663240161e-06, - "loss": 0.2334, + "loss": 0.235, "step": 377 }, { "epoch": 0.05170998632010944, - "grad_norm": 1.878248092546771, + "grad_norm": 1.7923253898070062, "learning_rate": 9.934168570834166e-06, - "loss": 0.2202, + "loss": 0.2217, "step": 378 }, { "epoch": 0.051846785225718194, - "grad_norm": 2.0348780415310057, + "grad_norm": 1.932161062129736, "learning_rate": 9.933820567091244e-06, - "loss": 0.2845, + "loss": 0.2775, "step": 379 }, { "epoch": 0.05198358413132695, - "grad_norm": 1.6480047241258025, + "grad_norm": 1.65428446892274, "learning_rate": 9.933471652075673e-06, - "loss": 0.2186, + "loss": 0.2185, "step": 380 }, { "epoch": 0.052120383036935704, - "grad_norm": 1.9844998739577888, + "grad_norm": 1.9222428969098972, "learning_rate": 9.933121825851896e-06, - "loss": 0.2587, + "loss": 0.256, "step": 381 }, { "epoch": 0.05225718194254446, - "grad_norm": 1.9328091383056798, + "grad_norm": 1.8685398345629887, "learning_rate": 9.932771088484528e-06, - "loss": 0.2685, + "loss": 0.2682, "step": 382 }, { "epoch": 0.052393980848153214, - "grad_norm": 1.902267525117071, + "grad_norm": 1.9063802690819556, "learning_rate": 9.932419440038348e-06, - "loss": 0.2653, + "loss": 0.2708, "step": 383 }, { "epoch": 0.05253077975376197, - "grad_norm": 1.5567090323835877, + "grad_norm": 1.5366119272979006, "learning_rate": 9.932066880578304e-06, - "loss": 0.1833, + "loss": 0.1865, "step": 384 }, { "epoch": 0.052667578659370724, - "grad_norm": 1.7126506198637272, + "grad_norm": 1.617992464250059, "learning_rate": 9.931713410169515e-06, - "loss": 0.2214, + "loss": 0.2197, "step": 385 }, { "epoch": 0.05280437756497948, - "grad_norm": 1.6682339188950361, + "grad_norm": 1.612974839752367, "learning_rate": 9.931359028877268e-06, - "loss": 0.221, + "loss": 0.219, "step": 386 }, { "epoch": 0.052941176470588235, - "grad_norm": 2.1552618689783376, + "grad_norm": 2.098022545562184, "learning_rate": 9.931003736767013e-06, - "loss": 0.2763, + "loss": 0.2758, "step": 387 }, { "epoch": 0.05307797537619699, - "grad_norm": 1.8565578824767337, + "grad_norm": 1.769578973486012, "learning_rate": 9.930647533904377e-06, - "loss": 0.2137, + "loss": 0.2132, "step": 388 }, { "epoch": 0.053214774281805745, - "grad_norm": 1.4955953615644402, + "grad_norm": 1.4806503805221316, "learning_rate": 9.930290420355147e-06, - "loss": 0.2023, + "loss": 0.2021, "step": 389 }, { "epoch": 0.0533515731874145, - "grad_norm": 2.0627834916790144, + "grad_norm": 2.0031623874848834, "learning_rate": 9.929932396185282e-06, - "loss": 0.2717, + "loss": 0.2732, "step": 390 }, { "epoch": 0.053488372093023255, - "grad_norm": 1.7254619575726622, + "grad_norm": 1.6078039723798214, "learning_rate": 9.92957346146091e-06, - "loss": 0.1993, + "loss": 0.1939, "step": 391 }, { "epoch": 0.05362517099863201, - "grad_norm": 1.6036634463276447, + "grad_norm": 1.512347934892868, "learning_rate": 9.929213616248325e-06, - "loss": 0.1918, + "loss": 0.1893, "step": 392 }, { "epoch": 0.053761969904240765, - "grad_norm": 2.1026934852795542, + "grad_norm": 1.978124139193322, "learning_rate": 9.928852860613992e-06, - "loss": 0.2511, + "loss": 0.2483, "step": 393 }, { "epoch": 0.05389876880984952, - "grad_norm": 1.9624555498891612, + "grad_norm": 1.8964376393070868, "learning_rate": 9.92849119462454e-06, - "loss": 0.2139, + "loss": 0.2122, "step": 394 }, { "epoch": 0.054035567715458276, - "grad_norm": 1.6640256885817828, + "grad_norm": 1.6031783802638058, "learning_rate": 9.928128618346768e-06, - "loss": 0.1862, + "loss": 0.1841, "step": 395 }, { "epoch": 0.05417236662106703, - "grad_norm": 2.0004562382315694, + "grad_norm": 1.8686790653929288, "learning_rate": 9.927765131847644e-06, - "loss": 0.2549, + "loss": 0.2531, "step": 396 }, { "epoch": 0.054309165526675786, - "grad_norm": 2.0034287840388876, + "grad_norm": 1.949845040411865, "learning_rate": 9.927400735194306e-06, - "loss": 0.2379, + "loss": 0.2399, "step": 397 }, { "epoch": 0.05444596443228454, - "grad_norm": 1.631616712945304, + "grad_norm": 1.5994289356118738, "learning_rate": 9.927035428454056e-06, - "loss": 0.2016, + "loss": 0.2025, "step": 398 }, { "epoch": 0.054582763337893296, - "grad_norm": 1.7251661258685886, + "grad_norm": 1.668770850504487, "learning_rate": 9.926669211694367e-06, - "loss": 0.2279, + "loss": 0.2312, "step": 399 }, { "epoch": 0.05471956224350205, - "grad_norm": 1.9783733581889387, + "grad_norm": 1.865584140937034, "learning_rate": 9.926302084982876e-06, - "loss": 0.2508, + "loss": 0.2466, "step": 400 }, { "epoch": 0.05471956224350205, - "eval_loss": 0.22886702418327332, - "eval_runtime": 5.9101, - "eval_samples_per_second": 5.076, + "eval_loss": 0.23273536562919617, + "eval_runtime": 5.9074, + "eval_samples_per_second": 5.078, "eval_steps_per_second": 1.354, "step": 400 }, { "epoch": 0.054856361149110806, - "grad_norm": 2.8581333099656607, + "grad_norm": 2.7567185745723224, "learning_rate": 9.925934048387393e-06, - "loss": 0.3753, + "loss": 0.3787, "step": 401 }, { "epoch": 0.05499316005471956, - "grad_norm": 1.785197144630871, + "grad_norm": 1.7280415276933512, "learning_rate": 9.925565101975894e-06, - "loss": 0.2282, + "loss": 0.2296, "step": 402 }, { "epoch": 0.05512995896032832, - "grad_norm": 2.0696398471103974, + "grad_norm": 1.9548266644524048, "learning_rate": 9.925195245816523e-06, - "loss": 0.2082, + "loss": 0.2055, "step": 403 }, { "epoch": 0.05526675786593707, - "grad_norm": 1.7059912680107066, + "grad_norm": 1.5899151013602735, "learning_rate": 9.924824479977594e-06, - "loss": 0.2086, + "loss": 0.2076, "step": 404 }, { "epoch": 0.05540355677154583, - "grad_norm": 1.855132558346605, + "grad_norm": 1.69316286868124, "learning_rate": 9.924452804527582e-06, - "loss": 0.2014, + "loss": 0.2062, "step": 405 }, { "epoch": 0.05554035567715458, - "grad_norm": 1.752199125131758, + "grad_norm": 1.9164499463858637, "learning_rate": 9.924080219535142e-06, - "loss": 0.2269, + "loss": 0.2277, "step": 406 }, { "epoch": 0.05567715458276334, - "grad_norm": 1.9165818295697479, + "grad_norm": 2.4307770188306588, "learning_rate": 9.923706725069084e-06, - "loss": 0.2235, + "loss": 0.2251, "step": 407 }, { "epoch": 0.05581395348837209, - "grad_norm": 2.002341471135681, + "grad_norm": 1.908482431833718, "learning_rate": 9.923332321198396e-06, - "loss": 0.2768, + "loss": 0.2757, "step": 408 }, { "epoch": 0.05595075239398085, - "grad_norm": 1.9192904422040038, + "grad_norm": 1.8579683791238957, "learning_rate": 9.922957007992229e-06, - "loss": 0.236, + "loss": 0.2341, "step": 409 }, { "epoch": 0.0560875512995896, - "grad_norm": 2.081120231855, + "grad_norm": 1.9372886837488377, "learning_rate": 9.922580785519903e-06, - "loss": 0.2363, + "loss": 0.2289, "step": 410 }, { "epoch": 0.05622435020519836, - "grad_norm": 1.98362065697721, + "grad_norm": 3.4869251638604526, "learning_rate": 9.922203653850905e-06, - "loss": 0.2939, + "loss": 0.2968, "step": 411 }, { "epoch": 0.05636114911080711, - "grad_norm": 1.9580021078922085, + "grad_norm": 1.9243865282318406, "learning_rate": 9.921825613054892e-06, - "loss": 0.2422, + "loss": 0.2423, "step": 412 }, { "epoch": 0.05649794801641587, - "grad_norm": 1.7272681630819917, + "grad_norm": 1.6761647867111908, "learning_rate": 9.921446663201689e-06, - "loss": 0.2324, + "loss": 0.2275, "step": 413 }, { "epoch": 0.05663474692202462, - "grad_norm": 1.986884453967158, + "grad_norm": 2.042188603014469, "learning_rate": 9.921066804361285e-06, - "loss": 0.2637, + "loss": 0.2684, "step": 414 }, { "epoch": 0.05677154582763338, - "grad_norm": 1.781780568089038, + "grad_norm": 1.8075400076867985, "learning_rate": 9.92068603660384e-06, - "loss": 0.2126, + "loss": 0.2155, "step": 415 }, { "epoch": 0.05690834473324213, - "grad_norm": 1.7912551260549756, + "grad_norm": 1.7969115547634351, "learning_rate": 9.920304359999683e-06, - "loss": 0.1943, + "loss": 0.1976, "step": 416 }, { "epoch": 0.05704514363885089, - "grad_norm": 2.201845098417249, + "grad_norm": 2.1476606697535523, "learning_rate": 9.91992177461931e-06, - "loss": 0.2824, + "loss": 0.2826, "step": 417 }, { "epoch": 0.057181942544459644, - "grad_norm": 1.786908813911672, + "grad_norm": 1.710045332922965, "learning_rate": 9.919538280533383e-06, - "loss": 0.2147, + "loss": 0.2121, "step": 418 }, { "epoch": 0.0573187414500684, - "grad_norm": 1.7524577355303892, + "grad_norm": 1.666405106144868, "learning_rate": 9.919153877812733e-06, - "loss": 0.2467, + "loss": 0.2449, "step": 419 }, { "epoch": 0.057455540355677154, - "grad_norm": 2.2290707840293122, + "grad_norm": 2.074206397148189, "learning_rate": 9.91876856652836e-06, - "loss": 0.2534, + "loss": 0.2469, "step": 420 }, { "epoch": 0.05759233926128591, - "grad_norm": 1.6334039685794808, + "grad_norm": 1.516839796218695, "learning_rate": 9.918382346751427e-06, - "loss": 0.1989, + "loss": 0.1919, "step": 421 }, { "epoch": 0.057729138166894664, - "grad_norm": 2.0405392519561185, + "grad_norm": 2.0317187866129984, "learning_rate": 9.917995218553271e-06, - "loss": 0.2505, + "loss": 0.2536, "step": 422 }, { "epoch": 0.05786593707250342, - "grad_norm": 2.159958596986065, + "grad_norm": 1.987586622909427, "learning_rate": 9.917607182005395e-06, - "loss": 0.238, + "loss": 0.2358, "step": 423 }, { "epoch": 0.058002735978112174, - "grad_norm": 1.89664643841249, + "grad_norm": 1.8421284194350551, "learning_rate": 9.91721823717947e-06, - "loss": 0.2286, + "loss": 0.2379, "step": 424 }, { "epoch": 0.05813953488372093, - "grad_norm": 2.1404842453389126, + "grad_norm": 2.022718828318074, "learning_rate": 9.91682838414733e-06, - "loss": 0.2446, + "loss": 0.2379, "step": 425 }, { "epoch": 0.058276333789329685, - "grad_norm": 1.716524476022525, + "grad_norm": 1.6832107651693244, "learning_rate": 9.916437622980986e-06, - "loss": 0.2196, + "loss": 0.2176, "step": 426 }, { "epoch": 0.05841313269493844, - "grad_norm": 1.972684650731968, + "grad_norm": 1.8889971310486597, "learning_rate": 9.916045953752606e-06, - "loss": 0.2293, + "loss": 0.2272, "step": 427 }, { "epoch": 0.058549931600547195, - "grad_norm": 1.6176909855443908, + "grad_norm": 1.507424075154607, "learning_rate": 9.915653376534533e-06, - "loss": 0.201, + "loss": 0.2017, "step": 428 }, { "epoch": 0.05868673050615595, - "grad_norm": 1.687136668265228, + "grad_norm": 1.6622026971512858, "learning_rate": 9.915259891399275e-06, - "loss": 0.1794, + "loss": 0.1799, "step": 429 }, { "epoch": 0.058823529411764705, - "grad_norm": 2.1324188806559783, + "grad_norm": 2.075213310453518, "learning_rate": 9.91486549841951e-06, - "loss": 0.2445, + "loss": 0.2475, "step": 430 }, { "epoch": 0.05896032831737346, - "grad_norm": 1.9487560398958121, + "grad_norm": 1.8554718957683278, "learning_rate": 9.91447019766808e-06, - "loss": 0.2467, + "loss": 0.2464, "step": 431 }, { "epoch": 0.059097127222982215, - "grad_norm": 2.065863707461707, + "grad_norm": 1.9426038636449774, "learning_rate": 9.914073989218e-06, - "loss": 0.2589, + "loss": 0.2568, "step": 432 }, { "epoch": 0.05923392612859097, - "grad_norm": 1.8787403661643707, + "grad_norm": 1.8119890898665947, "learning_rate": 9.913676873142445e-06, - "loss": 0.2337, + "loss": 0.2359, "step": 433 }, { "epoch": 0.059370725034199726, - "grad_norm": 1.5671306426673228, + "grad_norm": 1.4910901528244072, "learning_rate": 9.913278849514764e-06, - "loss": 0.2055, + "loss": 0.2069, "step": 434 }, { "epoch": 0.05950752393980848, - "grad_norm": 1.512817848584503, + "grad_norm": 1.4498926241185, "learning_rate": 9.912879918408474e-06, - "loss": 0.2077, + "loss": 0.2099, "step": 435 }, { "epoch": 0.059644322845417236, - "grad_norm": 1.8235028764959031, + "grad_norm": 1.767554081800395, "learning_rate": 9.912480079897256e-06, - "loss": 0.2161, + "loss": 0.2156, "step": 436 }, { "epoch": 0.05978112175102599, - "grad_norm": 1.6244097441117127, + "grad_norm": 1.5730894136080393, "learning_rate": 9.912079334054956e-06, - "loss": 0.2052, + "loss": 0.2063, "step": 437 }, { "epoch": 0.059917920656634746, - "grad_norm": 2.4231088407897583, + "grad_norm": 2.4381387168907995, "learning_rate": 9.911677680955596e-06, - "loss": 0.2512, + "loss": 0.2564, "step": 438 }, { "epoch": 0.0600547195622435, - "grad_norm": 1.7574578034836492, + "grad_norm": 1.6788032801426636, "learning_rate": 9.91127512067336e-06, - "loss": 0.2337, + "loss": 0.2341, "step": 439 }, { "epoch": 0.060191518467852256, - "grad_norm": 1.6484604459596892, + "grad_norm": 1.5974143900217677, "learning_rate": 9.9108716532826e-06, - "loss": 0.1877, + "loss": 0.1889, "step": 440 }, { "epoch": 0.06032831737346101, - "grad_norm": 2.0925754555819935, + "grad_norm": 2.0440488404170187, "learning_rate": 9.910467278857833e-06, - "loss": 0.2226, + "loss": 0.2236, "step": 441 }, { "epoch": 0.06046511627906977, - "grad_norm": 1.6752078607007748, + "grad_norm": 1.616205095387952, "learning_rate": 9.910061997473753e-06, - "loss": 0.2131, + "loss": 0.2137, "step": 442 }, { "epoch": 0.06060191518467852, - "grad_norm": 2.074806451105823, + "grad_norm": 2.0665375760713522, "learning_rate": 9.90965580920521e-06, - "loss": 0.215, + "loss": 0.2158, "step": 443 }, { "epoch": 0.06073871409028728, - "grad_norm": 1.8571049204301138, + "grad_norm": 2.52111361456609, "learning_rate": 9.90924871412723e-06, - "loss": 0.2281, + "loss": 0.2279, "step": 444 }, { "epoch": 0.06087551299589603, - "grad_norm": 2.003155153240984, + "grad_norm": 1.9708595550846577, "learning_rate": 9.908840712315e-06, - "loss": 0.2775, + "loss": 0.2752, "step": 445 }, { "epoch": 0.06101231190150479, - "grad_norm": 1.851227405230817, + "grad_norm": 1.821989895033981, "learning_rate": 9.90843180384388e-06, - "loss": 0.2178, + "loss": 0.2262, "step": 446 }, { "epoch": 0.06114911080711354, - "grad_norm": 1.845526188391099, + "grad_norm": 1.7944697998593697, "learning_rate": 9.908021988789396e-06, - "loss": 0.239, + "loss": 0.2426, "step": 447 }, { "epoch": 0.0612859097127223, - "grad_norm": 1.8224053136699445, + "grad_norm": 1.7354473258040826, "learning_rate": 9.907611267227236e-06, - "loss": 0.2254, + "loss": 0.2229, "step": 448 }, { "epoch": 0.06142270861833105, - "grad_norm": 1.8976508190803891, + "grad_norm": 1.8483683340390835, "learning_rate": 9.907199639233264e-06, - "loss": 0.2161, + "loss": 0.2194, "step": 449 }, { "epoch": 0.06155950752393981, - "grad_norm": 1.5779934419191446, + "grad_norm": 1.5496611828632472, "learning_rate": 9.906787104883507e-06, - "loss": 0.2314, + "loss": 0.2319, "step": 450 }, { "epoch": 0.06169630642954856, - "grad_norm": 2.2692003180136755, + "grad_norm": 2.2014679268558925, "learning_rate": 9.906373664254157e-06, - "loss": 0.3109, + "loss": 0.3177, "step": 451 }, { "epoch": 0.06183310533515732, - "grad_norm": 1.6643433698431338, + "grad_norm": 1.6388014024922601, "learning_rate": 9.90595931742158e-06, - "loss": 0.254, + "loss": 0.255, "step": 452 }, { "epoch": 0.06196990424076607, - "grad_norm": 1.7656122417798767, + "grad_norm": 1.6536510162006606, "learning_rate": 9.905544064462303e-06, - "loss": 0.2222, + "loss": 0.2167, "step": 453 }, { "epoch": 0.06210670314637483, - "grad_norm": 1.6123314104328215, + "grad_norm": 1.60664598018791, "learning_rate": 9.905127905453023e-06, - "loss": 0.1932, + "loss": 0.1983, "step": 454 }, { "epoch": 0.06224350205198358, - "grad_norm": 1.789097802063292, + "grad_norm": 1.6592323867360455, "learning_rate": 9.904710840470604e-06, - "loss": 0.2394, + "loss": 0.2381, "step": 455 }, { "epoch": 0.06238030095759234, - "grad_norm": 1.7259991425866192, + "grad_norm": 1.872765179631603, "learning_rate": 9.904292869592078e-06, - "loss": 0.243, + "loss": 0.2415, "step": 456 }, { "epoch": 0.0625170998632011, - "grad_norm": 1.8087420372049128, + "grad_norm": 1.7794979886526592, "learning_rate": 9.903873992894646e-06, - "loss": 0.2413, + "loss": 0.242, "step": 457 }, { "epoch": 0.06265389876880985, - "grad_norm": 1.876771353241759, + "grad_norm": 1.8212991157251766, "learning_rate": 9.90345421045567e-06, - "loss": 0.2549, + "loss": 0.2577, "step": 458 }, { "epoch": 0.06279069767441861, - "grad_norm": 1.7516532369619333, + "grad_norm": 1.5503229837455117, "learning_rate": 9.903033522352688e-06, - "loss": 0.2178, + "loss": 0.2186, "step": 459 }, { "epoch": 0.06292749658002736, - "grad_norm": 1.6057820766104627, + "grad_norm": 1.5643674006578494, "learning_rate": 9.902611928663397e-06, - "loss": 0.2174, + "loss": 0.221, "step": 460 }, { "epoch": 0.06306429548563612, - "grad_norm": 1.954155236819017, + "grad_norm": 1.7765174602895792, "learning_rate": 9.902189429465667e-06, - "loss": 0.2595, + "loss": 0.2541, "step": 461 }, { "epoch": 0.06320109439124487, - "grad_norm": 1.8406196241837098, + "grad_norm": 1.8193131635429836, "learning_rate": 9.901766024837531e-06, - "loss": 0.2542, + "loss": 0.2569, "step": 462 }, { "epoch": 0.06333789329685363, - "grad_norm": 1.6049160471182125, + "grad_norm": 1.5779044980760706, "learning_rate": 9.901341714857196e-06, - "loss": 0.2415, + "loss": 0.2433, "step": 463 }, { "epoch": 0.06347469220246238, - "grad_norm": 1.703829745722414, + "grad_norm": 1.6447433443879431, "learning_rate": 9.900916499603028e-06, - "loss": 0.1921, + "loss": 0.1946, "step": 464 }, { "epoch": 0.06361149110807114, - "grad_norm": 1.6224885230377295, + "grad_norm": 1.582552835587538, "learning_rate": 9.900490379153564e-06, - "loss": 0.2032, + "loss": 0.2044, "step": 465 }, { "epoch": 0.06374829001367989, - "grad_norm": 1.7692057036540296, + "grad_norm": 1.7013338493443397, "learning_rate": 9.90006335358751e-06, - "loss": 0.2145, + "loss": 0.216, "step": 466 }, { "epoch": 0.06388508891928865, - "grad_norm": 1.9967755693363685, + "grad_norm": 1.9190447146147782, "learning_rate": 9.899635422983737e-06, - "loss": 0.2127, + "loss": 0.2155, "step": 467 }, { "epoch": 0.0640218878248974, - "grad_norm": 2.0303645982239846, + "grad_norm": 2.0041616913305322, "learning_rate": 9.899206587421282e-06, - "loss": 0.304, + "loss": 0.31, "step": 468 }, { "epoch": 0.06415868673050616, - "grad_norm": 1.7139683188106403, + "grad_norm": 1.6394183236764095, "learning_rate": 9.89877684697935e-06, - "loss": 0.179, + "loss": 0.1798, "step": 469 }, { "epoch": 0.06429548563611491, - "grad_norm": 1.6244571903052492, + "grad_norm": 1.5798216445401976, "learning_rate": 9.898346201737316e-06, - "loss": 0.2111, + "loss": 0.2117, "step": 470 }, { "epoch": 0.06443228454172367, - "grad_norm": 2.041683682724829, + "grad_norm": 1.9606456269514088, "learning_rate": 9.89791465177472e-06, - "loss": 0.234, + "loss": 0.232, "step": 471 }, { "epoch": 0.06456908344733242, - "grad_norm": 1.8248966357081884, + "grad_norm": 1.76508634589713, "learning_rate": 9.897482197171266e-06, - "loss": 0.2668, + "loss": 0.2691, "step": 472 }, { "epoch": 0.06470588235294118, - "grad_norm": 2.0612189510660994, + "grad_norm": 2.0440792524451727, "learning_rate": 9.89704883800683e-06, - "loss": 0.3008, + "loss": 0.3046, "step": 473 }, { "epoch": 0.06484268125854993, - "grad_norm": 2.1145809576081582, + "grad_norm": 2.0017024050136465, "learning_rate": 9.896614574361453e-06, - "loss": 0.2871, + "loss": 0.2881, "step": 474 }, { "epoch": 0.06497948016415869, - "grad_norm": 1.9810723027733117, + "grad_norm": 1.8989420316106036, "learning_rate": 9.896179406315343e-06, - "loss": 0.2266, + "loss": 0.2261, "step": 475 }, { "epoch": 0.06511627906976744, - "grad_norm": 1.7156426299978882, + "grad_norm": 1.6484438076435521, "learning_rate": 9.895743333948875e-06, - "loss": 0.2385, + "loss": 0.2408, "step": 476 }, { "epoch": 0.0652530779753762, - "grad_norm": 1.9550056900262958, + "grad_norm": 1.7584071917412187, "learning_rate": 9.895306357342592e-06, - "loss": 0.2439, + "loss": 0.2395, "step": 477 }, { "epoch": 0.06538987688098495, - "grad_norm": 1.6620256146460122, + "grad_norm": 1.5959416666218427, "learning_rate": 9.894868476577202e-06, - "loss": 0.2103, + "loss": 0.2073, "step": 478 }, { "epoch": 0.06552667578659371, - "grad_norm": 2.4345490723041845, + "grad_norm": 2.2354731975891404, "learning_rate": 9.894429691733582e-06, - "loss": 0.2972, + "loss": 0.2978, "step": 479 }, { "epoch": 0.06566347469220246, - "grad_norm": 2.0038912294168285, + "grad_norm": 1.9818248290752734, "learning_rate": 9.893990002892777e-06, - "loss": 0.2389, + "loss": 0.2436, "step": 480 }, { "epoch": 0.06580027359781122, - "grad_norm": 1.8075184408850815, + "grad_norm": 1.7626286308374022, "learning_rate": 9.893549410135993e-06, - "loss": 0.2023, + "loss": 0.2007, "step": 481 }, { "epoch": 0.06593707250341997, - "grad_norm": 1.7947205980543426, + "grad_norm": 1.7505617213841516, "learning_rate": 9.893107913544608e-06, - "loss": 0.2447, + "loss": 0.2423, "step": 482 }, { "epoch": 0.06607387140902873, - "grad_norm": 1.5049050665285502, + "grad_norm": 1.4560193800911287, "learning_rate": 9.89266551320017e-06, - "loss": 0.2349, + "loss": 0.2329, "step": 483 }, { "epoch": 0.06621067031463748, - "grad_norm": 2.1075412812571312, + "grad_norm": 2.037654137562333, "learning_rate": 9.892222209184385e-06, - "loss": 0.2299, + "loss": 0.2312, "step": 484 }, { "epoch": 0.06634746922024624, - "grad_norm": 1.9314782484987434, + "grad_norm": 1.9274013506371173, "learning_rate": 9.891778001579136e-06, - "loss": 0.2508, + "loss": 0.2527, "step": 485 }, { "epoch": 0.06648426812585499, - "grad_norm": 1.8582601428548466, + "grad_norm": 1.7817754207723555, "learning_rate": 9.891332890466463e-06, - "loss": 0.2364, + "loss": 0.2367, "step": 486 }, { "epoch": 0.06662106703146375, - "grad_norm": 1.7812883075694022, + "grad_norm": 1.7007418677842785, "learning_rate": 9.89088687592858e-06, - "loss": 0.2392, + "loss": 0.2357, "step": 487 }, { "epoch": 0.0667578659370725, - "grad_norm": 2.3029158159562826, + "grad_norm": 2.266765102885715, "learning_rate": 9.890439958047866e-06, - "loss": 0.2636, + "loss": 0.2642, "step": 488 }, { "epoch": 0.06689466484268126, - "grad_norm": 1.6152495020279019, + "grad_norm": 1.5827242337245142, "learning_rate": 9.889992136906865e-06, - "loss": 0.2233, + "loss": 0.2253, "step": 489 }, { "epoch": 0.06703146374829001, - "grad_norm": 2.036846912911183, + "grad_norm": 2.0000490798397257, "learning_rate": 9.88954341258829e-06, - "loss": 0.2907, + "loss": 0.2931, "step": 490 }, { "epoch": 0.06716826265389877, - "grad_norm": 2.064729979329912, + "grad_norm": 2.017221758122002, "learning_rate": 9.889093785175021e-06, - "loss": 0.2279, + "loss": 0.227, "step": 491 }, { "epoch": 0.06730506155950752, - "grad_norm": 1.6781982726490907, + "grad_norm": 1.6488154363021028, "learning_rate": 9.8886432547501e-06, - "loss": 0.2401, + "loss": 0.2431, "step": 492 }, { "epoch": 0.06744186046511629, - "grad_norm": 1.6473300815208265, + "grad_norm": 1.6047987611282373, "learning_rate": 9.888191821396745e-06, - "loss": 0.1851, + "loss": 0.1865, "step": 493 }, { "epoch": 0.06757865937072503, - "grad_norm": 1.818703184122374, + "grad_norm": 1.8257637177623556, "learning_rate": 9.887739485198331e-06, - "loss": 0.2374, + "loss": 0.2442, "step": 494 }, { "epoch": 0.0677154582763338, - "grad_norm": 1.7658719406247336, + "grad_norm": 1.707579184904592, "learning_rate": 9.887286246238406e-06, - "loss": 0.2332, + "loss": 0.2325, "step": 495 }, { "epoch": 0.06785225718194254, - "grad_norm": 1.2696337318547726, + "grad_norm": 1.2316946207925867, "learning_rate": 9.886832104600684e-06, - "loss": 0.1798, + "loss": 0.178, "step": 496 }, { "epoch": 0.0679890560875513, - "grad_norm": 1.774866148252157, + "grad_norm": 1.6938992274976736, "learning_rate": 9.88637706036904e-06, - "loss": 0.2473, + "loss": 0.2481, "step": 497 }, { "epoch": 0.06812585499316005, - "grad_norm": 1.6927052917384386, + "grad_norm": 1.6664470504795394, "learning_rate": 9.885921113627526e-06, - "loss": 0.2057, + "loss": 0.2016, "step": 498 }, { "epoch": 0.06826265389876882, - "grad_norm": 1.836598894950964, + "grad_norm": 1.942522684529482, "learning_rate": 9.885464264460351e-06, - "loss": 0.2214, + "loss": 0.2218, "step": 499 }, { "epoch": 0.06839945280437756, - "grad_norm": 1.7434519573527343, + "grad_norm": 1.7025687926306226, "learning_rate": 9.885006512951898e-06, - "loss": 0.2277, + "loss": 0.2261, "step": 500 }, { "epoch": 0.06839945280437756, - "eval_loss": 0.22736431658267975, - "eval_runtime": 5.9291, - "eval_samples_per_second": 5.06, - "eval_steps_per_second": 1.349, + "eval_loss": 0.22786223888397217, + "eval_runtime": 5.9189, + "eval_samples_per_second": 5.069, + "eval_steps_per_second": 1.352, "step": 500 }, { "epoch": 0.06853625170998633, - "grad_norm": 1.3702313811269649, + "grad_norm": 1.6727294950749814, "learning_rate": 9.88454785918671e-06, - "loss": 0.1794, + "loss": 0.1918, "step": 501 }, { "epoch": 0.06867305061559507, - "grad_norm": 1.7060903673535097, + "grad_norm": 1.6495841010658423, "learning_rate": 9.884088303249502e-06, - "loss": 0.2496, + "loss": 0.2486, "step": 502 }, { "epoch": 0.06880984952120384, - "grad_norm": 1.5886523359445441, + "grad_norm": 1.4798844821112973, "learning_rate": 9.883627845225154e-06, - "loss": 0.2185, + "loss": 0.2145, "step": 503 }, { "epoch": 0.06894664842681258, - "grad_norm": 2.013162502300225, + "grad_norm": 1.9402847783285062, "learning_rate": 9.88316648519871e-06, - "loss": 0.2311, + "loss": 0.2326, "step": 504 }, { "epoch": 0.06908344733242135, - "grad_norm": 2.0645801171850615, + "grad_norm": 1.9079171044974812, "learning_rate": 9.882704223255383e-06, - "loss": 0.2314, + "loss": 0.2248, "step": 505 }, { "epoch": 0.0692202462380301, - "grad_norm": 1.9132709391106784, + "grad_norm": 1.8306606472354485, "learning_rate": 9.882241059480555e-06, - "loss": 0.268, + "loss": 0.2667, "step": 506 }, { "epoch": 0.06935704514363886, - "grad_norm": 1.989393166914204, + "grad_norm": 1.9474766793299232, "learning_rate": 9.88177699395977e-06, - "loss": 0.2775, + "loss": 0.2794, "step": 507 }, { "epoch": 0.0694938440492476, - "grad_norm": 2.0446836209392942, + "grad_norm": 1.9335619679721128, "learning_rate": 9.881312026778743e-06, - "loss": 0.2717, + "loss": 0.2651, "step": 508 }, { "epoch": 0.06963064295485637, - "grad_norm": 1.976451621099582, + "grad_norm": 1.9402947723047566, "learning_rate": 9.880846158023349e-06, - "loss": 0.2558, + "loss": 0.2626, "step": 509 }, { "epoch": 0.06976744186046512, - "grad_norm": 1.4816703992029079, + "grad_norm": 1.4189522215879504, "learning_rate": 9.880379387779637e-06, - "loss": 0.1992, + "loss": 0.201, "step": 510 }, { "epoch": 0.06990424076607388, - "grad_norm": 1.8504909385544783, + "grad_norm": 1.8336179315594254, "learning_rate": 9.879911716133816e-06, - "loss": 0.2548, + "loss": 0.2569, "step": 511 }, { "epoch": 0.07004103967168263, - "grad_norm": 1.7915294844668435, + "grad_norm": 1.7779804028539927, "learning_rate": 9.879443143172268e-06, - "loss": 0.2354, + "loss": 0.232, "step": 512 }, { "epoch": 0.07017783857729139, - "grad_norm": 1.5432806780120163, + "grad_norm": 1.520451015634253, "learning_rate": 9.878973668981535e-06, - "loss": 0.2015, + "loss": 0.2026, "step": 513 }, { "epoch": 0.07031463748290014, - "grad_norm": 1.8210415531457491, + "grad_norm": 1.7541609458386078, "learning_rate": 9.878503293648332e-06, - "loss": 0.2262, + "loss": 0.2272, "step": 514 }, { "epoch": 0.0704514363885089, - "grad_norm": 1.5722966959255618, + "grad_norm": 1.5343691507566601, "learning_rate": 9.878032017259533e-06, - "loss": 0.2059, + "loss": 0.2037, "step": 515 }, { "epoch": 0.07058823529411765, - "grad_norm": 1.601554131677742, + "grad_norm": 1.5666555550452745, "learning_rate": 9.877559839902185e-06, - "loss": 0.2008, + "loss": 0.2001, "step": 516 }, { "epoch": 0.07072503419972641, - "grad_norm": 2.048706271037731, + "grad_norm": 2.0038102602748236, "learning_rate": 9.877086761663497e-06, - "loss": 0.188, + "loss": 0.1869, "step": 517 }, { "epoch": 0.07086183310533516, - "grad_norm": 1.8845760921309362, + "grad_norm": 1.8512555108789084, "learning_rate": 9.876612782630848e-06, - "loss": 0.2649, + "loss": 0.2655, "step": 518 }, { "epoch": 0.07099863201094392, - "grad_norm": 1.5976152629369742, + "grad_norm": 1.4563926382118793, "learning_rate": 9.87613790289178e-06, - "loss": 0.1907, + "loss": 0.1852, "step": 519 }, { "epoch": 0.07113543091655267, - "grad_norm": 1.8183961404634257, + "grad_norm": 1.7842473918885573, "learning_rate": 9.875662122534004e-06, - "loss": 0.2575, + "loss": 0.2588, "step": 520 }, { "epoch": 0.07127222982216143, - "grad_norm": 1.7849194463621936, + "grad_norm": 1.7930900340350735, "learning_rate": 9.875185441645393e-06, - "loss": 0.1713, + "loss": 0.1684, "step": 521 }, { "epoch": 0.07140902872777018, - "grad_norm": 1.2597980732185607, + "grad_norm": 1.241424694149292, "learning_rate": 9.874707860313997e-06, - "loss": 0.1581, + "loss": 0.1608, "step": 522 }, { "epoch": 0.07154582763337894, - "grad_norm": 1.8394485164709717, + "grad_norm": 1.7828883844977417, "learning_rate": 9.874229378628017e-06, - "loss": 0.2038, + "loss": 0.2036, "step": 523 }, { "epoch": 0.07168262653898769, - "grad_norm": 1.7263373229751255, + "grad_norm": 1.684105073822339, "learning_rate": 9.873749996675835e-06, - "loss": 0.2186, + "loss": 0.2182, "step": 524 }, { "epoch": 0.07181942544459645, - "grad_norm": 1.8238981305054933, + "grad_norm": 1.7657439150016896, "learning_rate": 9.873269714545986e-06, - "loss": 0.1899, + "loss": 0.189, "step": 525 }, { "epoch": 0.0719562243502052, - "grad_norm": 1.9338678747396145, + "grad_norm": 1.8947110111750491, "learning_rate": 9.87278853232718e-06, - "loss": 0.2857, + "loss": 0.2829, "step": 526 }, { "epoch": 0.07209302325581396, - "grad_norm": 1.8326186642974125, + "grad_norm": 1.7977902606705904, "learning_rate": 9.872306450108294e-06, - "loss": 0.2347, + "loss": 0.2374, "step": 527 }, { "epoch": 0.07222982216142271, - "grad_norm": 1.833068241764649, + "grad_norm": 1.7584874758171003, "learning_rate": 9.871823467978363e-06, - "loss": 0.2153, + "loss": 0.2134, "step": 528 }, { "epoch": 0.07236662106703147, - "grad_norm": 1.7243080663722081, + "grad_norm": 1.6825270693976233, "learning_rate": 9.871339586026599e-06, - "loss": 0.2292, + "loss": 0.2277, "step": 529 }, { "epoch": 0.07250341997264022, - "grad_norm": 1.6607342091820694, + "grad_norm": 1.6346355612574908, "learning_rate": 9.87085480434237e-06, - "loss": 0.2383, + "loss": 0.2381, "step": 530 }, { "epoch": 0.07264021887824898, - "grad_norm": 1.9174347049294527, + "grad_norm": 1.8481158170881742, "learning_rate": 9.870369123015218e-06, - "loss": 0.238, + "loss": 0.236, "step": 531 }, { "epoch": 0.07277701778385773, - "grad_norm": 1.541973682091802, + "grad_norm": 1.5468464733112033, "learning_rate": 9.869882542134845e-06, - "loss": 0.1898, + "loss": 0.1918, "step": 532 }, { "epoch": 0.07291381668946649, - "grad_norm": 1.8117177290189, + "grad_norm": 1.736269431523409, "learning_rate": 9.869395061791125e-06, - "loss": 0.2332, + "loss": 0.2317, "step": 533 }, { "epoch": 0.07305061559507524, - "grad_norm": 2.1103363011957987, + "grad_norm": 2.0247862741548683, "learning_rate": 9.868906682074093e-06, - "loss": 0.2589, + "loss": 0.2553, "step": 534 }, { "epoch": 0.073187414500684, - "grad_norm": 1.9540777433247989, + "grad_norm": 1.8713117320321935, "learning_rate": 9.868417403073954e-06, - "loss": 0.2237, + "loss": 0.2254, "step": 535 }, { "epoch": 0.07332421340629275, - "grad_norm": 1.6387040570492746, + "grad_norm": 1.5819160649080546, "learning_rate": 9.867927224881076e-06, - "loss": 0.2071, + "loss": 0.2066, "step": 536 }, { "epoch": 0.07346101231190151, - "grad_norm": 1.5942755099744979, + "grad_norm": 1.6088136275398257, "learning_rate": 9.867436147585994e-06, - "loss": 0.22, + "loss": 0.2222, "step": 537 }, { "epoch": 0.07359781121751026, - "grad_norm": 1.5538705682735963, + "grad_norm": 1.5318182336357677, "learning_rate": 9.86694417127941e-06, - "loss": 0.1896, + "loss": 0.1901, "step": 538 }, { "epoch": 0.07373461012311902, - "grad_norm": 1.5724300671304232, + "grad_norm": 1.494743410298761, "learning_rate": 9.866451296052195e-06, - "loss": 0.2369, + "loss": 0.2348, "step": 539 }, { "epoch": 0.07387140902872777, - "grad_norm": 1.621436558124791, + "grad_norm": 1.7083503422850506, "learning_rate": 9.86595752199538e-06, - "loss": 0.2074, + "loss": 0.2029, "step": 540 }, { "epoch": 0.07400820793433653, - "grad_norm": 2.031092099875125, + "grad_norm": 2.023016167539391, "learning_rate": 9.865462849200162e-06, - "loss": 0.266, + "loss": 0.2724, "step": 541 }, { "epoch": 0.07414500683994528, - "grad_norm": 1.5735181012713775, + "grad_norm": 1.5228887165802611, "learning_rate": 9.864967277757912e-06, - "loss": 0.2428, + "loss": 0.2453, "step": 542 }, { "epoch": 0.07428180574555404, - "grad_norm": 1.6908231223656223, + "grad_norm": 1.6555172151654396, "learning_rate": 9.864470807760158e-06, - "loss": 0.2132, + "loss": 0.2129, "step": 543 }, { "epoch": 0.07441860465116279, - "grad_norm": 1.5413047892167515, + "grad_norm": 1.5313092883364332, "learning_rate": 9.863973439298597e-06, - "loss": 0.2041, + "loss": 0.2071, "step": 544 }, { "epoch": 0.07455540355677155, - "grad_norm": 2.0020004121758084, + "grad_norm": 1.919632081980858, "learning_rate": 9.863475172465096e-06, - "loss": 0.2798, + "loss": 0.2795, "step": 545 }, { "epoch": 0.0746922024623803, - "grad_norm": 1.7196183263251703, + "grad_norm": 1.6711837634244102, "learning_rate": 9.862976007351683e-06, - "loss": 0.2268, + "loss": 0.2271, "step": 546 }, { "epoch": 0.07482900136798906, - "grad_norm": 1.6646040584154491, + "grad_norm": 1.5910949439100024, "learning_rate": 9.862475944050552e-06, - "loss": 0.1984, + "loss": 0.196, "step": 547 }, { "epoch": 0.07496580027359781, - "grad_norm": 2.0011103222082736, + "grad_norm": 1.8771335271978133, "learning_rate": 9.861974982654066e-06, - "loss": 0.2037, + "loss": 0.2001, "step": 548 }, { "epoch": 0.07510259917920657, - "grad_norm": 1.6554336732787194, + "grad_norm": 1.5827428981728568, "learning_rate": 9.86147312325475e-06, - "loss": 0.2164, + "loss": 0.2152, "step": 549 }, { "epoch": 0.07523939808481532, - "grad_norm": 1.8134413125050852, + "grad_norm": 1.7267062073529955, "learning_rate": 9.8609703659453e-06, - "loss": 0.2318, + "loss": 0.2289, "step": 550 }, { "epoch": 0.07537619699042408, - "grad_norm": 1.9236208413858562, + "grad_norm": 1.916699952580338, "learning_rate": 9.860466710818572e-06, - "loss": 0.2067, + "loss": 0.209, "step": 551 }, { "epoch": 0.07551299589603283, - "grad_norm": 1.7933561567664171, + "grad_norm": 1.7241426952243946, "learning_rate": 9.859962157967592e-06, - "loss": 0.2441, + "loss": 0.2406, "step": 552 }, { "epoch": 0.07564979480164159, - "grad_norm": 1.9857152315347706, + "grad_norm": 1.8682450476478625, "learning_rate": 9.85945670748555e-06, - "loss": 0.2093, + "loss": 0.206, "step": 553 }, { "epoch": 0.07578659370725034, - "grad_norm": 2.0629929814202903, + "grad_norm": 1.988740938247853, "learning_rate": 9.858950359465805e-06, - "loss": 0.291, + "loss": 0.2962, "step": 554 }, { "epoch": 0.0759233926128591, - "grad_norm": 1.7946757524412904, + "grad_norm": 1.8107202243956315, "learning_rate": 9.858443114001876e-06, - "loss": 0.2332, + "loss": 0.2326, "step": 555 }, { "epoch": 0.07606019151846785, - "grad_norm": 1.7391475157014948, + "grad_norm": 1.6949224179761908, "learning_rate": 9.857934971187451e-06, - "loss": 0.2256, + "loss": 0.2247, "step": 556 }, { "epoch": 0.07619699042407661, - "grad_norm": 1.2271317358178264, + "grad_norm": 1.188441400409048, "learning_rate": 9.857425931116385e-06, - "loss": 0.1684, + "loss": 0.1663, "step": 557 }, { "epoch": 0.07633378932968536, - "grad_norm": 1.7313142759347167, + "grad_norm": 2.6458932010432923, "learning_rate": 9.856915993882697e-06, - "loss": 0.2647, + "loss": 0.2658, "step": 558 }, { "epoch": 0.07647058823529412, - "grad_norm": 1.6306574076858849, + "grad_norm": 1.6686079296433975, "learning_rate": 9.85640515958057e-06, - "loss": 0.2186, + "loss": 0.2187, "step": 559 }, { "epoch": 0.07660738714090287, - "grad_norm": 1.6642179532249577, + "grad_norm": 2.0572843171532984, "learning_rate": 9.855893428304357e-06, - "loss": 0.2061, + "loss": 0.207, "step": 560 }, { "epoch": 0.07674418604651163, - "grad_norm": 1.5020659223845405, + "grad_norm": 1.4793112117663088, "learning_rate": 9.855380800148573e-06, - "loss": 0.2027, + "loss": 0.2058, "step": 561 }, { "epoch": 0.07688098495212038, - "grad_norm": 1.6390851016399617, + "grad_norm": 1.5948149299110592, "learning_rate": 9.854867275207902e-06, - "loss": 0.1911, + "loss": 0.1948, "step": 562 }, { "epoch": 0.07701778385772914, - "grad_norm": 1.6849875436316049, + "grad_norm": 1.6229719970364545, "learning_rate": 9.854352853577189e-06, - "loss": 0.208, + "loss": 0.2062, "step": 563 }, { "epoch": 0.07715458276333789, - "grad_norm": 1.5632487847837113, + "grad_norm": 1.4664079436463433, "learning_rate": 9.85383753535145e-06, - "loss": 0.1777, + "loss": 0.1754, "step": 564 }, { "epoch": 0.07729138166894665, - "grad_norm": 1.6602810178935399, + "grad_norm": 1.6358915151109152, "learning_rate": 9.853321320625859e-06, - "loss": 0.2337, + "loss": 0.2358, "step": 565 }, { "epoch": 0.0774281805745554, - "grad_norm": 1.6650364733431975, + "grad_norm": 1.6414339538749199, "learning_rate": 9.852804209495767e-06, - "loss": 0.2427, + "loss": 0.2422, "step": 566 }, { "epoch": 0.07756497948016416, - "grad_norm": 1.8339609615230217, + "grad_norm": 1.762602160730121, "learning_rate": 9.85228620205668e-06, - "loss": 0.2495, + "loss": 0.2496, "step": 567 }, { "epoch": 0.07770177838577291, - "grad_norm": 1.6187353006828493, + "grad_norm": 1.5302470584900227, "learning_rate": 9.851767298404274e-06, - "loss": 0.1956, + "loss": 0.194, "step": 568 }, { "epoch": 0.07783857729138167, - "grad_norm": 1.9248988820101873, + "grad_norm": 1.7988343229917911, "learning_rate": 9.85124749863439e-06, - "loss": 0.2505, + "loss": 0.251, "step": 569 }, { "epoch": 0.07797537619699042, - "grad_norm": 2.295965059890238, + "grad_norm": 2.17716153849456, "learning_rate": 9.850726802843035e-06, - "loss": 0.2944, + "loss": 0.2935, "step": 570 }, { "epoch": 0.07811217510259919, - "grad_norm": 1.345689964567265, + "grad_norm": 1.3072889714453677, "learning_rate": 9.850205211126382e-06, - "loss": 0.188, + "loss": 0.1878, "step": 571 }, { "epoch": 0.07824897400820793, - "grad_norm": 2.0045471513781954, + "grad_norm": 1.9612087876735649, "learning_rate": 9.849682723580767e-06, - "loss": 0.2357, + "loss": 0.235, "step": 572 }, { "epoch": 0.0783857729138167, - "grad_norm": 1.4644762032130496, + "grad_norm": 1.375363216687806, "learning_rate": 9.849159340302693e-06, - "loss": 0.2248, + "loss": 0.2203, "step": 573 }, { "epoch": 0.07852257181942544, - "grad_norm": 1.7871025069724737, + "grad_norm": 1.7325801654123163, "learning_rate": 9.84863506138883e-06, - "loss": 0.2361, + "loss": 0.2343, "step": 574 }, { "epoch": 0.0786593707250342, - "grad_norm": 1.8898837374743425, + "grad_norm": 1.833114488661515, "learning_rate": 9.848109886936011e-06, - "loss": 0.2128, + "loss": 0.2134, "step": 575 }, { "epoch": 0.07879616963064295, - "grad_norm": 1.632111472425079, + "grad_norm": 1.5699996293863705, "learning_rate": 9.847583817041236e-06, - "loss": 0.2029, + "loss": 0.1982, "step": 576 }, { "epoch": 0.07893296853625172, - "grad_norm": 1.4922091556889032, + "grad_norm": 1.4983517976669802, "learning_rate": 9.847056851801669e-06, - "loss": 0.2509, + "loss": 0.2556, "step": 577 }, { "epoch": 0.07906976744186046, - "grad_norm": 1.8640093037365288, + "grad_norm": 1.8137576876973274, "learning_rate": 9.846528991314638e-06, - "loss": 0.2615, + "loss": 0.2621, "step": 578 }, { "epoch": 0.07920656634746923, - "grad_norm": 1.8903586647473387, + "grad_norm": 1.8413457225774699, "learning_rate": 9.846000235677641e-06, - "loss": 0.2398, + "loss": 0.2437, "step": 579 }, { "epoch": 0.07934336525307797, - "grad_norm": 1.9673543061246412, + "grad_norm": 1.909265839000232, "learning_rate": 9.84547058498834e-06, - "loss": 0.2269, + "loss": 0.2254, "step": 580 }, { "epoch": 0.07948016415868674, - "grad_norm": 1.717458006918791, + "grad_norm": 1.605700698617295, "learning_rate": 9.844940039344559e-06, - "loss": 0.2093, + "loss": 0.2077, "step": 581 }, { "epoch": 0.07961696306429548, - "grad_norm": 1.7402180780176824, + "grad_norm": 1.6765312174352156, "learning_rate": 9.844408598844288e-06, - "loss": 0.2537, + "loss": 0.2515, "step": 582 }, { "epoch": 0.07975376196990425, - "grad_norm": 1.8334807916348714, + "grad_norm": 1.8197521234253355, "learning_rate": 9.843876263585686e-06, - "loss": 0.2368, + "loss": 0.238, "step": 583 }, { "epoch": 0.079890560875513, - "grad_norm": 2.144928026336056, + "grad_norm": 2.0585246907175767, "learning_rate": 9.843343033667075e-06, - "loss": 0.2625, + "loss": 0.2631, "step": 584 }, { "epoch": 0.08002735978112176, - "grad_norm": 1.6665144394251206, + "grad_norm": 1.6504850329304024, "learning_rate": 9.842808909186941e-06, - "loss": 0.2312, + "loss": 0.2335, "step": 585 }, { "epoch": 0.0801641586867305, - "grad_norm": 1.8989767072980464, + "grad_norm": 1.8647611988501558, "learning_rate": 9.842273890243937e-06, - "loss": 0.2216, + "loss": 0.2204, "step": 586 }, { "epoch": 0.08030095759233927, - "grad_norm": 1.133126194825308, + "grad_norm": 1.083939779932134, "learning_rate": 9.841737976936878e-06, - "loss": 0.1555, + "loss": 0.1548, "step": 587 }, { "epoch": 0.08043775649794802, - "grad_norm": 1.844616341333044, + "grad_norm": 1.7638163294090585, "learning_rate": 9.841201169364752e-06, - "loss": 0.2385, + "loss": 0.2349, "step": 588 }, { "epoch": 0.08057455540355678, - "grad_norm": 2.2267439364281936, + "grad_norm": 2.1670802136651113, "learning_rate": 9.840663467626701e-06, - "loss": 0.2914, + "loss": 0.2926, "step": 589 }, { "epoch": 0.08071135430916553, - "grad_norm": 1.7562066527056628, + "grad_norm": 1.7181471300158149, "learning_rate": 9.84012487182204e-06, - "loss": 0.2727, + "loss": 0.2752, "step": 590 }, { "epoch": 0.08084815321477429, - "grad_norm": 2.2262949359317883, + "grad_norm": 2.1525356605573163, "learning_rate": 9.839585382050252e-06, - "loss": 0.2325, + "loss": 0.23, "step": 591 }, { "epoch": 0.08098495212038304, - "grad_norm": 2.1479942298226797, + "grad_norm": 2.1309874659294663, "learning_rate": 9.839044998410975e-06, - "loss": 0.253, + "loss": 0.2498, "step": 592 }, { "epoch": 0.0811217510259918, - "grad_norm": 2.1428778022381088, + "grad_norm": 2.080867865362261, "learning_rate": 9.838503721004018e-06, - "loss": 0.2603, + "loss": 0.2588, "step": 593 }, { "epoch": 0.08125854993160055, - "grad_norm": 1.6872772664597662, + "grad_norm": 1.638071279733904, "learning_rate": 9.837961549929356e-06, - "loss": 0.2059, + "loss": 0.2068, "step": 594 }, { "epoch": 0.08139534883720931, - "grad_norm": 1.6728314708659842, + "grad_norm": 1.652610601851791, "learning_rate": 9.837418485287126e-06, - "loss": 0.2021, + "loss": 0.201, "step": 595 }, { "epoch": 0.08153214774281806, - "grad_norm": 1.973759255102812, + "grad_norm": 1.937181496981849, "learning_rate": 9.836874527177634e-06, - "loss": 0.2718, + "loss": 0.273, "step": 596 }, { "epoch": 0.08166894664842682, - "grad_norm": 1.640543714476281, + "grad_norm": 1.5908159462511497, "learning_rate": 9.836329675701347e-06, - "loss": 0.2105, + "loss": 0.2119, "step": 597 }, { "epoch": 0.08180574555403557, - "grad_norm": 2.0612907091329404, + "grad_norm": 2.034695589961769, "learning_rate": 9.835783930958898e-06, - "loss": 0.259, + "loss": 0.2567, "step": 598 }, { "epoch": 0.08194254445964433, - "grad_norm": 1.759117958879062, + "grad_norm": 1.7198287751685295, "learning_rate": 9.835237293051087e-06, - "loss": 0.2205, + "loss": 0.2207, "step": 599 }, { "epoch": 0.08207934336525308, - "grad_norm": 1.668993706860523, + "grad_norm": 1.6010544723749696, "learning_rate": 9.834689762078877e-06, - "loss": 0.2448, + "loss": 0.2394, "step": 600 }, { "epoch": 0.08207934336525308, - "eval_loss": 0.2225474715232849, - "eval_runtime": 5.9357, - "eval_samples_per_second": 5.054, - "eval_steps_per_second": 1.348, + "eval_loss": 0.22314941883087158, + "eval_runtime": 5.9233, + "eval_samples_per_second": 5.065, + "eval_steps_per_second": 1.351, "step": 600 }, { "epoch": 0.08221614227086184, - "grad_norm": 1.8069442554516932, + "grad_norm": 1.7721571257610964, "learning_rate": 9.834141338143396e-06, - "loss": 0.2428, + "loss": 0.2396, "step": 601 }, { "epoch": 0.08235294117647059, - "grad_norm": 1.7255474444689296, + "grad_norm": 1.7105163074682146, "learning_rate": 9.833592021345938e-06, - "loss": 0.1987, + "loss": 0.2005, "step": 602 }, { "epoch": 0.08248974008207935, - "grad_norm": 1.50470760064842, + "grad_norm": 1.4519245880782705, "learning_rate": 9.833041811787962e-06, "loss": 0.1638, "step": 603 }, { "epoch": 0.0826265389876881, - "grad_norm": 1.9283470222522472, + "grad_norm": 1.915126169768455, "learning_rate": 9.83249070957109e-06, - "loss": 0.283, + "loss": 0.2873, "step": 604 }, { "epoch": 0.08276333789329686, - "grad_norm": 1.503144869302895, + "grad_norm": 1.4650077019128886, "learning_rate": 9.83193871479711e-06, - "loss": 0.2066, + "loss": 0.2054, "step": 605 }, { "epoch": 0.08290013679890561, - "grad_norm": 1.6288070074387992, + "grad_norm": 1.5881722820558002, "learning_rate": 9.831385827567977e-06, - "loss": 0.2206, + "loss": 0.2213, "step": 606 }, { "epoch": 0.08303693570451437, - "grad_norm": 1.8557434255790677, + "grad_norm": 1.8303291948069642, "learning_rate": 9.830832047985807e-06, - "loss": 0.2586, + "loss": 0.2587, "step": 607 }, { "epoch": 0.08317373461012312, - "grad_norm": 1.8208869997036012, + "grad_norm": 1.749546400148718, "learning_rate": 9.830277376152882e-06, - "loss": 0.29, + "loss": 0.2903, "step": 608 }, { "epoch": 0.08331053351573188, - "grad_norm": 1.8966810231218867, + "grad_norm": 1.8301222450647883, "learning_rate": 9.829721812171652e-06, - "loss": 0.246, + "loss": 0.2427, "step": 609 }, { "epoch": 0.08344733242134063, - "grad_norm": 1.49751684608449, + "grad_norm": 1.4757459368160109, "learning_rate": 9.829165356144728e-06, - "loss": 0.1898, + "loss": 0.187, "step": 610 }, { "epoch": 0.08358413132694939, - "grad_norm": 1.836533055281913, + "grad_norm": 1.7327898554585281, "learning_rate": 9.828608008174886e-06, - "loss": 0.2577, + "loss": 0.256, "step": 611 }, { "epoch": 0.08372093023255814, - "grad_norm": 1.74653655270626, + "grad_norm": 1.7264538237748386, "learning_rate": 9.82804976836507e-06, - "loss": 0.2182, + "loss": 0.215, "step": 612 }, { "epoch": 0.0838577291381669, - "grad_norm": 1.8824643103053251, + "grad_norm": 1.8375244782185316, "learning_rate": 9.827490636818382e-06, - "loss": 0.2163, + "loss": 0.2131, "step": 613 }, { "epoch": 0.08399452804377565, - "grad_norm": 1.8556782624287576, + "grad_norm": 1.7704069817573478, "learning_rate": 9.826930613638099e-06, - "loss": 0.2344, + "loss": 0.2328, "step": 614 }, { "epoch": 0.08413132694938441, - "grad_norm": 1.8842739819247993, + "grad_norm": 1.8650843698844872, "learning_rate": 9.82636969892765e-06, - "loss": 0.2489, + "loss": 0.2463, "step": 615 }, { "epoch": 0.08426812585499316, - "grad_norm": 1.454181164434132, + "grad_norm": 1.423041411179295, "learning_rate": 9.825807892790644e-06, - "loss": 0.2081, + "loss": 0.2061, "step": 616 }, { "epoch": 0.08440492476060192, - "grad_norm": 1.6882004220730185, + "grad_norm": 1.6319539941702585, "learning_rate": 9.82524519533084e-06, - "loss": 0.2227, + "loss": 0.2181, "step": 617 }, { "epoch": 0.08454172366621067, - "grad_norm": 1.7291753358343753, + "grad_norm": 1.6789266436431027, "learning_rate": 9.824681606652168e-06, - "loss": 0.1894, + "loss": 0.1895, "step": 618 }, { "epoch": 0.08467852257181943, - "grad_norm": 1.6179323359272348, + "grad_norm": 1.62341831548431, "learning_rate": 9.824117126858725e-06, - "loss": 0.2332, + "loss": 0.229, "step": 619 }, { "epoch": 0.08481532147742818, - "grad_norm": 1.569789776815793, + "grad_norm": 1.5424838806307632, "learning_rate": 9.823551756054768e-06, - "loss": 0.1909, + "loss": 0.1883, "step": 620 }, { "epoch": 0.08495212038303694, - "grad_norm": 1.6653583071251996, + "grad_norm": 1.6614309559715248, "learning_rate": 9.822985494344721e-06, - "loss": 0.2289, + "loss": 0.2336, "step": 621 }, { "epoch": 0.08508891928864569, - "grad_norm": 1.713596277260556, + "grad_norm": 1.6666333274734249, "learning_rate": 9.822418341833173e-06, - "loss": 0.2397, + "loss": 0.2368, "step": 622 }, { "epoch": 0.08522571819425445, - "grad_norm": 1.415260373326621, + "grad_norm": 1.434280159523829, "learning_rate": 9.821850298624876e-06, - "loss": 0.2131, + "loss": 0.2132, "step": 623 }, { "epoch": 0.0853625170998632, - "grad_norm": 2.686774600482178, + "grad_norm": 2.7697632693871683, "learning_rate": 9.821281364824746e-06, - "loss": 0.3473, + "loss": 0.3598, "step": 624 }, { "epoch": 0.08549931600547196, - "grad_norm": 1.8934898757880179, + "grad_norm": 1.902800855503782, "learning_rate": 9.820711540537866e-06, - "loss": 0.2228, + "loss": 0.2279, "step": 625 }, { "epoch": 0.08563611491108071, - "grad_norm": 1.8672168018820445, + "grad_norm": 1.807294205993208, "learning_rate": 9.82014082586948e-06, - "loss": 0.2522, + "loss": 0.2579, "step": 626 }, { "epoch": 0.08577291381668947, - "grad_norm": 1.6634733956110248, + "grad_norm": 1.6334606613971678, "learning_rate": 9.819569220925002e-06, - "loss": 0.2566, + "loss": 0.2578, "step": 627 }, { "epoch": 0.08590971272229822, - "grad_norm": 2.1511942366640304, + "grad_norm": 2.1158366698356934, "learning_rate": 9.818996725810004e-06, - "loss": 0.2283, + "loss": 0.2259, "step": 628 }, { "epoch": 0.08604651162790698, - "grad_norm": 1.6050334183966672, + "grad_norm": 1.6009104608472855, "learning_rate": 9.81842334063023e-06, - "loss": 0.2223, + "loss": 0.2268, "step": 629 }, { "epoch": 0.08618331053351573, - "grad_norm": 1.5652038672169288, + "grad_norm": 1.5032900765662187, "learning_rate": 9.817849065491576e-06, - "loss": 0.2141, + "loss": 0.2121, "step": 630 }, { "epoch": 0.08632010943912449, - "grad_norm": 1.541415695017267, + "grad_norm": 1.509719439172504, "learning_rate": 9.817273900500118e-06, - "loss": 0.2434, + "loss": 0.2424, "step": 631 }, { "epoch": 0.08645690834473324, - "grad_norm": 1.583465055388016, + "grad_norm": 1.5366222330993842, "learning_rate": 9.816697845762082e-06, - "loss": 0.2208, + "loss": 0.217, "step": 632 }, { "epoch": 0.086593707250342, - "grad_norm": 1.6460619285973952, + "grad_norm": 1.6069421195710107, "learning_rate": 9.816120901383869e-06, - "loss": 0.2128, + "loss": 0.2137, "step": 633 }, { "epoch": 0.08673050615595075, - "grad_norm": 1.852819121000503, + "grad_norm": 1.7852320551039602, "learning_rate": 9.815543067472038e-06, - "loss": 0.2628, + "loss": 0.2604, "step": 634 }, { "epoch": 0.08686730506155951, - "grad_norm": 1.9169198410232888, + "grad_norm": 1.8996088618632767, "learning_rate": 9.814964344133318e-06, - "loss": 0.2373, + "loss": 0.2375, "step": 635 }, { "epoch": 0.08700410396716826, - "grad_norm": 1.632822201982149, + "grad_norm": 1.6013138556188726, "learning_rate": 9.814384731474594e-06, - "loss": 0.1944, + "loss": 0.1904, "step": 636 }, { "epoch": 0.08714090287277702, - "grad_norm": 1.7087594784763438, + "grad_norm": 1.6621682759123355, "learning_rate": 9.813804229602922e-06, - "loss": 0.199, + "loss": 0.2011, "step": 637 }, { "epoch": 0.08727770177838577, - "grad_norm": 1.2689062806349054, + "grad_norm": 1.2710995155098046, "learning_rate": 9.813222838625522e-06, - "loss": 0.2157, + "loss": 0.2149, "step": 638 }, { "epoch": 0.08741450068399453, - "grad_norm": 1.3472351802749498, + "grad_norm": 1.3290184375504166, "learning_rate": 9.812640558649773e-06, - "loss": 0.1819, + "loss": 0.1809, "step": 639 }, { "epoch": 0.08755129958960328, - "grad_norm": 1.6817078651824835, + "grad_norm": 1.6236978439804408, "learning_rate": 9.812057389783225e-06, - "loss": 0.2402, + "loss": 0.2405, "step": 640 }, { "epoch": 0.08768809849521204, - "grad_norm": 1.7747172175576449, + "grad_norm": 1.7514843329515801, "learning_rate": 9.811473332133586e-06, - "loss": 0.241, + "loss": 0.2389, "step": 641 }, { "epoch": 0.08782489740082079, - "grad_norm": 1.9544474027651795, + "grad_norm": 1.9173213505171096, "learning_rate": 9.810888385808732e-06, - "loss": 0.1984, + "loss": 0.1964, "step": 642 }, { "epoch": 0.08796169630642955, - "grad_norm": 1.4968705106549605, + "grad_norm": 1.4804039583484654, "learning_rate": 9.810302550916704e-06, - "loss": 0.2021, + "loss": 0.2026, "step": 643 }, { "epoch": 0.0880984952120383, - "grad_norm": 1.802445034185475, + "grad_norm": 1.7391104885183188, "learning_rate": 9.809715827565702e-06, - "loss": 0.2452, + "loss": 0.242, "step": 644 }, { "epoch": 0.08823529411764706, - "grad_norm": 1.6272450247144128, + "grad_norm": 1.5725293105676372, "learning_rate": 9.809128215864096e-06, - "loss": 0.2394, + "loss": 0.2376, "step": 645 }, { "epoch": 0.08837209302325581, - "grad_norm": 1.0446112415214195, + "grad_norm": 1.0157553184047556, "learning_rate": 9.808539715920415e-06, - "loss": 0.1707, + "loss": 0.1711, "step": 646 }, { "epoch": 0.08850889192886457, - "grad_norm": 1.6170366176723232, + "grad_norm": 1.5345250208220202, "learning_rate": 9.807950327843358e-06, - "loss": 0.2202, + "loss": 0.2159, "step": 647 }, { "epoch": 0.08864569083447332, - "grad_norm": 1.3098081088310978, + "grad_norm": 1.2360523172617008, "learning_rate": 9.807360051741781e-06, - "loss": 0.2091, + "loss": 0.205, "step": 648 }, { "epoch": 0.08878248974008209, - "grad_norm": 1.611453013026908, + "grad_norm": 1.5665410893357832, "learning_rate": 9.80676888772471e-06, - "loss": 0.1962, + "loss": 0.1948, "step": 649 }, { "epoch": 0.08891928864569083, - "grad_norm": 1.4304391238486034, + "grad_norm": 1.416397135587069, "learning_rate": 9.806176835901329e-06, - "loss": 0.1765, + "loss": 0.1783, "step": 650 }, { "epoch": 0.0890560875512996, - "grad_norm": 1.4789911818233403, + "grad_norm": 1.4339588164196864, "learning_rate": 9.805583896380992e-06, - "loss": 0.1866, + "loss": 0.1843, "step": 651 }, { "epoch": 0.08919288645690834, - "grad_norm": 1.381481400916262, + "grad_norm": 1.3492511183622053, "learning_rate": 9.804990069273215e-06, - "loss": 0.1916, + "loss": 0.1918, "step": 652 }, { "epoch": 0.0893296853625171, - "grad_norm": 1.438135498413412, + "grad_norm": 1.3879909164829856, "learning_rate": 9.804395354687678e-06, - "loss": 0.2157, + "loss": 0.2184, "step": 653 }, { "epoch": 0.08946648426812585, - "grad_norm": 1.8547227131803943, + "grad_norm": 1.8379809869195285, "learning_rate": 9.80379975273422e-06, - "loss": 0.2555, + "loss": 0.2513, "step": 654 }, { "epoch": 0.08960328317373462, - "grad_norm": 1.6583261271329168, + "grad_norm": 1.6547507691800487, "learning_rate": 9.803203263522854e-06, - "loss": 0.2286, + "loss": 0.2309, "step": 655 }, { "epoch": 0.08974008207934336, - "grad_norm": 1.489666179248257, + "grad_norm": 1.4724709684391295, "learning_rate": 9.802605887163746e-06, - "loss": 0.2156, + "loss": 0.2196, "step": 656 }, { "epoch": 0.08987688098495213, - "grad_norm": 1.5050574790883542, + "grad_norm": 1.4756779701464247, "learning_rate": 9.802007623767234e-06, - "loss": 0.2148, + "loss": 0.2175, "step": 657 }, { "epoch": 0.09001367989056087, - "grad_norm": 1.5879239661317919, + "grad_norm": 1.5583679703793352, "learning_rate": 9.801408473443817e-06, - "loss": 0.2154, + "loss": 0.2197, "step": 658 }, { "epoch": 0.09015047879616964, - "grad_norm": 1.7286451752760614, + "grad_norm": 1.6804648203887615, "learning_rate": 9.800808436304155e-06, - "loss": 0.1992, + "loss": 0.2008, "step": 659 }, { "epoch": 0.09028727770177838, - "grad_norm": 1.4430354560393008, + "grad_norm": 1.424943928561739, "learning_rate": 9.800207512459076e-06, - "loss": 0.2209, + "loss": 0.2212, "step": 660 }, { "epoch": 0.09042407660738715, - "grad_norm": 1.6964558200342872, + "grad_norm": 1.6360375089841666, "learning_rate": 9.79960570201957e-06, - "loss": 0.2556, + "loss": 0.2538, "step": 661 }, { "epoch": 0.0905608755129959, - "grad_norm": 1.8413821925305704, + "grad_norm": 1.842723793351284, "learning_rate": 9.79900300509679e-06, - "loss": 0.2108, + "loss": 0.216, "step": 662 }, { "epoch": 0.09069767441860466, - "grad_norm": 1.7195883547597786, + "grad_norm": 1.6923627280444762, "learning_rate": 9.798399421802057e-06, - "loss": 0.2323, + "loss": 0.2334, "step": 663 }, { "epoch": 0.0908344733242134, - "grad_norm": 1.6722849851493438, + "grad_norm": 1.6232039895215358, "learning_rate": 9.797794952246848e-06, - "loss": 0.256, + "loss": 0.2588, "step": 664 }, { "epoch": 0.09097127222982217, - "grad_norm": 1.7741015055427682, + "grad_norm": 1.7717305113724007, "learning_rate": 9.797189596542809e-06, - "loss": 0.2145, + "loss": 0.2153, "step": 665 }, { "epoch": 0.09110807113543092, - "grad_norm": 1.6646969367244064, + "grad_norm": 1.6166012233451243, "learning_rate": 9.796583354801751e-06, - "loss": 0.2348, + "loss": 0.2341, "step": 666 }, { "epoch": 0.09124487004103968, - "grad_norm": 1.6484749901374531, + "grad_norm": 1.5890234600060098, "learning_rate": 9.795976227135646e-06, - "loss": 0.2049, + "loss": 0.2055, "step": 667 }, { "epoch": 0.09138166894664843, - "grad_norm": 1.8226074430684889, + "grad_norm": 1.7941154509445727, "learning_rate": 9.795368213656628e-06, - "loss": 0.2031, + "loss": 0.2014, "step": 668 }, { "epoch": 0.09151846785225719, - "grad_norm": 1.631885392781018, + "grad_norm": 1.5931770220446133, "learning_rate": 9.794759314476997e-06, - "loss": 0.2297, + "loss": 0.2284, "step": 669 }, { "epoch": 0.09165526675786594, - "grad_norm": 1.4303480640691846, + "grad_norm": 1.36432701830169, "learning_rate": 9.794149529709217e-06, - "loss": 0.1872, + "loss": 0.1844, "step": 670 }, { "epoch": 0.0917920656634747, - "grad_norm": 1.5567255848953188, + "grad_norm": 1.520504914616387, "learning_rate": 9.793538859465915e-06, - "loss": 0.1891, + "loss": 0.1875, "step": 671 }, { "epoch": 0.09192886456908345, - "grad_norm": 1.668987795489255, + "grad_norm": 1.612762383014748, "learning_rate": 9.79292730385988e-06, - "loss": 0.2342, + "loss": 0.2337, "step": 672 }, { "epoch": 0.09206566347469221, - "grad_norm": 1.5806970997532734, + "grad_norm": 1.5638238653507315, "learning_rate": 9.792314863004066e-06, - "loss": 0.1962, + "loss": 0.1944, "step": 673 }, { "epoch": 0.09220246238030096, - "grad_norm": 1.7707495504550843, + "grad_norm": 1.7147598012465795, "learning_rate": 9.791701537011591e-06, - "loss": 0.2161, + "loss": 0.2158, "step": 674 }, { "epoch": 0.09233926128590972, - "grad_norm": 1.8368593485018199, + "grad_norm": 1.8399346379672847, "learning_rate": 9.791087325995737e-06, - "loss": 0.2402, + "loss": 0.2442, "step": 675 }, { "epoch": 0.09247606019151847, - "grad_norm": 2.0070088904726737, + "grad_norm": 1.9893568010669325, "learning_rate": 9.790472230069948e-06, - "loss": 0.236, + "loss": 0.2389, "step": 676 }, { "epoch": 0.09261285909712723, - "grad_norm": 1.3516919130463005, + "grad_norm": 1.3122007156380084, "learning_rate": 9.789856249347828e-06, - "loss": 0.1799, + "loss": 0.178, "step": 677 }, { "epoch": 0.09274965800273598, - "grad_norm": 1.7716172776326131, + "grad_norm": 1.6960832467400062, "learning_rate": 9.789239383943151e-06, - "loss": 0.2101, + "loss": 0.2091, "step": 678 }, { "epoch": 0.09288645690834474, - "grad_norm": 1.8424457578383022, + "grad_norm": 1.812529731463494, "learning_rate": 9.788621633969853e-06, - "loss": 0.1985, + "loss": 0.1999, "step": 679 }, { "epoch": 0.09302325581395349, - "grad_norm": 1.5785092903249034, + "grad_norm": 1.4864904417223972, "learning_rate": 9.78800299954203e-06, - "loss": 0.2094, + "loss": 0.2071, "step": 680 }, { "epoch": 0.09316005471956225, - "grad_norm": 1.5304377674583047, + "grad_norm": 1.4779974118770784, "learning_rate": 9.787383480773945e-06, - "loss": 0.2109, + "loss": 0.2088, "step": 681 }, { "epoch": 0.093296853625171, - "grad_norm": 1.7316792024275214, + "grad_norm": 1.7094518879253, "learning_rate": 9.78676307778002e-06, - "loss": 0.2273, + "loss": 0.23, "step": 682 }, { "epoch": 0.09343365253077976, - "grad_norm": 1.625518074531849, + "grad_norm": 1.6070670438963075, "learning_rate": 9.786141790674843e-06, - "loss": 0.2256, + "loss": 0.2273, "step": 683 }, { "epoch": 0.09357045143638851, - "grad_norm": 1.5530488469254267, + "grad_norm": 1.5287552385407053, "learning_rate": 9.785519619573166e-06, - "loss": 0.2091, + "loss": 0.2085, "step": 684 }, { "epoch": 0.09370725034199727, - "grad_norm": 1.6567734299104788, + "grad_norm": 1.6677220801842012, "learning_rate": 9.784896564589905e-06, - "loss": 0.2047, + "loss": 0.2089, "step": 685 }, { "epoch": 0.09384404924760602, - "grad_norm": 1.7369999103414526, + "grad_norm": 1.7242679192017447, "learning_rate": 9.784272625840136e-06, - "loss": 0.2674, + "loss": 0.2678, "step": 686 }, { "epoch": 0.09398084815321478, - "grad_norm": 1.4790003572138493, + "grad_norm": 1.4649367455281648, "learning_rate": 9.783647803439103e-06, - "loss": 0.1884, + "loss": 0.1896, "step": 687 }, { "epoch": 0.09411764705882353, - "grad_norm": 1.7592674323287738, + "grad_norm": 1.7093672219101876, "learning_rate": 9.783022097502204e-06, - "loss": 0.247, + "loss": 0.2465, "step": 688 }, { "epoch": 0.09425444596443229, - "grad_norm": 1.7321377010142165, + "grad_norm": 1.7217578188023674, "learning_rate": 9.782395508145012e-06, - "loss": 0.2293, + "loss": 0.2317, "step": 689 }, { "epoch": 0.09439124487004104, - "grad_norm": 1.6486872583265402, + "grad_norm": 1.61253966840241, "learning_rate": 9.781768035483256e-06, - "loss": 0.2415, + "loss": 0.2396, "step": 690 }, { "epoch": 0.0945280437756498, - "grad_norm": 1.5551751920438375, + "grad_norm": 1.4890360579457023, "learning_rate": 9.78113967963283e-06, - "loss": 0.1727, + "loss": 0.173, "step": 691 }, { "epoch": 0.09466484268125855, - "grad_norm": 1.7294706083030746, + "grad_norm": 1.7061342821791594, "learning_rate": 9.78051044070979e-06, - "loss": 0.266, + "loss": 0.272, "step": 692 }, { "epoch": 0.09480164158686731, - "grad_norm": 1.4569876172357754, + "grad_norm": 1.4252762963336743, "learning_rate": 9.779880318830355e-06, - "loss": 0.2009, + "loss": 0.202, "step": 693 }, { "epoch": 0.09493844049247606, - "grad_norm": 1.9891015894471444, + "grad_norm": 1.9649072194978974, "learning_rate": 9.779249314110908e-06, - "loss": 0.2979, + "loss": 0.299, "step": 694 }, { "epoch": 0.09507523939808482, - "grad_norm": 1.6581779806812853, + "grad_norm": 1.620638298637588, "learning_rate": 9.778617426667998e-06, - "loss": 0.2339, + "loss": 0.23, "step": 695 }, { "epoch": 0.09521203830369357, - "grad_norm": 1.604956728139595, + "grad_norm": 1.5518693153025092, "learning_rate": 9.777984656618333e-06, - "loss": 0.2179, + "loss": 0.2236, "step": 696 }, { "epoch": 0.09534883720930233, - "grad_norm": 2.150508350291721, + "grad_norm": 2.082572267591988, "learning_rate": 9.777351004078784e-06, - "loss": 0.2604, + "loss": 0.2552, "step": 697 }, { "epoch": 0.09548563611491108, - "grad_norm": 1.4157949394907974, + "grad_norm": 1.3612983602749105, "learning_rate": 9.776716469166385e-06, - "loss": 0.2201, + "loss": 0.2181, "step": 698 }, { "epoch": 0.09562243502051984, - "grad_norm": 1.5248699010637903, + "grad_norm": 1.4302320621559312, "learning_rate": 9.776081051998337e-06, - "loss": 0.2287, + "loss": 0.2245, "step": 699 }, { "epoch": 0.09575923392612859, - "grad_norm": 1.528106579928435, + "grad_norm": 1.5068388265409012, "learning_rate": 9.775444752691998e-06, - "loss": 0.2281, + "loss": 0.228, "step": 700 }, { "epoch": 0.09575923392612859, - "eval_loss": 0.2244933843612671, - "eval_runtime": 5.9209, - "eval_samples_per_second": 5.067, + "eval_loss": 0.22367161512374878, + "eval_runtime": 5.9197, + "eval_samples_per_second": 5.068, "eval_steps_per_second": 1.351, "step": 700 }, { "epoch": 0.09589603283173735, - "grad_norm": 1.6849464823680838, + "grad_norm": 1.6046776152209379, "learning_rate": 9.774807571364896e-06, - "loss": 0.2464, + "loss": 0.2404, "step": 701 }, { "epoch": 0.0960328317373461, - "grad_norm": 1.8847843968675866, + "grad_norm": 1.8903341829546703, "learning_rate": 9.774169508134715e-06, - "loss": 0.2426, + "loss": 0.2406, "step": 702 }, { "epoch": 0.09616963064295486, - "grad_norm": 1.7010360737674473, + "grad_norm": 1.643343658795847, "learning_rate": 9.773530563119303e-06, - "loss": 0.2118, + "loss": 0.2109, "step": 703 }, { "epoch": 0.09630642954856361, - "grad_norm": 1.6085881886815068, + "grad_norm": 1.540466112381238, "learning_rate": 9.772890736436677e-06, - "loss": 0.2254, + "loss": 0.2227, "step": 704 }, { "epoch": 0.09644322845417237, - "grad_norm": 1.6586997608414018, + "grad_norm": 1.6201015214823047, "learning_rate": 9.772250028205009e-06, - "loss": 0.2097, + "loss": 0.2115, "step": 705 }, { "epoch": 0.09658002735978112, - "grad_norm": 1.6954737655867866, + "grad_norm": 1.681442814756743, "learning_rate": 9.77160843854264e-06, - "loss": 0.2499, + "loss": 0.2546, "step": 706 }, { "epoch": 0.09671682626538988, - "grad_norm": 1.9884480299073655, + "grad_norm": 1.9077833657259788, "learning_rate": 9.770965967568068e-06, - "loss": 0.2392, + "loss": 0.2385, "step": 707 }, { "epoch": 0.09685362517099863, - "grad_norm": 1.7119417089005242, + "grad_norm": 1.6569685736619415, "learning_rate": 9.77032261539996e-06, - "loss": 0.2388, + "loss": 0.2392, "step": 708 }, { "epoch": 0.09699042407660739, - "grad_norm": 1.6919189893931417, + "grad_norm": 1.6584396021031562, "learning_rate": 9.76967838215714e-06, - "loss": 0.2298, + "loss": 0.2302, "step": 709 }, { "epoch": 0.09712722298221614, - "grad_norm": 1.753788180788175, + "grad_norm": 1.6781831206561464, "learning_rate": 9.769033267958598e-06, - "loss": 0.2405, + "loss": 0.2361, "step": 710 }, { "epoch": 0.0972640218878249, - "grad_norm": 1.6245311334734964, + "grad_norm": 1.572789569027814, "learning_rate": 9.76838727292349e-06, - "loss": 0.2637, + "loss": 0.2613, "step": 711 }, { "epoch": 0.09740082079343365, - "grad_norm": 1.7227838726855915, + "grad_norm": 1.6576570137493778, "learning_rate": 9.767740397171124e-06, - "loss": 0.205, + "loss": 0.2014, "step": 712 }, { "epoch": 0.09753761969904241, - "grad_norm": 1.7404521970594216, + "grad_norm": 1.6985210885152453, "learning_rate": 9.76709264082098e-06, - "loss": 0.2464, + "loss": 0.2503, "step": 713 }, { "epoch": 0.09767441860465116, - "grad_norm": 1.5104095547040324, + "grad_norm": 1.4427402501677205, "learning_rate": 9.766444003992704e-06, - "loss": 0.2041, + "loss": 0.2015, "step": 714 }, { "epoch": 0.09781121751025992, - "grad_norm": 1.5413580137384326, + "grad_norm": 1.477913483684755, "learning_rate": 9.765794486806089e-06, - "loss": 0.2101, + "loss": 0.2133, "step": 715 }, { "epoch": 0.09794801641586867, - "grad_norm": 1.3656441771901773, + "grad_norm": 1.2899133664951563, "learning_rate": 9.765144089381106e-06, - "loss": 0.2087, + "loss": 0.2057, "step": 716 }, { "epoch": 0.09808481532147743, - "grad_norm": 1.421448089994845, + "grad_norm": 1.3376030748107193, "learning_rate": 9.764492811837882e-06, - "loss": 0.157, + "loss": 0.1558, "step": 717 }, { "epoch": 0.09822161422708618, - "grad_norm": 1.5887557079402714, + "grad_norm": 1.4873779852257267, "learning_rate": 9.763840654296706e-06, - "loss": 0.2149, + "loss": 0.211, "step": 718 }, { "epoch": 0.09835841313269494, - "grad_norm": 2.0398149404105577, + "grad_norm": 1.9986191012425747, "learning_rate": 9.763187616878033e-06, - "loss": 0.2185, + "loss": 0.2205, "step": 719 }, { "epoch": 0.09849521203830369, - "grad_norm": 1.6890799374186865, + "grad_norm": 1.589397118629039, "learning_rate": 9.76253369970248e-06, - "loss": 0.2271, + "loss": 0.224, "step": 720 }, { "epoch": 0.09863201094391245, - "grad_norm": 1.7823184508154113, + "grad_norm": 1.715794786904834, "learning_rate": 9.761878902890818e-06, - "loss": 0.2458, + "loss": 0.2444, "step": 721 }, { "epoch": 0.0987688098495212, - "grad_norm": 1.576434191810668, + "grad_norm": 1.5653574936630126, "learning_rate": 9.761223226563997e-06, - "loss": 0.231, + "loss": 0.2378, "step": 722 }, { "epoch": 0.09890560875512996, - "grad_norm": 1.8809520975887604, + "grad_norm": 1.8146134393472915, "learning_rate": 9.760566670843111e-06, - "loss": 0.2217, + "loss": 0.2222, "step": 723 }, { "epoch": 0.09904240766073871, - "grad_norm": 1.680874529075464, + "grad_norm": 1.6499827097744926, "learning_rate": 9.75990923584943e-06, - "loss": 0.2006, + "loss": 0.2034, "step": 724 }, { "epoch": 0.09917920656634747, - "grad_norm": 1.7942399516264176, + "grad_norm": 1.7084396813800942, "learning_rate": 9.759250921704382e-06, - "loss": 0.1973, + "loss": 0.1949, "step": 725 }, { "epoch": 0.09931600547195622, - "grad_norm": 1.5434938846046107, + "grad_norm": 1.511839192317881, "learning_rate": 9.758591728529555e-06, - "loss": 0.1965, + "loss": 0.1981, "step": 726 }, { "epoch": 0.09945280437756499, - "grad_norm": 1.7440381063620252, + "grad_norm": 1.7367823660161388, "learning_rate": 9.757931656446702e-06, - "loss": 0.2314, + "loss": 0.2324, "step": 727 }, { "epoch": 0.09958960328317373, - "grad_norm": 1.7486635406836066, + "grad_norm": 1.6906775644523933, "learning_rate": 9.757270705577739e-06, - "loss": 0.2254, + "loss": 0.2227, "step": 728 }, { "epoch": 0.0997264021887825, - "grad_norm": 1.7494424716544346, + "grad_norm": 1.6645452755150825, "learning_rate": 9.756608876044743e-06, - "loss": 0.1817, + "loss": 0.1842, "step": 729 }, { "epoch": 0.09986320109439124, - "grad_norm": 1.4027553802867323, + "grad_norm": 1.3378770226553485, "learning_rate": 9.755946167969952e-06, - "loss": 0.1955, + "loss": 0.1935, "step": 730 }, { "epoch": 0.1, - "grad_norm": 1.4419414136970066, + "grad_norm": 1.4003899731118896, "learning_rate": 9.755282581475769e-06, - "loss": 0.1928, + "loss": 0.1916, "step": 731 }, { "epoch": 0.10013679890560875, - "grad_norm": 2.027758226217731, + "grad_norm": 1.9813260106488046, "learning_rate": 9.754618116684756e-06, - "loss": 0.2956, + "loss": 0.2906, "step": 732 }, { "epoch": 0.10027359781121752, - "grad_norm": 1.7872688342886336, + "grad_norm": 1.6993774111789146, "learning_rate": 9.753952773719642e-06, - "loss": 0.2331, + "loss": 0.2352, "step": 733 }, { "epoch": 0.10041039671682626, - "grad_norm": 1.6197039443391485, + "grad_norm": 1.5897850981106951, "learning_rate": 9.753286552703312e-06, - "loss": 0.2445, + "loss": 0.2447, "step": 734 }, { "epoch": 0.10054719562243503, - "grad_norm": 2.1305831655455187, + "grad_norm": 2.025983318936702, "learning_rate": 9.752619453758818e-06, - "loss": 0.2479, + "loss": 0.2471, "step": 735 }, { "epoch": 0.10068399452804377, - "grad_norm": 1.9495980195451719, + "grad_norm": 1.8380663758193865, "learning_rate": 9.751951477009374e-06, - "loss": 0.2893, + "loss": 0.2921, "step": 736 }, { "epoch": 0.10082079343365254, - "grad_norm": 1.7259872217939596, + "grad_norm": 1.6630562276292973, "learning_rate": 9.751282622578352e-06, - "loss": 0.2167, + "loss": 0.2183, "step": 737 }, { "epoch": 0.10095759233926128, - "grad_norm": 1.5048644725023281, + "grad_norm": 1.4709680460785184, "learning_rate": 9.750612890589293e-06, - "loss": 0.2035, + "loss": 0.2044, "step": 738 }, { "epoch": 0.10109439124487005, - "grad_norm": 1.3894091148247218, + "grad_norm": 1.3840444879781153, "learning_rate": 9.749942281165891e-06, - "loss": 0.1696, + "loss": 0.1703, "step": 739 }, { "epoch": 0.1012311901504788, - "grad_norm": 1.3975046446916697, + "grad_norm": 1.3808124485142215, "learning_rate": 9.749270794432011e-06, - "loss": 0.1839, + "loss": 0.1819, "step": 740 }, { "epoch": 0.10136798905608756, - "grad_norm": 1.3643129442325994, + "grad_norm": 1.320787709123852, "learning_rate": 9.748598430511673e-06, - "loss": 0.2221, + "loss": 0.2237, "step": 741 }, { "epoch": 0.1015047879616963, - "grad_norm": 1.8495342328738986, + "grad_norm": 1.8198237713875722, "learning_rate": 9.747925189529064e-06, - "loss": 0.2637, + "loss": 0.2638, "step": 742 }, { "epoch": 0.10164158686730507, - "grad_norm": 1.7444031777486522, + "grad_norm": 1.7353636323647703, "learning_rate": 9.74725107160853e-06, - "loss": 0.2686, + "loss": 0.2699, "step": 743 }, { "epoch": 0.10177838577291382, - "grad_norm": 1.5468584650281008, + "grad_norm": 1.5154386348471423, "learning_rate": 9.746576076874581e-06, - "loss": 0.2257, + "loss": 0.2274, "step": 744 }, { "epoch": 0.10191518467852258, - "grad_norm": 1.77473639012442, + "grad_norm": 1.7408193182097254, "learning_rate": 9.745900205451888e-06, - "loss": 0.2389, + "loss": 0.2387, "step": 745 }, { "epoch": 0.10205198358413133, - "grad_norm": 1.386459158663684, + "grad_norm": 1.3509544591483609, "learning_rate": 9.745223457465282e-06, - "loss": 0.1908, + "loss": 0.1927, "step": 746 }, { "epoch": 0.10218878248974009, - "grad_norm": 1.426876417582623, + "grad_norm": 1.4070535440485956, "learning_rate": 9.74454583303976e-06, - "loss": 0.2136, + "loss": 0.2096, "step": 747 }, { "epoch": 0.10232558139534884, - "grad_norm": 1.5916219675908827, + "grad_norm": 1.5761498597453536, "learning_rate": 9.743867332300478e-06, - "loss": 0.2104, + "loss": 0.2116, "step": 748 }, { "epoch": 0.1024623803009576, - "grad_norm": 1.318374858363603, + "grad_norm": 1.2549207872706272, "learning_rate": 9.743187955372755e-06, - "loss": 0.2043, + "loss": 0.2027, "step": 749 }, { "epoch": 0.10259917920656635, - "grad_norm": 1.6347542784847329, + "grad_norm": 1.587506143670594, "learning_rate": 9.74250770238207e-06, - "loss": 0.2094, + "loss": 0.2073, "step": 750 }, { "epoch": 0.10273597811217511, - "grad_norm": 1.4868091117839635, + "grad_norm": 1.5053022928421789, "learning_rate": 9.741826573454067e-06, - "loss": 0.1953, + "loss": 0.1965, "step": 751 }, { "epoch": 0.10287277701778386, - "grad_norm": 1.6414047178257734, + "grad_norm": 1.6219578595005064, "learning_rate": 9.741144568714549e-06, - "loss": 0.2454, + "loss": 0.2456, "step": 752 }, { "epoch": 0.10300957592339262, - "grad_norm": 1.8569946516423717, + "grad_norm": 1.6192756018860235, "learning_rate": 9.740461688289482e-06, - "loss": 0.2284, + "loss": 0.2302, "step": 753 }, { "epoch": 0.10314637482900137, - "grad_norm": 1.630468405434894, + "grad_norm": 1.6277836556453837, "learning_rate": 9.739777932304992e-06, - "loss": 0.2687, + "loss": 0.2679, "step": 754 }, { "epoch": 0.10328317373461013, - "grad_norm": 1.5032070255845102, + "grad_norm": 1.487833817097761, "learning_rate": 9.73909330088737e-06, - "loss": 0.225, + "loss": 0.2265, "step": 755 }, { "epoch": 0.10341997264021888, - "grad_norm": 1.917637337224016, + "grad_norm": 1.8492381336998513, "learning_rate": 9.738407794163066e-06, - "loss": 0.2657, + "loss": 0.2642, "step": 756 }, { "epoch": 0.10355677154582764, - "grad_norm": 1.4810142018072434, + "grad_norm": 1.4815897135929255, "learning_rate": 9.737721412258692e-06, - "loss": 0.1954, + "loss": 0.1939, "step": 757 }, { "epoch": 0.10369357045143639, - "grad_norm": 1.4561485576914284, + "grad_norm": 1.436294351737925, "learning_rate": 9.737034155301024e-06, - "loss": 0.2299, + "loss": 0.2261, "step": 758 }, { "epoch": 0.10383036935704515, - "grad_norm": 1.536587901203591, + "grad_norm": 1.526978783206955, "learning_rate": 9.736346023416996e-06, - "loss": 0.1919, + "loss": 0.1905, "step": 759 }, { "epoch": 0.1039671682626539, - "grad_norm": 1.5302768665951334, + "grad_norm": 1.5181445826400255, "learning_rate": 9.735657016733706e-06, - "loss": 0.2003, + "loss": 0.1969, "step": 760 }, { "epoch": 0.10410396716826266, - "grad_norm": 1.6288470446534604, + "grad_norm": 1.693386015584107, "learning_rate": 9.734967135378414e-06, - "loss": 0.2383, + "loss": 0.2397, "step": 761 }, { "epoch": 0.10424076607387141, - "grad_norm": 1.3901464364313274, + "grad_norm": 1.34769228614287, "learning_rate": 9.734276379478538e-06, - "loss": 0.2016, + "loss": 0.2008, "step": 762 }, { "epoch": 0.10437756497948017, - "grad_norm": 2.1034781698187275, + "grad_norm": 2.045963497005092, "learning_rate": 9.733584749161664e-06, - "loss": 0.2901, + "loss": 0.2918, "step": 763 }, { "epoch": 0.10451436388508892, - "grad_norm": 1.7085201845117612, + "grad_norm": 1.6654851082943352, "learning_rate": 9.732892244555531e-06, - "loss": 0.2519, + "loss": 0.2511, "step": 764 }, { "epoch": 0.10465116279069768, - "grad_norm": 1.8032398362422637, + "grad_norm": 1.7574018316470883, "learning_rate": 9.732198865788047e-06, - "loss": 0.2126, + "loss": 0.2101, "step": 765 }, { "epoch": 0.10478796169630643, - "grad_norm": 1.3910072204963335, + "grad_norm": 1.3695241481192872, "learning_rate": 9.731504612987279e-06, - "loss": 0.2091, + "loss": 0.2096, "step": 766 }, { "epoch": 0.10492476060191519, - "grad_norm": 1.8117395778327254, + "grad_norm": 1.7786031244313045, "learning_rate": 9.730809486281452e-06, - "loss": 0.22, + "loss": 0.2173, "step": 767 }, { "epoch": 0.10506155950752394, - "grad_norm": 1.8857105164636774, + "grad_norm": 1.8492674801152578, "learning_rate": 9.730113485798958e-06, - "loss": 0.2353, + "loss": 0.2341, "step": 768 }, { "epoch": 0.1051983584131327, - "grad_norm": 1.2819925607325033, + "grad_norm": 1.2771501579106967, "learning_rate": 9.729416611668345e-06, - "loss": 0.1814, + "loss": 0.1809, "step": 769 }, { "epoch": 0.10533515731874145, - "grad_norm": 1.704404399964109, + "grad_norm": 1.652399053312514, "learning_rate": 9.72871886401833e-06, - "loss": 0.2348, + "loss": 0.2341, "step": 770 }, { "epoch": 0.10547195622435021, - "grad_norm": 1.6019553360970065, + "grad_norm": 1.5456763235237305, "learning_rate": 9.728020242977781e-06, - "loss": 0.2061, + "loss": 0.2065, "step": 771 }, { "epoch": 0.10560875512995896, - "grad_norm": 1.3864726019074107, + "grad_norm": 1.3496618652680632, "learning_rate": 9.727320748675734e-06, - "loss": 0.2002, + "loss": 0.2025, "step": 772 }, { "epoch": 0.10574555403556772, - "grad_norm": 1.4680814901434813, + "grad_norm": 1.4525059354774132, "learning_rate": 9.726620381241389e-06, - "loss": 0.2029, + "loss": 0.2009, "step": 773 }, { "epoch": 0.10588235294117647, - "grad_norm": 1.647609131160467, + "grad_norm": 1.6443838146481298, "learning_rate": 9.7259191408041e-06, - "loss": 0.2219, + "loss": 0.2236, "step": 774 }, { "epoch": 0.10601915184678523, - "grad_norm": 1.6197213058168747, + "grad_norm": 1.5875842777735554, "learning_rate": 9.725217027493383e-06, - "loss": 0.2149, + "loss": 0.216, "step": 775 }, { "epoch": 0.10615595075239398, - "grad_norm": 1.656992833922663, + "grad_norm": 1.6363766759192873, "learning_rate": 9.724514041438922e-06, - "loss": 0.2156, + "loss": 0.2173, "step": 776 }, { "epoch": 0.10629274965800274, - "grad_norm": 1.5317943757960244, + "grad_norm": 1.6431753323900824, "learning_rate": 9.723810182770556e-06, - "loss": 0.2228, + "loss": 0.227, "step": 777 }, { "epoch": 0.10642954856361149, - "grad_norm": 1.6797661294340727, + "grad_norm": 1.618960327881177, "learning_rate": 9.72310545161829e-06, - "loss": 0.2229, + "loss": 0.2234, "step": 778 }, { "epoch": 0.10656634746922025, - "grad_norm": 1.871194015316357, + "grad_norm": 1.806130835300612, "learning_rate": 9.722399848112283e-06, - "loss": 0.2305, + "loss": 0.2263, "step": 779 }, { "epoch": 0.106703146374829, - "grad_norm": 1.507038879781577, + "grad_norm": 1.505545143674029, "learning_rate": 9.721693372382863e-06, - "loss": 0.2342, + "loss": 0.2386, "step": 780 }, { "epoch": 0.10683994528043776, - "grad_norm": 1.7139278798374247, + "grad_norm": 1.6744153799918127, "learning_rate": 9.720986024560512e-06, - "loss": 0.2274, + "loss": 0.2268, "step": 781 }, { "epoch": 0.10697674418604651, - "grad_norm": 1.5847789503510066, + "grad_norm": 1.537184912428679, "learning_rate": 9.720277804775879e-06, - "loss": 0.2015, + "loss": 0.1995, "step": 782 }, { "epoch": 0.10711354309165527, - "grad_norm": 1.8096885171363015, + "grad_norm": 1.7648884038669486, "learning_rate": 9.719568713159771e-06, - "loss": 0.2635, + "loss": 0.2641, "step": 783 }, { "epoch": 0.10725034199726402, - "grad_norm": 1.8104029306218297, + "grad_norm": 1.7749656718161466, "learning_rate": 9.718858749843156e-06, - "loss": 0.25, + "loss": 0.2467, "step": 784 }, { "epoch": 0.10738714090287278, - "grad_norm": 1.8934839912628902, + "grad_norm": 1.8522020380467836, "learning_rate": 9.718147914957166e-06, - "loss": 0.2374, + "loss": 0.2382, "step": 785 }, { "epoch": 0.10752393980848153, - "grad_norm": 1.86868049118972, + "grad_norm": 1.8211563140879645, "learning_rate": 9.717436208633088e-06, - "loss": 0.231, + "loss": 0.2306, "step": 786 }, { "epoch": 0.10766073871409029, - "grad_norm": 1.955092261274064, + "grad_norm": 1.8824331079570247, "learning_rate": 9.716723631002379e-06, - "loss": 0.2565, + "loss": 0.2543, "step": 787 }, { "epoch": 0.10779753761969904, - "grad_norm": 1.512081562807116, + "grad_norm": 1.4605192606154218, "learning_rate": 9.716010182196645e-06, - "loss": 0.206, + "loss": 0.2076, "step": 788 }, { "epoch": 0.1079343365253078, - "grad_norm": 1.9031941013075084, + "grad_norm": 1.8811827740724913, "learning_rate": 9.715295862347662e-06, - "loss": 0.2783, + "loss": 0.2789, "step": 789 }, { "epoch": 0.10807113543091655, - "grad_norm": 2.1409685817186275, + "grad_norm": 2.0246501704971878, "learning_rate": 9.714580671587366e-06, - "loss": 0.2414, + "loss": 0.2419, "step": 790 }, { "epoch": 0.10820793433652531, - "grad_norm": 1.777865830118866, + "grad_norm": 1.7345448381017294, "learning_rate": 9.713864610047852e-06, - "loss": 0.2296, + "loss": 0.2313, "step": 791 }, { "epoch": 0.10834473324213406, - "grad_norm": 1.8242099624558779, + "grad_norm": 1.7566157274966192, "learning_rate": 9.713147677861373e-06, - "loss": 0.2469, + "loss": 0.2487, "step": 792 }, { "epoch": 0.10848153214774282, - "grad_norm": 1.523897411757695, + "grad_norm": 1.459530127620989, "learning_rate": 9.712429875160348e-06, - "loss": 0.2058, + "loss": 0.2036, "step": 793 }, { "epoch": 0.10861833105335157, - "grad_norm": 1.5283959432212941, + "grad_norm": 1.501022519594562, "learning_rate": 9.711711202077354e-06, - "loss": 0.2086, + "loss": 0.2074, "step": 794 }, { "epoch": 0.10875512995896033, - "grad_norm": 1.9742388068346999, + "grad_norm": 1.881376547894533, "learning_rate": 9.71099165874513e-06, - "loss": 0.2619, + "loss": 0.2609, "step": 795 }, { "epoch": 0.10889192886456908, - "grad_norm": 1.3617224321360004, + "grad_norm": 1.3716362958285206, "learning_rate": 9.710271245296576e-06, - "loss": 0.1931, + "loss": 0.1982, "step": 796 }, { "epoch": 0.10902872777017784, - "grad_norm": 1.4851617283683916, + "grad_norm": 1.6830553430897965, "learning_rate": 9.70954996186475e-06, - "loss": 0.1915, + "loss": 0.1912, "step": 797 }, { "epoch": 0.10916552667578659, - "grad_norm": 1.9234780799803453, + "grad_norm": 1.8644028083549096, "learning_rate": 9.70882780858287e-06, - "loss": 0.2458, + "loss": 0.2426, "step": 798 }, { "epoch": 0.10930232558139535, - "grad_norm": 1.4870522179878758, + "grad_norm": 1.434481505689956, "learning_rate": 9.708104785584324e-06, - "loss": 0.2098, + "loss": 0.2101, "step": 799 }, { "epoch": 0.1094391244870041, - "grad_norm": 1.650960564747494, + "grad_norm": 1.6038215895617776, "learning_rate": 9.707380893002647e-06, - "loss": 0.1926, + "loss": 0.1909, "step": 800 }, { "epoch": 0.1094391244870041, - "eval_loss": 0.2221343219280243, - "eval_runtime": 5.9188, - "eval_samples_per_second": 5.069, - "eval_steps_per_second": 1.352, + "eval_loss": 0.22249868512153625, + "eval_runtime": 5.9077, + "eval_samples_per_second": 5.078, + "eval_steps_per_second": 1.354, "step": 800 }, { "epoch": 0.10957592339261286, - "grad_norm": 1.5365753783633447, + "grad_norm": 1.517103988168178, "learning_rate": 9.706656130971546e-06, - "loss": 0.1683, + "loss": 0.169, "step": 801 }, { "epoch": 0.10971272229822161, - "grad_norm": 1.8877484460349339, + "grad_norm": 1.8460410768916566, "learning_rate": 9.705930499624881e-06, - "loss": 0.2823, + "loss": 0.2865, "step": 802 }, { "epoch": 0.10984952120383037, - "grad_norm": 1.4298400486236758, + "grad_norm": 1.3993297251643269, "learning_rate": 9.705203999096677e-06, - "loss": 0.2332, + "loss": 0.233, "step": 803 }, { "epoch": 0.10998632010943912, - "grad_norm": 1.6209142074942255, + "grad_norm": 1.5749554053415944, "learning_rate": 9.704476629521118e-06, - "loss": 0.2112, + "loss": 0.2095, "step": 804 }, { "epoch": 0.11012311901504789, - "grad_norm": 1.8655079629774847, + "grad_norm": 1.791760768932612, "learning_rate": 9.703748391032548e-06, - "loss": 0.2434, + "loss": 0.2393, "step": 805 }, { "epoch": 0.11025991792065663, - "grad_norm": 1.4188194243683963, + "grad_norm": 1.3844554595653296, "learning_rate": 9.703019283765472e-06, - "loss": 0.2022, + "loss": 0.2013, "step": 806 }, { "epoch": 0.1103967168262654, - "grad_norm": 1.2562279967901264, + "grad_norm": 1.2099948605406132, "learning_rate": 9.702289307854555e-06, - "loss": 0.2283, + "loss": 0.2267, "step": 807 }, { "epoch": 0.11053351573187414, - "grad_norm": 1.6191138776687761, + "grad_norm": 1.5936211288776347, "learning_rate": 9.701558463434626e-06, - "loss": 0.2094, + "loss": 0.2096, "step": 808 }, { "epoch": 0.1106703146374829, - "grad_norm": 1.7425878271135942, + "grad_norm": 1.6951073530303271, "learning_rate": 9.700826750640669e-06, - "loss": 0.2103, + "loss": 0.2088, "step": 809 }, { "epoch": 0.11080711354309165, - "grad_norm": 1.5812737146648634, + "grad_norm": 1.581084972576791, "learning_rate": 9.700094169607828e-06, - "loss": 0.2271, + "loss": 0.2252, "step": 810 }, { "epoch": 0.11094391244870042, - "grad_norm": 1.7868359740254713, + "grad_norm": 1.7510565652004064, "learning_rate": 9.699360720471415e-06, - "loss": 0.2425, + "loss": 0.2419, "step": 811 }, { "epoch": 0.11108071135430916, - "grad_norm": 1.3473068761076745, + "grad_norm": 1.2891126125438344, "learning_rate": 9.698626403366896e-06, - "loss": 0.1985, + "loss": 0.1982, "step": 812 }, { "epoch": 0.11121751025991793, - "grad_norm": 1.4108969272883316, + "grad_norm": 1.3772217620870038, "learning_rate": 9.697891218429898e-06, - "loss": 0.2183, + "loss": 0.2172, "step": 813 }, { "epoch": 0.11135430916552667, - "grad_norm": 1.7277756781502687, + "grad_norm": 1.6629194149901676, "learning_rate": 9.69715516579621e-06, - "loss": 0.2234, + "loss": 0.2208, "step": 814 }, { "epoch": 0.11149110807113544, - "grad_norm": 1.9561615258913443, + "grad_norm": 1.917579914878023, "learning_rate": 9.696418245601779e-06, - "loss": 0.261, + "loss": 0.2575, "step": 815 }, { "epoch": 0.11162790697674418, - "grad_norm": 1.7389557179292734, + "grad_norm": 1.6958768178274497, "learning_rate": 9.695680457982713e-06, - "loss": 0.2678, + "loss": 0.2704, "step": 816 }, { "epoch": 0.11176470588235295, - "grad_norm": 1.6222541637516674, + "grad_norm": 1.5596675409341691, "learning_rate": 9.694941803075285e-06, - "loss": 0.2127, + "loss": 0.2106, "step": 817 }, { "epoch": 0.1119015047879617, - "grad_norm": 1.6741046933135195, + "grad_norm": 1.6231155634487135, "learning_rate": 9.694202281015918e-06, - "loss": 0.1978, + "loss": 0.1956, "step": 818 }, { "epoch": 0.11203830369357046, - "grad_norm": 1.8302120616772717, + "grad_norm": 1.808912252407902, "learning_rate": 9.693461891941206e-06, - "loss": 0.2399, + "loss": 0.2394, "step": 819 }, { "epoch": 0.1121751025991792, - "grad_norm": 1.2644470164967392, + "grad_norm": 1.1760788028512603, "learning_rate": 9.692720635987893e-06, - "loss": 0.1758, + "loss": 0.1741, "step": 820 }, { "epoch": 0.11231190150478797, - "grad_norm": 1.4642777301055836, + "grad_norm": 1.4065823929196828, "learning_rate": 9.691978513292896e-06, - "loss": 0.2049, + "loss": 0.2058, "step": 821 }, { "epoch": 0.11244870041039672, - "grad_norm": 2.002640795021754, + "grad_norm": 1.9204673425705987, "learning_rate": 9.691235523993277e-06, - "loss": 0.285, + "loss": 0.2793, "step": 822 }, { "epoch": 0.11258549931600548, - "grad_norm": 1.5956693249046903, + "grad_norm": 1.5677203746903827, "learning_rate": 9.690491668226271e-06, - "loss": 0.2275, + "loss": 0.2283, "step": 823 }, { "epoch": 0.11272229822161423, - "grad_norm": 1.588472036566204, + "grad_norm": 1.5580860934974066, "learning_rate": 9.689746946129264e-06, - "loss": 0.2539, + "loss": 0.2529, "step": 824 }, { "epoch": 0.11285909712722299, - "grad_norm": 1.560407112555853, + "grad_norm": 1.5306202381047247, "learning_rate": 9.689001357839807e-06, - "loss": 0.206, + "loss": 0.2061, "step": 825 }, { "epoch": 0.11299589603283174, - "grad_norm": 1.6071908490827806, + "grad_norm": 1.571725354887242, "learning_rate": 9.68825490349561e-06, "loss": 0.2508, "step": 826 }, { "epoch": 0.1131326949384405, - "grad_norm": 1.7455917204305214, + "grad_norm": 1.685121320284102, "learning_rate": 9.687507583234542e-06, - "loss": 0.2363, + "loss": 0.2365, "step": 827 }, { "epoch": 0.11326949384404925, - "grad_norm": 1.9219054976114847, + "grad_norm": 1.9042022664265987, "learning_rate": 9.686759397194631e-06, - "loss": 0.2685, + "loss": 0.2689, "step": 828 }, { "epoch": 0.11340629274965801, - "grad_norm": 1.5547974883209266, + "grad_norm": 1.4951008484794095, "learning_rate": 9.686010345514068e-06, - "loss": 0.2089, + "loss": 0.2087, "step": 829 }, { "epoch": 0.11354309165526676, - "grad_norm": 1.476450064298634, + "grad_norm": 1.3832419781351415, "learning_rate": 9.685260428331203e-06, - "loss": 0.1887, + "loss": 0.1884, "step": 830 }, { "epoch": 0.11367989056087552, - "grad_norm": 1.3828657256127301, + "grad_norm": 1.3396895475500945, "learning_rate": 9.684509645784543e-06, - "loss": 0.2219, + "loss": 0.2216, "step": 831 }, { "epoch": 0.11381668946648427, - "grad_norm": 1.8863349016331346, + "grad_norm": 1.8111442147130636, "learning_rate": 9.683757998012758e-06, - "loss": 0.2458, + "loss": 0.2479, "step": 832 }, { "epoch": 0.11395348837209303, - "grad_norm": 1.3497542103474338, + "grad_norm": 1.2906638685350034, "learning_rate": 9.683005485154677e-06, - "loss": 0.184, + "loss": 0.1839, "step": 833 }, { "epoch": 0.11409028727770178, - "grad_norm": 1.6619010500322116, + "grad_norm": 1.6098136561803467, "learning_rate": 9.682252107349289e-06, - "loss": 0.2207, + "loss": 0.2185, "step": 834 }, { "epoch": 0.11422708618331054, - "grad_norm": 1.4526466439601933, + "grad_norm": 1.4210713201266978, "learning_rate": 9.68149786473574e-06, - "loss": 0.1749, + "loss": 0.174, "step": 835 }, { "epoch": 0.11436388508891929, - "grad_norm": 1.8508978477075413, + "grad_norm": 1.9246835991962437, "learning_rate": 9.68074275745334e-06, - "loss": 0.2586, + "loss": 0.2694, "step": 836 }, { "epoch": 0.11450068399452805, - "grad_norm": 1.5765958544933487, + "grad_norm": 1.743802861896785, "learning_rate": 9.679986785641555e-06, - "loss": 0.1917, + "loss": 0.1886, "step": 837 }, { "epoch": 0.1146374829001368, - "grad_norm": 1.684164190125632, + "grad_norm": 1.6459883719806634, "learning_rate": 9.679229949440015e-06, - "loss": 0.2203, + "loss": 0.2181, "step": 838 }, { "epoch": 0.11477428180574556, - "grad_norm": 1.4801835233339014, + "grad_norm": 1.4169831121626262, "learning_rate": 9.678472248988507e-06, - "loss": 0.2092, + "loss": 0.2087, "step": 839 }, { "epoch": 0.11491108071135431, - "grad_norm": 1.3867269963125233, + "grad_norm": 1.339753044943085, "learning_rate": 9.677713684426973e-06, - "loss": 0.1826, + "loss": 0.1828, "step": 840 }, { "epoch": 0.11504787961696307, - "grad_norm": 1.6931150192134614, + "grad_norm": 1.5921623510507275, "learning_rate": 9.676954255895524e-06, - "loss": 0.2102, + "loss": 0.2045, "step": 841 }, { "epoch": 0.11518467852257182, - "grad_norm": 1.9038427479486677, + "grad_norm": 1.7955045824246827, "learning_rate": 9.676193963534424e-06, - "loss": 0.2489, + "loss": 0.2462, "step": 842 }, { "epoch": 0.11532147742818058, - "grad_norm": 1.2286028202867398, + "grad_norm": 1.1657112730153112, "learning_rate": 9.6754328074841e-06, - "loss": 0.1921, + "loss": 0.1903, "step": 843 }, { "epoch": 0.11545827633378933, - "grad_norm": 1.713618140186367, + "grad_norm": 1.6654212140733697, "learning_rate": 9.674670787885135e-06, - "loss": 0.2201, + "loss": 0.2216, "step": 844 }, { "epoch": 0.11559507523939809, - "grad_norm": 1.584461600900667, + "grad_norm": 1.5287541773422377, "learning_rate": 9.673907904878272e-06, - "loss": 0.2114, + "loss": 0.2129, "step": 845 }, { "epoch": 0.11573187414500684, - "grad_norm": 1.852404149282569, + "grad_norm": 1.784935843359226, "learning_rate": 9.67314415860442e-06, - "loss": 0.3038, + "loss": 0.3001, "step": 846 }, { "epoch": 0.1158686730506156, - "grad_norm": 1.6622382641297349, + "grad_norm": 1.6326095099506672, "learning_rate": 9.672379549204637e-06, - "loss": 0.233, + "loss": 0.2309, "step": 847 }, { "epoch": 0.11600547195622435, - "grad_norm": 1.5407343870908112, + "grad_norm": 1.5361340182713503, "learning_rate": 9.671614076820148e-06, - "loss": 0.2183, + "loss": 0.2175, "step": 848 }, { "epoch": 0.11614227086183311, - "grad_norm": 1.5384173190103485, + "grad_norm": 1.6065310378358415, "learning_rate": 9.670847741592335e-06, - "loss": 0.1895, + "loss": 0.1892, "step": 849 }, { "epoch": 0.11627906976744186, - "grad_norm": 1.51889483815328, + "grad_norm": 1.4902595489742592, "learning_rate": 9.670080543662742e-06, - "loss": 0.2117, + "loss": 0.2136, "step": 850 }, { "epoch": 0.11641586867305062, - "grad_norm": 1.4462997663146042, + "grad_norm": 1.4058243094835094, "learning_rate": 9.669312483173065e-06, - "loss": 0.2067, + "loss": 0.2062, "step": 851 }, { "epoch": 0.11655266757865937, - "grad_norm": 1.7607604943525887, + "grad_norm": 1.787716678600279, "learning_rate": 9.668543560265167e-06, - "loss": 0.2595, + "loss": 0.2601, "step": 852 }, { "epoch": 0.11668946648426813, - "grad_norm": 1.6731483325806882, + "grad_norm": 1.6054168159831927, "learning_rate": 9.667773775081066e-06, - "loss": 0.2294, + "loss": 0.2302, "step": 853 }, { "epoch": 0.11682626538987688, - "grad_norm": 1.756077042500769, + "grad_norm": 1.6934430312276918, "learning_rate": 9.667003127762942e-06, - "loss": 0.2506, + "loss": 0.2487, "step": 854 }, { "epoch": 0.11696306429548564, - "grad_norm": 1.739682462391431, + "grad_norm": 1.6864894860835746, "learning_rate": 9.666231618453135e-06, - "loss": 0.218, + "loss": 0.2197, "step": 855 }, { "epoch": 0.11709986320109439, - "grad_norm": 1.3325915959736137, + "grad_norm": 1.312210891794796, "learning_rate": 9.665459247294137e-06, - "loss": 0.1897, + "loss": 0.1923, "step": 856 }, { "epoch": 0.11723666210670315, - "grad_norm": 1.7205345914197923, + "grad_norm": 1.664521222730214, "learning_rate": 9.664686014428607e-06, - "loss": 0.2792, + "loss": 0.2804, "step": 857 }, { "epoch": 0.1173734610123119, - "grad_norm": 1.695324679333535, + "grad_norm": 1.6478652910908853, "learning_rate": 9.663911919999363e-06, - "loss": 0.2303, + "loss": 0.2274, "step": 858 }, { "epoch": 0.11751025991792066, - "grad_norm": 1.6192927899917449, + "grad_norm": 1.5702844117665318, "learning_rate": 9.663136964149375e-06, - "loss": 0.2151, + "loss": 0.216, "step": 859 }, { "epoch": 0.11764705882352941, - "grad_norm": 1.5122920100545902, + "grad_norm": 1.5043491961969824, "learning_rate": 9.66236114702178e-06, - "loss": 0.2328, + "loss": 0.2347, "step": 860 }, { "epoch": 0.11778385772913817, - "grad_norm": 1.4200371891477785, + "grad_norm": 1.3901140075686311, "learning_rate": 9.66158446875987e-06, - "loss": 0.2118, + "loss": 0.2127, "step": 861 }, { "epoch": 0.11792065663474692, - "grad_norm": 1.6312545115049957, + "grad_norm": 1.5991505312380117, "learning_rate": 9.660806929507096e-06, - "loss": 0.2457, + "loss": 0.2477, "step": 862 }, { "epoch": 0.11805745554035568, - "grad_norm": 1.4383775486292274, + "grad_norm": 1.4653174574518926, "learning_rate": 9.660028529407068e-06, - "loss": 0.231, + "loss": 0.2329, "step": 863 }, { "epoch": 0.11819425444596443, - "grad_norm": 1.3650657062720735, + "grad_norm": 1.3470999917630542, "learning_rate": 9.65924926860356e-06, - "loss": 0.1813, + "loss": 0.181, "step": 864 }, { "epoch": 0.11833105335157319, - "grad_norm": 1.522458671035, + "grad_norm": 1.4655859308777575, "learning_rate": 9.658469147240494e-06, - "loss": 0.2076, + "loss": 0.2056, "step": 865 }, { "epoch": 0.11846785225718194, - "grad_norm": 1.6496080533406514, + "grad_norm": 1.6446087517992736, "learning_rate": 9.657688165461965e-06, - "loss": 0.2292, + "loss": 0.2286, "step": 866 }, { "epoch": 0.1186046511627907, - "grad_norm": 1.8916787627512117, + "grad_norm": 1.8488370274646202, "learning_rate": 9.656906323412216e-06, - "loss": 0.2624, + "loss": 0.2647, "step": 867 }, { "epoch": 0.11874145006839945, - "grad_norm": 1.5818461712675638, + "grad_norm": 1.5233983278015792, "learning_rate": 9.656123621235653e-06, - "loss": 0.1893, + "loss": 0.1903, "step": 868 }, { "epoch": 0.11887824897400821, - "grad_norm": 1.9919221787408534, + "grad_norm": 1.9711205265201497, "learning_rate": 9.655340059076841e-06, - "loss": 0.2503, + "loss": 0.252, "step": 869 }, { "epoch": 0.11901504787961696, - "grad_norm": 1.7177383540220739, + "grad_norm": 1.6803676915858576, "learning_rate": 9.654555637080503e-06, - "loss": 0.246, + "loss": 0.2471, "step": 870 }, { "epoch": 0.11915184678522572, - "grad_norm": 1.4999938862866373, + "grad_norm": 1.4662147340913951, "learning_rate": 9.653770355391518e-06, - "loss": 0.2065, + "loss": 0.2064, "step": 871 }, { "epoch": 0.11928864569083447, - "grad_norm": 1.6016729424798415, + "grad_norm": 1.5511473080855862, "learning_rate": 9.652984214154935e-06, - "loss": 0.2279, + "loss": 0.2221, "step": 872 }, { "epoch": 0.11942544459644323, - "grad_norm": 1.4613185830436586, + "grad_norm": 1.4298308719580435, "learning_rate": 9.652197213515944e-06, - "loss": 0.2149, + "loss": 0.2151, "step": 873 }, { "epoch": 0.11956224350205198, - "grad_norm": 1.8267866671643322, + "grad_norm": 1.78429039528054, "learning_rate": 9.651409353619911e-06, - "loss": 0.2477, + "loss": 0.2496, "step": 874 }, { "epoch": 0.11969904240766074, - "grad_norm": 1.385253742259595, + "grad_norm": 1.3599638625141715, "learning_rate": 9.65062063461235e-06, - "loss": 0.191, + "loss": 0.1895, "step": 875 }, { "epoch": 0.11983584131326949, - "grad_norm": 1.9291510197508965, + "grad_norm": 1.906805063769183, "learning_rate": 9.649831056638933e-06, - "loss": 0.2733, + "loss": 0.274, "step": 876 }, { "epoch": 0.11997264021887825, - "grad_norm": 1.5280299579330967, + "grad_norm": 1.503817521970632, "learning_rate": 9.649040619845502e-06, - "loss": 0.1917, + "loss": 0.1925, "step": 877 }, { "epoch": 0.120109439124487, - "grad_norm": 1.578711268872527, + "grad_norm": 1.5734898010618532, "learning_rate": 9.648249324378044e-06, - "loss": 0.1942, + "loss": 0.1926, "step": 878 }, { "epoch": 0.12024623803009576, - "grad_norm": 1.5744176712547278, + "grad_norm": 1.5577547491508843, "learning_rate": 9.647457170382715e-06, - "loss": 0.2189, + "loss": 0.2204, "step": 879 }, { "epoch": 0.12038303693570451, - "grad_norm": 1.6416632229324932, + "grad_norm": 1.595384123918987, "learning_rate": 9.646664158005823e-06, - "loss": 0.1828, + "loss": 0.182, "step": 880 }, { "epoch": 0.12051983584131327, - "grad_norm": 1.648668227578867, + "grad_norm": 1.6160414510732213, "learning_rate": 9.645870287393835e-06, - "loss": 0.1994, + "loss": 0.1986, "step": 881 }, { "epoch": 0.12065663474692202, - "grad_norm": 1.7738758151649747, + "grad_norm": 1.7498383674240043, "learning_rate": 9.64507555869338e-06, - "loss": 0.2474, + "loss": 0.2467, "step": 882 }, { "epoch": 0.12079343365253079, - "grad_norm": 1.6494500486056587, + "grad_norm": 1.5940770666655906, "learning_rate": 9.644279972051245e-06, - "loss": 0.2361, + "loss": 0.2332, "step": 883 }, { "epoch": 0.12093023255813953, - "grad_norm": 1.5998102371125267, + "grad_norm": 1.5538062338477765, "learning_rate": 9.643483527614372e-06, - "loss": 0.2447, + "loss": 0.2421, "step": 884 }, { "epoch": 0.1210670314637483, - "grad_norm": 1.6478128748668026, + "grad_norm": 1.5861246133521394, "learning_rate": 9.642686225529864e-06, "loss": 0.258, "step": 885 }, { "epoch": 0.12120383036935704, - "grad_norm": 1.5486619887380826, + "grad_norm": 1.6213035208212658, "learning_rate": 9.641888065944985e-06, - "loss": 0.2166, + "loss": 0.2182, "step": 886 }, { "epoch": 0.1213406292749658, - "grad_norm": 1.9861616346554642, + "grad_norm": 2.006468370378643, "learning_rate": 9.641089049007151e-06, - "loss": 0.2648, + "loss": 0.2683, "step": 887 }, { "epoch": 0.12147742818057455, - "grad_norm": 1.7642876956070586, + "grad_norm": 1.6441608967643195, "learning_rate": 9.640289174863941e-06, - "loss": 0.2662, + "loss": 0.2667, "step": 888 }, { "epoch": 0.12161422708618332, - "grad_norm": 1.6792243068939474, + "grad_norm": 1.6351300852311237, "learning_rate": 9.63948844366309e-06, - "loss": 0.2105, + "loss": 0.2107, "step": 889 }, { "epoch": 0.12175102599179206, - "grad_norm": 1.5348628368777353, + "grad_norm": 1.529005181585231, "learning_rate": 9.638686855552494e-06, - "loss": 0.2359, + "loss": 0.2396, "step": 890 }, { "epoch": 0.12188782489740083, - "grad_norm": 1.8384278901829565, + "grad_norm": 1.7176764514633767, "learning_rate": 9.637884410680205e-06, "loss": 0.2304, "step": 891 }, { "epoch": 0.12202462380300957, - "grad_norm": 1.627185060895049, + "grad_norm": 1.599243648876235, "learning_rate": 9.637081109194435e-06, - "loss": 0.2257, + "loss": 0.2287, "step": 892 }, { "epoch": 0.12216142270861834, - "grad_norm": 1.8867136037644385, + "grad_norm": 1.8645710145929535, "learning_rate": 9.63627695124355e-06, - "loss": 0.2835, + "loss": 0.2877, "step": 893 }, { "epoch": 0.12229822161422708, - "grad_norm": 1.492172901302062, + "grad_norm": 1.465745610986922, "learning_rate": 9.635471936976081e-06, - "loss": 0.2457, + "loss": 0.2475, "step": 894 }, { "epoch": 0.12243502051983585, - "grad_norm": 1.4281309693278386, + "grad_norm": 1.4086832403513823, "learning_rate": 9.634666066540713e-06, - "loss": 0.1818, + "loss": 0.1826, "step": 895 }, { "epoch": 0.1225718194254446, - "grad_norm": 1.8288805710675557, + "grad_norm": 1.8102081928090306, "learning_rate": 9.633859340086287e-06, - "loss": 0.2997, + "loss": 0.3007, "step": 896 }, { "epoch": 0.12270861833105336, - "grad_norm": 1.1896455364932113, + "grad_norm": 1.1724413219122458, "learning_rate": 9.633051757761806e-06, - "loss": 0.2076, + "loss": 0.2083, "step": 897 }, { "epoch": 0.1228454172366621, - "grad_norm": 1.8522648505576025, + "grad_norm": 1.8283359678878734, "learning_rate": 9.63224331971643e-06, - "loss": 0.2517, + "loss": 0.2472, "step": 898 }, { "epoch": 0.12298221614227087, - "grad_norm": 1.6626090381797658, + "grad_norm": 1.6323001521573863, "learning_rate": 9.631434026099477e-06, - "loss": 0.2835, + "loss": 0.2816, "step": 899 }, { "epoch": 0.12311901504787962, - "grad_norm": 1.5052562012650716, + "grad_norm": 1.4670478531192521, "learning_rate": 9.630623877060423e-06, - "loss": 0.2274, + "loss": 0.2277, "step": 900 }, { "epoch": 0.12311901504787962, - "eval_loss": 0.21955183148384094, - "eval_runtime": 5.9154, - "eval_samples_per_second": 5.072, - "eval_steps_per_second": 1.352, + "eval_loss": 0.2201305627822876, + "eval_runtime": 5.9233, + "eval_samples_per_second": 5.065, + "eval_steps_per_second": 1.351, "step": 900 }, { "epoch": 0.12325581395348838, - "grad_norm": 1.5183110846186931, + "grad_norm": 1.490112589094878, "learning_rate": 9.629812872748901e-06, - "loss": 0.2154, + "loss": 0.2179, "step": 901 }, { "epoch": 0.12339261285909713, - "grad_norm": 1.3108255085425469, + "grad_norm": 1.28562851124207, "learning_rate": 9.629001013314706e-06, - "loss": 0.1678, + "loss": 0.1711, "step": 902 }, { "epoch": 0.12352941176470589, - "grad_norm": 1.7167089991494238, + "grad_norm": 1.6126251673574044, "learning_rate": 9.628188298907782e-06, - "loss": 0.2171, + "loss": 0.2165, "step": 903 }, { "epoch": 0.12366621067031464, - "grad_norm": 1.6171065086684442, + "grad_norm": 1.6400477279551284, "learning_rate": 9.627374729678241e-06, - "loss": 0.2119, + "loss": 0.2113, "step": 904 }, { "epoch": 0.1238030095759234, - "grad_norm": 1.5937797482692375, + "grad_norm": 1.5334478037243597, "learning_rate": 9.626560305776349e-06, - "loss": 0.2478, + "loss": 0.2492, "step": 905 }, { "epoch": 0.12393980848153215, - "grad_norm": 1.9666078591143696, + "grad_norm": 1.8915702411112432, "learning_rate": 9.625745027352527e-06, - "loss": 0.2423, + "loss": 0.2428, "step": 906 }, { "epoch": 0.12407660738714091, - "grad_norm": 1.5698444202728554, + "grad_norm": 1.5237154059880103, "learning_rate": 9.624928894557355e-06, - "loss": 0.2065, + "loss": 0.206, "step": 907 }, { "epoch": 0.12421340629274966, - "grad_norm": 1.9356297956708635, + "grad_norm": 1.8711951556741766, "learning_rate": 9.624111907541578e-06, - "loss": 0.2332, + "loss": 0.2342, "step": 908 }, { "epoch": 0.12435020519835842, - "grad_norm": 1.7897815634407608, + "grad_norm": 1.7567406463846467, "learning_rate": 9.623294066456088e-06, - "loss": 0.2289, + "loss": 0.2316, "step": 909 }, { "epoch": 0.12448700410396717, - "grad_norm": 1.6679257479306135, + "grad_norm": 1.6381271336524035, "learning_rate": 9.62247537145194e-06, - "loss": 0.2313, + "loss": 0.232, "step": 910 }, { "epoch": 0.12462380300957593, - "grad_norm": 1.5609615978701747, + "grad_norm": 1.512952146053514, "learning_rate": 9.621655822680347e-06, - "loss": 0.2127, + "loss": 0.2101, "step": 911 }, { "epoch": 0.12476060191518468, - "grad_norm": 1.062928339736277, + "grad_norm": 1.0875374506792603, "learning_rate": 9.620835420292678e-06, - "loss": 0.1756, + "loss": 0.1795, "step": 912 }, { "epoch": 0.12489740082079344, - "grad_norm": 1.3601198352065924, + "grad_norm": 1.2850277477174146, "learning_rate": 9.620014164440463e-06, - "loss": 0.1843, + "loss": 0.1821, "step": 913 }, { "epoch": 0.1250341997264022, - "grad_norm": 1.7650964270494764, + "grad_norm": 1.6826014959059135, "learning_rate": 9.619192055275385e-06, - "loss": 0.2078, + "loss": 0.2063, "step": 914 }, { "epoch": 0.12517099863201095, - "grad_norm": 1.4014201991780588, + "grad_norm": 1.3826252721277257, "learning_rate": 9.618369092949289e-06, - "loss": 0.2193, + "loss": 0.2208, "step": 915 }, { "epoch": 0.1253077975376197, - "grad_norm": 1.5192207851262451, + "grad_norm": 1.4924543251406803, "learning_rate": 9.617545277614175e-06, - "loss": 0.237, + "loss": 0.2386, "step": 916 }, { "epoch": 0.12544459644322845, - "grad_norm": 1.7242737936718475, + "grad_norm": 1.69331690635752, "learning_rate": 9.616720609422198e-06, - "loss": 0.2463, + "loss": 0.243, "step": 917 }, { "epoch": 0.12558139534883722, - "grad_norm": 1.4452878609827358, + "grad_norm": 1.42083253243916, "learning_rate": 9.615895088525677e-06, - "loss": 0.2069, + "loss": 0.208, "step": 918 }, { "epoch": 0.12571819425444597, - "grad_norm": 1.8202937449848187, + "grad_norm": 1.7569935985594003, "learning_rate": 9.615068715077084e-06, - "loss": 0.2253, + "loss": 0.2292, "step": 919 }, { "epoch": 0.12585499316005472, - "grad_norm": 1.6410462328154949, + "grad_norm": 1.5822495741049925, "learning_rate": 9.61424148922905e-06, - "loss": 0.2397, + "loss": 0.2394, "step": 920 }, { "epoch": 0.12599179206566347, - "grad_norm": 1.4576406195577645, + "grad_norm": 1.4327563747802292, "learning_rate": 9.61341341113436e-06, - "loss": 0.2069, + "loss": 0.206, "step": 921 }, { "epoch": 0.12612859097127224, - "grad_norm": 1.7963758101627052, + "grad_norm": 1.765079134177624, "learning_rate": 9.612584480945964e-06, - "loss": 0.2424, + "loss": 0.2393, "step": 922 }, { "epoch": 0.126265389876881, - "grad_norm": 1.4275272269912347, + "grad_norm": 1.42646601851578, "learning_rate": 9.611754698816961e-06, - "loss": 0.2121, + "loss": 0.213, "step": 923 }, { "epoch": 0.12640218878248974, - "grad_norm": 1.6450383579143384, + "grad_norm": 1.6281437771069647, "learning_rate": 9.610924064900615e-06, - "loss": 0.2258, + "loss": 0.2235, "step": 924 }, { "epoch": 0.1265389876880985, - "grad_norm": 1.5134250158724822, + "grad_norm": 1.4903897315632797, "learning_rate": 9.610092579350339e-06, - "loss": 0.2031, + "loss": 0.2018, "step": 925 }, { "epoch": 0.12667578659370726, - "grad_norm": 1.7067326169032244, + "grad_norm": 1.6363507063037082, "learning_rate": 9.60926024231971e-06, - "loss": 0.2253, + "loss": 0.2247, "step": 926 }, { "epoch": 0.126812585499316, - "grad_norm": 1.5259897642694245, + "grad_norm": 1.473207752990306, "learning_rate": 9.60842705396246e-06, - "loss": 0.2002, + "loss": 0.1973, "step": 927 }, { "epoch": 0.12694938440492476, - "grad_norm": 1.8797920260331087, + "grad_norm": 1.8478197790791695, "learning_rate": 9.607593014432478e-06, - "loss": 0.2207, + "loss": 0.2195, "step": 928 }, { "epoch": 0.1270861833105335, - "grad_norm": 1.7168478468283084, + "grad_norm": 1.7029218301789375, "learning_rate": 9.606758123883808e-06, - "loss": 0.2338, + "loss": 0.2345, "step": 929 }, { "epoch": 0.12722298221614228, - "grad_norm": 1.7486859078442503, + "grad_norm": 1.6710257657622838, "learning_rate": 9.605922382470659e-06, - "loss": 0.2543, + "loss": 0.2542, "step": 930 }, { "epoch": 0.12735978112175103, - "grad_norm": 1.8212938447937685, + "grad_norm": 1.7856782614849478, "learning_rate": 9.605085790347386e-06, - "loss": 0.2495, + "loss": 0.251, "step": 931 }, { "epoch": 0.12749658002735978, - "grad_norm": 1.6446567860172872, + "grad_norm": 1.6013659223801684, "learning_rate": 9.60424834766851e-06, - "loss": 0.2016, + "loss": 0.1991, "step": 932 }, { "epoch": 0.12763337893296853, - "grad_norm": 1.4815244706721038, + "grad_norm": 1.4817080066840833, "learning_rate": 9.603410054588706e-06, - "loss": 0.18, + "loss": 0.1819, "step": 933 }, { "epoch": 0.1277701778385773, - "grad_norm": 2.0959753717351703, + "grad_norm": 1.9961825188133397, "learning_rate": 9.602570911262805e-06, - "loss": 0.2204, + "loss": 0.2185, "step": 934 }, { "epoch": 0.12790697674418605, - "grad_norm": 1.6185423885937362, + "grad_norm": 1.5928799369069724, "learning_rate": 9.601730917845798e-06, - "loss": 0.2263, + "loss": 0.227, "step": 935 }, { "epoch": 0.1280437756497948, - "grad_norm": 1.4452636964001169, + "grad_norm": 1.4140081987319977, "learning_rate": 9.600890074492828e-06, - "loss": 0.2045, + "loss": 0.2057, "step": 936 }, { "epoch": 0.12818057455540355, - "grad_norm": 1.6256175323314608, + "grad_norm": 1.597233288799531, "learning_rate": 9.600048381359201e-06, - "loss": 0.2407, + "loss": 0.2383, "step": 937 }, { "epoch": 0.12831737346101232, - "grad_norm": 1.4975114451928964, + "grad_norm": 1.5187779770263858, "learning_rate": 9.599205838600377e-06, - "loss": 0.2189, + "loss": 0.2211, "step": 938 }, { "epoch": 0.12845417236662107, - "grad_norm": 1.660379990692983, + "grad_norm": 1.6324994841941027, "learning_rate": 9.59836244637197e-06, - "loss": 0.2455, + "loss": 0.2462, "step": 939 }, { "epoch": 0.12859097127222982, - "grad_norm": 1.5145707980225354, + "grad_norm": 1.4866317718934565, "learning_rate": 9.597518204829755e-06, - "loss": 0.1937, + "loss": 0.1956, "step": 940 }, { "epoch": 0.12872777017783857, - "grad_norm": 1.5462458168696454, + "grad_norm": 1.5500472889352395, "learning_rate": 9.596673114129665e-06, - "loss": 0.1906, + "loss": 0.1914, "step": 941 }, { "epoch": 0.12886456908344734, - "grad_norm": 1.5944868841780033, + "grad_norm": 1.6005593747060969, "learning_rate": 9.595827174427785e-06, - "loss": 0.2085, + "loss": 0.2115, "step": 942 }, { "epoch": 0.1290013679890561, - "grad_norm": 1.4829044397063607, + "grad_norm": 1.482645509406993, "learning_rate": 9.594980385880361e-06, - "loss": 0.1866, + "loss": 0.1903, "step": 943 }, { "epoch": 0.12913816689466484, - "grad_norm": 1.7477503314532488, + "grad_norm": 1.7153851898613546, "learning_rate": 9.594132748643793e-06, - "loss": 0.2536, + "loss": 0.2553, "step": 944 }, { "epoch": 0.1292749658002736, - "grad_norm": 1.8507054629854947, + "grad_norm": 1.7565860145775698, "learning_rate": 9.59328426287464e-06, - "loss": 0.3041, + "loss": 0.3081, "step": 945 }, { "epoch": 0.12941176470588237, - "grad_norm": 1.6182703912843455, + "grad_norm": 1.5558112436160965, "learning_rate": 9.592434928729617e-06, - "loss": 0.2379, + "loss": 0.2363, "step": 946 }, { "epoch": 0.1295485636114911, - "grad_norm": 1.600653205967049, + "grad_norm": 1.534057111316954, "learning_rate": 9.591584746365591e-06, - "loss": 0.2431, + "loss": 0.2451, "step": 947 }, { "epoch": 0.12968536251709986, - "grad_norm": 1.478823956470237, + "grad_norm": 1.4590172963042192, "learning_rate": 9.590733715939596e-06, - "loss": 0.2051, + "loss": 0.2084, "step": 948 }, { "epoch": 0.1298221614227086, - "grad_norm": 1.8585655745964977, + "grad_norm": 1.776425375971605, "learning_rate": 9.589881837608814e-06, - "loss": 0.1966, + "loss": 0.1967, "step": 949 }, { "epoch": 0.12995896032831739, - "grad_norm": 1.3526888714099574, + "grad_norm": 1.3043773554985678, "learning_rate": 9.589029111530585e-06, - "loss": 0.2076, + "loss": 0.2057, "step": 950 }, { "epoch": 0.13009575923392613, - "grad_norm": 1.5393652833367206, + "grad_norm": 1.507606015215302, "learning_rate": 9.588175537862409e-06, - "loss": 0.2266, + "loss": 0.2279, "step": 951 }, { "epoch": 0.13023255813953488, - "grad_norm": 1.8689307635167318, + "grad_norm": 1.8020665657108281, "learning_rate": 9.587321116761938e-06, - "loss": 0.2519, + "loss": 0.2537, "step": 952 }, { "epoch": 0.13036935704514363, - "grad_norm": 1.484198274733043, + "grad_norm": 1.4540055825618403, "learning_rate": 9.586465848386985e-06, - "loss": 0.2209, + "loss": 0.2194, "step": 953 }, { "epoch": 0.1305061559507524, - "grad_norm": 1.6538825446721455, + "grad_norm": 1.6291073118274275, "learning_rate": 9.585609732895518e-06, - "loss": 0.2294, + "loss": 0.2322, "step": 954 }, { "epoch": 0.13064295485636115, - "grad_norm": 1.8371943035526308, + "grad_norm": 1.7670353053283911, "learning_rate": 9.584752770445658e-06, - "loss": 0.2466, + "loss": 0.2471, "step": 955 }, { "epoch": 0.1307797537619699, - "grad_norm": 1.5528776053259226, + "grad_norm": 1.5599039099716927, "learning_rate": 9.583894961195687e-06, - "loss": 0.2237, + "loss": 0.228, "step": 956 }, { "epoch": 0.13091655266757865, - "grad_norm": 1.5866724978235067, + "grad_norm": 1.5473808215655591, "learning_rate": 9.583036305304042e-06, - "loss": 0.2217, + "loss": 0.2215, "step": 957 }, { "epoch": 0.13105335157318743, - "grad_norm": 1.6293593364701482, + "grad_norm": 1.499069414995805, "learning_rate": 9.582176802929314e-06, - "loss": 0.1977, + "loss": 0.1962, "step": 958 }, { "epoch": 0.13119015047879617, - "grad_norm": 1.5073193351313185, + "grad_norm": 1.446768980325549, "learning_rate": 9.581316454230256e-06, - "loss": 0.1989, + "loss": 0.1993, "step": 959 }, { "epoch": 0.13132694938440492, - "grad_norm": 1.773335751608117, + "grad_norm": 1.745870102130804, "learning_rate": 9.58045525936577e-06, - "loss": 0.2632, + "loss": 0.2686, "step": 960 }, { "epoch": 0.13146374829001367, - "grad_norm": 1.6561407562518293, + "grad_norm": 1.6399804440805315, "learning_rate": 9.57959321849492e-06, - "loss": 0.2138, + "loss": 0.2139, "step": 961 }, { "epoch": 0.13160054719562245, - "grad_norm": 1.5815324125940686, + "grad_norm": 1.5344363311966607, "learning_rate": 9.578730331776924e-06, - "loss": 0.2111, + "loss": 0.2129, "step": 962 }, { "epoch": 0.1317373461012312, - "grad_norm": 1.6031851442118317, + "grad_norm": 1.5579167902845577, "learning_rate": 9.577866599371156e-06, - "loss": 0.2073, + "loss": 0.2078, "step": 963 }, { "epoch": 0.13187414500683994, - "grad_norm": 1.58979848098135, + "grad_norm": 1.6369857927887848, "learning_rate": 9.577002021437147e-06, - "loss": 0.2041, + "loss": 0.2061, "step": 964 }, { "epoch": 0.1320109439124487, - "grad_norm": 1.6209494515357994, + "grad_norm": 1.456059507104976, "learning_rate": 9.576136598134584e-06, - "loss": 0.2443, + "loss": 0.2463, "step": 965 }, { "epoch": 0.13214774281805747, - "grad_norm": 1.6403688051939376, + "grad_norm": 1.7219755608691303, "learning_rate": 9.57527032962331e-06, - "loss": 0.2613, + "loss": 0.2671, "step": 966 }, { "epoch": 0.13228454172366622, - "grad_norm": 1.7084272730965546, + "grad_norm": 1.6596791230446477, "learning_rate": 9.574403216063323e-06, - "loss": 0.2076, + "loss": 0.2073, "step": 967 }, { "epoch": 0.13242134062927496, - "grad_norm": 1.6857721175839648, + "grad_norm": 1.6414053888482765, "learning_rate": 9.57353525761478e-06, - "loss": 0.2448, + "loss": 0.2469, "step": 968 }, { "epoch": 0.1325581395348837, - "grad_norm": 1.1226192582521404, + "grad_norm": 1.0385919185298564, "learning_rate": 9.572666454437992e-06, - "loss": 0.1706, + "loss": 0.1695, "step": 969 }, { "epoch": 0.1326949384404925, - "grad_norm": 1.3971844247229994, + "grad_norm": 1.3777849751979179, "learning_rate": 9.571796806693423e-06, - "loss": 0.2256, + "loss": 0.2276, "step": 970 }, { "epoch": 0.13283173734610124, - "grad_norm": 1.7265126423990047, + "grad_norm": 1.657332437967876, "learning_rate": 9.5709263145417e-06, - "loss": 0.2247, + "loss": 0.2254, "step": 971 }, { "epoch": 0.13296853625170998, - "grad_norm": 1.6560150308884922, + "grad_norm": 1.6884567051533843, "learning_rate": 9.570054978143601e-06, - "loss": 0.2434, + "loss": 0.2437, "step": 972 }, { "epoch": 0.13310533515731873, - "grad_norm": 1.555130357564959, + "grad_norm": 1.535244492161775, "learning_rate": 9.569182797660061e-06, - "loss": 0.196, + "loss": 0.199, "step": 973 }, { "epoch": 0.1332421340629275, - "grad_norm": 1.9824140785735045, + "grad_norm": 1.9895497751367694, "learning_rate": 9.568309773252172e-06, - "loss": 0.274, + "loss": 0.2754, "step": 974 }, { "epoch": 0.13337893296853626, - "grad_norm": 1.5668368101849106, + "grad_norm": 1.5886583665876628, "learning_rate": 9.567435905081179e-06, - "loss": 0.2389, + "loss": 0.2388, "step": 975 }, { "epoch": 0.133515731874145, - "grad_norm": 1.4287147381479206, + "grad_norm": 1.4153441421073127, "learning_rate": 9.566561193308486e-06, - "loss": 0.2132, + "loss": 0.2136, "step": 976 }, { "epoch": 0.13365253077975375, - "grad_norm": 2.0186576377988765, + "grad_norm": 1.976273661245743, "learning_rate": 9.56568563809565e-06, - "loss": 0.2732, + "loss": 0.274, "step": 977 }, { "epoch": 0.13378932968536253, - "grad_norm": 1.6495735127187032, + "grad_norm": 1.6202681295340275, "learning_rate": 9.564809239604387e-06, - "loss": 0.2153, + "loss": 0.2163, "step": 978 }, { "epoch": 0.13392612859097128, - "grad_norm": 1.6546314564569051, + "grad_norm": 1.5741379980842334, "learning_rate": 9.563931997996568e-06, - "loss": 0.2543, + "loss": 0.2458, "step": 979 }, { "epoch": 0.13406292749658003, - "grad_norm": 1.4096461831927778, + "grad_norm": 1.334873527760215, "learning_rate": 9.563053913434218e-06, - "loss": 0.198, + "loss": 0.197, "step": 980 }, { "epoch": 0.13419972640218877, - "grad_norm": 1.6925005672478783, + "grad_norm": 1.7017198170472998, "learning_rate": 9.562174986079516e-06, - "loss": 0.2064, + "loss": 0.2077, "step": 981 }, { "epoch": 0.13433652530779755, - "grad_norm": 1.7404300222664382, + "grad_norm": 1.7494068611871947, "learning_rate": 9.5612952160948e-06, - "loss": 0.2823, + "loss": 0.2837, "step": 982 }, { "epoch": 0.1344733242134063, - "grad_norm": 1.8022725111619973, + "grad_norm": 1.7938449946902189, "learning_rate": 9.560414603642567e-06, - "loss": 0.25, + "loss": 0.2503, "step": 983 }, { "epoch": 0.13461012311901505, - "grad_norm": 1.6886589158741279, + "grad_norm": 1.6421813637439577, "learning_rate": 9.559533148885462e-06, - "loss": 0.2775, + "loss": 0.2753, "step": 984 }, { "epoch": 0.1347469220246238, - "grad_norm": 1.4488807296234534, + "grad_norm": 1.4288141061255748, "learning_rate": 9.558650851986288e-06, - "loss": 0.209, + "loss": 0.2097, "step": 985 }, { "epoch": 0.13488372093023257, - "grad_norm": 1.6732859399474986, + "grad_norm": 1.6586827272538343, "learning_rate": 9.557767713108009e-06, - "loss": 0.2542, + "loss": 0.255, "step": 986 }, { "epoch": 0.13502051983584132, - "grad_norm": 1.012512655367708, + "grad_norm": 0.9903591144102767, "learning_rate": 9.556883732413733e-06, - "loss": 0.1557, + "loss": 0.156, "step": 987 }, { "epoch": 0.13515731874145007, - "grad_norm": 1.4791029834306841, + "grad_norm": 1.4448519559266613, "learning_rate": 9.555998910066736e-06, - "loss": 0.2216, + "loss": 0.2222, "step": 988 }, { "epoch": 0.13529411764705881, - "grad_norm": 1.6913303323858668, + "grad_norm": 1.6459664138276606, "learning_rate": 9.555113246230443e-06, - "loss": 0.2505, + "loss": 0.2492, "step": 989 }, { "epoch": 0.1354309165526676, - "grad_norm": 1.3985642833851932, + "grad_norm": 1.3767398126108708, "learning_rate": 9.554226741068433e-06, - "loss": 0.2088, + "loss": 0.2107, "step": 990 }, { "epoch": 0.13556771545827634, - "grad_norm": 1.7671109173253408, + "grad_norm": 1.7358224474503756, "learning_rate": 9.553339394744447e-06, - "loss": 0.2385, + "loss": 0.2417, "step": 991 }, { "epoch": 0.1357045143638851, - "grad_norm": 1.7611157328017053, + "grad_norm": 1.7209078259737942, "learning_rate": 9.552451207422373e-06, - "loss": 0.2703, + "loss": 0.2691, "step": 992 }, { "epoch": 0.13584131326949384, - "grad_norm": 1.4487067799512523, + "grad_norm": 1.4087699601034702, "learning_rate": 9.551562179266261e-06, - "loss": 0.2084, + "loss": 0.206, "step": 993 }, { "epoch": 0.1359781121751026, - "grad_norm": 1.519169826407112, + "grad_norm": 1.4532117767662327, "learning_rate": 9.550672310440313e-06, - "loss": 0.1991, + "loss": 0.1953, "step": 994 }, { "epoch": 0.13611491108071136, - "grad_norm": 1.614002307250497, + "grad_norm": 1.551668013885579, "learning_rate": 9.549781601108885e-06, - "loss": 0.2297, + "loss": 0.2283, "step": 995 }, { "epoch": 0.1362517099863201, - "grad_norm": 1.744703245636844, + "grad_norm": 1.76483725759696, "learning_rate": 9.548890051436494e-06, - "loss": 0.2766, + "loss": 0.2771, "step": 996 }, { "epoch": 0.13638850889192886, - "grad_norm": 1.1858528946450688, + "grad_norm": 1.156355575545727, "learning_rate": 9.547997661587808e-06, - "loss": 0.1719, + "loss": 0.1708, "step": 997 }, { "epoch": 0.13652530779753763, - "grad_norm": 1.3659290708518177, + "grad_norm": 1.287680088165759, "learning_rate": 9.547104431727648e-06, - "loss": 0.1588, + "loss": 0.1559, "step": 998 }, { "epoch": 0.13666210670314638, - "grad_norm": 1.321889913206172, + "grad_norm": 1.3031431324302223, "learning_rate": 9.546210362020995e-06, - "loss": 0.1649, + "loss": 0.1672, "step": 999 }, { "epoch": 0.13679890560875513, - "grad_norm": 1.6347949875112098, + "grad_norm": 1.5709832225548501, "learning_rate": 9.545315452632981e-06, - "loss": 0.2124, + "loss": 0.2108, "step": 1000 }, { "epoch": 0.13679890560875513, - "eval_loss": 0.2186584770679474, - "eval_runtime": 5.9279, - "eval_samples_per_second": 5.061, + "eval_loss": 0.2181551158428192, + "eval_runtime": 5.9267, + "eval_samples_per_second": 5.062, "eval_steps_per_second": 1.35, "step": 1000 }, { "epoch": 0.13693570451436388, - "grad_norm": 1.5618985502336478, + "grad_norm": 1.5225513611047108, "learning_rate": 9.544419703728898e-06, - "loss": 0.2138, + "loss": 0.2146, "step": 1001 }, { "epoch": 0.13707250341997265, - "grad_norm": 1.4909828752013403, + "grad_norm": 1.4662952090457753, "learning_rate": 9.543523115474186e-06, "loss": 0.2053, "step": 1002 }, { "epoch": 0.1372093023255814, - "grad_norm": 1.4606608532233896, + "grad_norm": 1.4900761279487988, "learning_rate": 9.542625688034449e-06, - "loss": 0.2283, + "loss": 0.2298, "step": 1003 }, { "epoch": 0.13734610123119015, - "grad_norm": 1.422121285647258, + "grad_norm": 1.4001212496017552, "learning_rate": 9.541727421575438e-06, - "loss": 0.2075, + "loss": 0.2116, "step": 1004 }, { "epoch": 0.1374829001367989, - "grad_norm": 1.8405159641013809, + "grad_norm": 1.7929049542631557, "learning_rate": 9.540828316263061e-06, - "loss": 0.2, + "loss": 0.1976, "step": 1005 }, { "epoch": 0.13761969904240767, - "grad_norm": 1.307856780057247, + "grad_norm": 1.2758761363489148, "learning_rate": 9.539928372263387e-06, - "loss": 0.1782, + "loss": 0.1773, "step": 1006 }, { "epoch": 0.13775649794801642, - "grad_norm": 1.567538616333544, + "grad_norm": 1.5431380548056126, "learning_rate": 9.539027589742629e-06, - "loss": 0.2344, + "loss": 0.2337, "step": 1007 }, { "epoch": 0.13789329685362517, - "grad_norm": 1.3844681360105495, + "grad_norm": 1.3507990017716465, "learning_rate": 9.538125968867164e-06, - "loss": 0.1992, + "loss": 0.2012, "step": 1008 }, { "epoch": 0.13803009575923392, - "grad_norm": 1.196250372607776, + "grad_norm": 1.1679912123879579, "learning_rate": 9.537223509803522e-06, - "loss": 0.2014, + "loss": 0.2009, "step": 1009 }, { "epoch": 0.1381668946648427, - "grad_norm": 1.4854110450210276, + "grad_norm": 1.4511421924530594, "learning_rate": 9.536320212718381e-06, - "loss": 0.2288, + "loss": 0.2269, "step": 1010 }, { "epoch": 0.13830369357045144, - "grad_norm": 1.2832519152188142, + "grad_norm": 1.269252224120806, "learning_rate": 9.535416077778586e-06, - "loss": 0.1939, + "loss": 0.1941, "step": 1011 }, { "epoch": 0.1384404924760602, - "grad_norm": 1.7758231734359695, + "grad_norm": 1.7082739145555383, "learning_rate": 9.534511105151126e-06, - "loss": 0.2327, + "loss": 0.2308, "step": 1012 }, { "epoch": 0.13857729138166894, - "grad_norm": 1.505220527319113, + "grad_norm": 1.4733020499927585, "learning_rate": 9.53360529500315e-06, - "loss": 0.1941, + "loss": 0.1929, "step": 1013 }, { "epoch": 0.1387140902872777, - "grad_norm": 1.4707191492195506, + "grad_norm": 1.393651913280143, "learning_rate": 9.532698647501959e-06, - "loss": 0.2116, + "loss": 0.2095, "step": 1014 }, { "epoch": 0.13885088919288646, - "grad_norm": 1.394226942494672, + "grad_norm": 1.3570607761611184, "learning_rate": 9.531791162815009e-06, - "loss": 0.2296, + "loss": 0.2275, "step": 1015 }, { "epoch": 0.1389876880984952, - "grad_norm": 1.5910325099554539, + "grad_norm": 1.6003478501014483, "learning_rate": 9.530882841109916e-06, - "loss": 0.1859, + "loss": 0.1889, "step": 1016 }, { "epoch": 0.13912448700410396, - "grad_norm": 1.425435329779327, + "grad_norm": 1.4194594609266924, "learning_rate": 9.529973682554446e-06, - "loss": 0.2001, + "loss": 0.1997, "step": 1017 }, { "epoch": 0.13926128590971273, - "grad_norm": 1.7614890028011916, + "grad_norm": 1.7326165237794104, "learning_rate": 9.529063687316513e-06, - "loss": 0.2475, + "loss": 0.2473, "step": 1018 }, { "epoch": 0.13939808481532148, - "grad_norm": 1.3088490229910243, + "grad_norm": 1.2667606296444267, "learning_rate": 9.5281528555642e-06, - "loss": 0.1703, + "loss": 0.1713, "step": 1019 }, { "epoch": 0.13953488372093023, - "grad_norm": 1.5206047272298657, + "grad_norm": 1.472609328791644, "learning_rate": 9.527241187465735e-06, - "loss": 0.1774, + "loss": 0.1787, "step": 1020 }, { "epoch": 0.13967168262653898, - "grad_norm": 1.4987190461507727, + "grad_norm": 1.4584306862762093, "learning_rate": 9.526328683189498e-06, - "loss": 0.2244, + "loss": 0.2257, "step": 1021 }, { "epoch": 0.13980848153214775, - "grad_norm": 1.5652109607749418, + "grad_norm": 1.5371833877474035, "learning_rate": 9.525415342904034e-06, - "loss": 0.2453, + "loss": 0.2448, "step": 1022 }, { "epoch": 0.1399452804377565, - "grad_norm": 1.271443142843219, + "grad_norm": 1.3021114444779687, "learning_rate": 9.524501166778032e-06, - "loss": 0.1735, + "loss": 0.177, "step": 1023 }, { "epoch": 0.14008207934336525, - "grad_norm": 1.4172600184005644, + "grad_norm": 1.3514013990210474, "learning_rate": 9.523586154980343e-06, - "loss": 0.1863, + "loss": 0.1831, "step": 1024 }, { "epoch": 0.140218878248974, - "grad_norm": 1.4668425493254038, + "grad_norm": 1.440026359784615, "learning_rate": 9.522670307679964e-06, - "loss": 0.2262, + "loss": 0.2254, "step": 1025 }, { "epoch": 0.14035567715458278, - "grad_norm": 1.7452521612619762, + "grad_norm": 1.6845181161273766, "learning_rate": 9.521753625046056e-06, - "loss": 0.2771, + "loss": 0.277, "step": 1026 }, { "epoch": 0.14049247606019152, - "grad_norm": 1.820783977105652, + "grad_norm": 1.7849811747438733, "learning_rate": 9.520836107247928e-06, - "loss": 0.2216, + "loss": 0.2208, "step": 1027 }, { "epoch": 0.14062927496580027, - "grad_norm": 1.2960627920752734, + "grad_norm": 1.2673775175021451, "learning_rate": 9.519917754455043e-06, - "loss": 0.2122, + "loss": 0.2114, "step": 1028 }, { "epoch": 0.14076607387140902, - "grad_norm": 1.6067672631712286, + "grad_norm": 1.5924425135271338, "learning_rate": 9.518998566837023e-06, - "loss": 0.2445, + "loss": 0.2465, "step": 1029 }, { "epoch": 0.1409028727770178, - "grad_norm": 1.9855888288869825, + "grad_norm": 1.9073028870404147, "learning_rate": 9.518078544563639e-06, - "loss": 0.3091, + "loss": 0.3026, "step": 1030 }, { "epoch": 0.14103967168262654, - "grad_norm": 1.8068189330107094, + "grad_norm": 1.7585612646759017, "learning_rate": 9.517157687804819e-06, - "loss": 0.2445, + "loss": 0.2439, "step": 1031 }, { "epoch": 0.1411764705882353, - "grad_norm": 1.6184801148905743, + "grad_norm": 1.5852694888396788, "learning_rate": 9.516235996730645e-06, - "loss": 0.198, + "loss": 0.1982, "step": 1032 }, { "epoch": 0.14131326949384404, - "grad_norm": 1.4307513816936994, + "grad_norm": 1.4110959620618624, "learning_rate": 9.515313471511352e-06, - "loss": 0.246, + "loss": 0.2393, "step": 1033 }, { "epoch": 0.14145006839945282, - "grad_norm": 1.2989396657807273, + "grad_norm": 1.365359333558735, "learning_rate": 9.51439011231733e-06, - "loss": 0.1825, + "loss": 0.1847, "step": 1034 }, { "epoch": 0.14158686730506156, - "grad_norm": 1.7915475167915969, + "grad_norm": 1.7217976358140032, "learning_rate": 9.513465919319122e-06, - "loss": 0.2109, + "loss": 0.2063, "step": 1035 }, { "epoch": 0.1417236662106703, - "grad_norm": 1.4286537043309542, + "grad_norm": 1.4023083534784997, "learning_rate": 9.512540892687427e-06, - "loss": 0.2143, + "loss": 0.2164, "step": 1036 }, { "epoch": 0.14186046511627906, - "grad_norm": 1.282500832401146, + "grad_norm": 1.2211893176077078, "learning_rate": 9.511615032593096e-06, - "loss": 0.1907, + "loss": 0.1882, "step": 1037 }, { "epoch": 0.14199726402188784, - "grad_norm": 1.6196828071022682, + "grad_norm": 1.544276214035704, "learning_rate": 9.510688339207133e-06, - "loss": 0.2309, + "loss": 0.2333, "step": 1038 }, { "epoch": 0.14213406292749658, - "grad_norm": 1.351639859748053, + "grad_norm": 1.3407699884305078, "learning_rate": 9.509760812700702e-06, - "loss": 0.1983, + "loss": 0.1978, "step": 1039 }, { "epoch": 0.14227086183310533, - "grad_norm": 1.1905814776930557, + "grad_norm": 1.158219309033472, "learning_rate": 9.50883245324511e-06, - "loss": 0.1865, + "loss": 0.185, "step": 1040 }, { "epoch": 0.14240766073871408, - "grad_norm": 1.307564542464256, + "grad_norm": 1.263334412814359, "learning_rate": 9.50790326101183e-06, - "loss": 0.2166, + "loss": 0.2138, "step": 1041 }, { "epoch": 0.14254445964432286, - "grad_norm": 1.6717269126922958, + "grad_norm": 1.598201957341862, "learning_rate": 9.50697323617248e-06, - "loss": 0.2324, + "loss": 0.2291, "step": 1042 }, { "epoch": 0.1426812585499316, - "grad_norm": 1.1919451389024507, + "grad_norm": 1.1656927391326248, "learning_rate": 9.506042378898834e-06, - "loss": 0.1746, + "loss": 0.1738, "step": 1043 }, { "epoch": 0.14281805745554035, - "grad_norm": 1.6757868595660124, + "grad_norm": 1.688963568531523, "learning_rate": 9.505110689362825e-06, - "loss": 0.258, + "loss": 0.2599, "step": 1044 }, { "epoch": 0.1429548563611491, - "grad_norm": 1.8379486106553518, + "grad_norm": 1.7950556207115822, "learning_rate": 9.50417816773653e-06, - "loss": 0.2667, + "loss": 0.2665, "step": 1045 }, { "epoch": 0.14309165526675788, - "grad_norm": 1.5910433442646192, + "grad_norm": 1.5041745248422778, "learning_rate": 9.503244814192187e-06, - "loss": 0.236, + "loss": 0.2371, "step": 1046 }, { "epoch": 0.14322845417236663, - "grad_norm": 1.4202406121139561, + "grad_norm": 1.371064606552926, "learning_rate": 9.502310628902188e-06, - "loss": 0.1985, + "loss": 0.1975, "step": 1047 }, { "epoch": 0.14336525307797537, - "grad_norm": 1.2715820835089835, + "grad_norm": 1.2657983910041093, "learning_rate": 9.501375612039074e-06, - "loss": 0.2197, + "loss": 0.2191, "step": 1048 }, { "epoch": 0.14350205198358412, - "grad_norm": 1.4600384975379657, + "grad_norm": 1.417426182630162, "learning_rate": 9.500439763775543e-06, - "loss": 0.2078, + "loss": 0.2074, "step": 1049 }, { "epoch": 0.1436388508891929, - "grad_norm": 1.562006407332479, + "grad_norm": 1.5290243506269547, "learning_rate": 9.499503084284441e-06, - "loss": 0.239, + "loss": 0.2391, "step": 1050 }, { "epoch": 0.14377564979480165, - "grad_norm": 1.2737112364644727, + "grad_norm": 1.2263697785600647, "learning_rate": 9.498565573738778e-06, - "loss": 0.174, + "loss": 0.1748, "step": 1051 }, { "epoch": 0.1439124487004104, - "grad_norm": 1.955210967693477, + "grad_norm": 1.942579496850577, "learning_rate": 9.49762723231171e-06, - "loss": 0.2541, + "loss": 0.2517, "step": 1052 }, { "epoch": 0.14404924760601914, - "grad_norm": 1.792288152918348, + "grad_norm": 1.740424561439871, "learning_rate": 9.496688060176545e-06, - "loss": 0.2456, + "loss": 0.2448, "step": 1053 }, { "epoch": 0.14418604651162792, - "grad_norm": 1.5859348221292402, + "grad_norm": 1.5369249102249307, "learning_rate": 9.49574805750675e-06, - "loss": 0.2184, + "loss": 0.2201, "step": 1054 }, { "epoch": 0.14432284541723667, - "grad_norm": 1.2479074109777146, + "grad_norm": 1.2469695507970358, "learning_rate": 9.494807224475942e-06, - "loss": 0.1841, + "loss": 0.1877, "step": 1055 }, { "epoch": 0.14445964432284542, - "grad_norm": 1.5709680308491873, + "grad_norm": 1.5513606090942726, "learning_rate": 9.493865561257892e-06, - "loss": 0.2184, + "loss": 0.2205, "step": 1056 }, { "epoch": 0.14459644322845416, - "grad_norm": 1.6872094148205992, + "grad_norm": 1.644315501126776, "learning_rate": 9.492923068026524e-06, - "loss": 0.2496, + "loss": 0.2471, "step": 1057 }, { "epoch": 0.14473324213406294, - "grad_norm": 1.4410059816259246, + "grad_norm": 1.4430178208495916, "learning_rate": 9.491979744955917e-06, - "loss": 0.2237, + "loss": 0.2234, "step": 1058 }, { "epoch": 0.1448700410396717, - "grad_norm": 1.381404103036876, + "grad_norm": 1.3559456870973443, "learning_rate": 9.491035592220299e-06, - "loss": 0.2263, + "loss": 0.2253, "step": 1059 }, { "epoch": 0.14500683994528044, - "grad_norm": 1.2258951728667835, + "grad_norm": 1.1913284712549872, "learning_rate": 9.490090609994059e-06, - "loss": 0.186, + "loss": 0.1836, "step": 1060 }, { "epoch": 0.14514363885088918, - "grad_norm": 1.363994358644381, + "grad_norm": 1.3249003116473976, "learning_rate": 9.48914479845173e-06, - "loss": 0.1916, + "loss": 0.1919, "step": 1061 }, { "epoch": 0.14528043775649796, - "grad_norm": 1.825817265955615, + "grad_norm": 1.7816756510496519, "learning_rate": 9.488198157768006e-06, - "loss": 0.2114, + "loss": 0.2128, "step": 1062 }, { "epoch": 0.1454172366621067, - "grad_norm": 1.4060166959738098, + "grad_norm": 1.3648889035529828, "learning_rate": 9.487250688117728e-06, - "loss": 0.223, + "loss": 0.2221, "step": 1063 }, { "epoch": 0.14555403556771546, - "grad_norm": 1.2730906729205822, + "grad_norm": 1.234812195009586, "learning_rate": 9.486302389675894e-06, - "loss": 0.2027, + "loss": 0.2022, "step": 1064 }, { "epoch": 0.1456908344733242, - "grad_norm": 1.1711727593310228, + "grad_norm": 1.147335428497544, "learning_rate": 9.485353262617656e-06, "loss": 0.1938, "step": 1065 }, { "epoch": 0.14582763337893298, - "grad_norm": 1.8121156795453144, + "grad_norm": 1.7512133399348992, "learning_rate": 9.484403307118312e-06, "loss": 0.2284, "step": 1066 }, { "epoch": 0.14596443228454173, - "grad_norm": 1.8480419863498547, + "grad_norm": 1.7996691216053626, "learning_rate": 9.483452523353325e-06, - "loss": 0.275, + "loss": 0.2748, "step": 1067 }, { "epoch": 0.14610123119015048, - "grad_norm": 1.7907410679196167, + "grad_norm": 1.7608410720677097, "learning_rate": 9.482500911498298e-06, - "loss": 0.2319, + "loss": 0.2329, "step": 1068 }, { "epoch": 0.14623803009575922, - "grad_norm": 1.835980964803756, + "grad_norm": 1.836815448915647, "learning_rate": 9.481548471728995e-06, - "loss": 0.2482, + "loss": 0.2537, "step": 1069 }, { "epoch": 0.146374829001368, - "grad_norm": 1.5328366730055099, + "grad_norm": 1.5252785968447298, "learning_rate": 9.480595204221331e-06, - "loss": 0.226, + "loss": 0.2267, "step": 1070 }, { "epoch": 0.14651162790697675, - "grad_norm": 1.4924089229764377, + "grad_norm": 1.4294154793811253, "learning_rate": 9.479641109151373e-06, - "loss": 0.1876, + "loss": 0.1862, "step": 1071 }, { "epoch": 0.1466484268125855, - "grad_norm": 1.2571807734449019, + "grad_norm": 1.233154246021892, "learning_rate": 9.478686186695343e-06, - "loss": 0.2129, + "loss": 0.2132, "step": 1072 }, { "epoch": 0.14678522571819425, - "grad_norm": 1.2923878698488387, + "grad_norm": 1.2747895022654392, "learning_rate": 9.477730437029613e-06, - "loss": 0.1922, + "loss": 0.1928, "step": 1073 }, { "epoch": 0.14692202462380302, - "grad_norm": 1.5053299628448351, + "grad_norm": 1.4581758857143772, "learning_rate": 9.47677386033071e-06, - "loss": 0.1971, + "loss": 0.1954, "step": 1074 }, { "epoch": 0.14705882352941177, - "grad_norm": 1.7154305584573775, + "grad_norm": 1.661723233532023, "learning_rate": 9.475816456775313e-06, - "loss": 0.2172, + "loss": 0.2192, "step": 1075 }, { "epoch": 0.14719562243502052, - "grad_norm": 1.5643614974098106, + "grad_norm": 1.5241015511457205, "learning_rate": 9.474858226540254e-06, - "loss": 0.2154, + "loss": 0.2165, "step": 1076 }, { "epoch": 0.14733242134062927, - "grad_norm": 1.6703449841269022, + "grad_norm": 1.6275368574637898, "learning_rate": 9.473899169802514e-06, - "loss": 0.2447, + "loss": 0.2463, "step": 1077 }, { "epoch": 0.14746922024623804, - "grad_norm": 1.4042405226370827, + "grad_norm": 1.3602433278917534, "learning_rate": 9.472939286739235e-06, - "loss": 0.2025, + "loss": 0.2053, "step": 1078 }, { "epoch": 0.1476060191518468, - "grad_norm": 1.4253319023880946, + "grad_norm": 1.3725880026768595, "learning_rate": 9.471978577527704e-06, - "loss": 0.196, + "loss": 0.1945, "step": 1079 }, { "epoch": 0.14774281805745554, - "grad_norm": 1.505527558647484, + "grad_norm": 1.4520434190250113, "learning_rate": 9.471017042345364e-06, - "loss": 0.175, + "loss": 0.1742, "step": 1080 }, { "epoch": 0.1478796169630643, - "grad_norm": 1.436390359237173, + "grad_norm": 1.5275731446931742, "learning_rate": 9.470054681369808e-06, - "loss": 0.2131, + "loss": 0.222, "step": 1081 }, { "epoch": 0.14801641586867306, - "grad_norm": 1.29298143197028, + "grad_norm": 1.2591730817599465, "learning_rate": 9.469091494778784e-06, - "loss": 0.2123, + "loss": 0.2147, "step": 1082 }, { "epoch": 0.1481532147742818, - "grad_norm": 1.379598493014287, + "grad_norm": 1.3277266915360189, "learning_rate": 9.468127482750196e-06, - "loss": 0.1929, + "loss": 0.1951, "step": 1083 }, { "epoch": 0.14829001367989056, - "grad_norm": 1.4984359229018918, + "grad_norm": 1.4446328268728819, "learning_rate": 9.467162645462088e-06, - "loss": 0.2082, + "loss": 0.2101, "step": 1084 }, { "epoch": 0.1484268125854993, - "grad_norm": 1.512752533182597, + "grad_norm": 1.4801609182566886, "learning_rate": 9.466196983092673e-06, - "loss": 0.2064, + "loss": 0.2053, "step": 1085 }, { "epoch": 0.14856361149110808, - "grad_norm": 1.6288951563832306, + "grad_norm": 1.627188416817548, "learning_rate": 9.465230495820303e-06, - "loss": 0.239, + "loss": 0.241, "step": 1086 }, { "epoch": 0.14870041039671683, - "grad_norm": 1.7735583333854121, + "grad_norm": 1.6894773446541638, "learning_rate": 9.464263183823489e-06, - "loss": 0.2248, + "loss": 0.2236, "step": 1087 }, { "epoch": 0.14883720930232558, - "grad_norm": 1.6559570317976202, + "grad_norm": 1.6211553932147396, "learning_rate": 9.463295047280892e-06, - "loss": 0.2134, + "loss": 0.2148, "step": 1088 }, { "epoch": 0.14897400820793433, - "grad_norm": 1.7340451670863644, + "grad_norm": 1.6963009570949215, "learning_rate": 9.462326086371327e-06, - "loss": 0.2569, + "loss": 0.2566, "step": 1089 }, { "epoch": 0.1491108071135431, - "grad_norm": 1.4696842723208492, + "grad_norm": 1.4931439413369993, "learning_rate": 9.461356301273758e-06, - "loss": 0.2363, + "loss": 0.2377, "step": 1090 }, { "epoch": 0.14924760601915185, - "grad_norm": 1.7176025139436573, + "grad_norm": 1.638281492528195, "learning_rate": 9.460385692167309e-06, - "loss": 0.2626, + "loss": 0.256, "step": 1091 }, { "epoch": 0.1493844049247606, - "grad_norm": 1.559869030944428, + "grad_norm": 1.5058106913845528, "learning_rate": 9.459414259231245e-06, - "loss": 0.2202, + "loss": 0.2179, "step": 1092 }, { "epoch": 0.14952120383036935, - "grad_norm": 1.8535641532638387, + "grad_norm": 1.8095875588683883, "learning_rate": 9.45844200264499e-06, - "loss": 0.2407, + "loss": 0.2395, "step": 1093 }, { "epoch": 0.14965800273597812, - "grad_norm": 1.5462290189423489, + "grad_norm": 1.520089672578454, "learning_rate": 9.45746892258812e-06, - "loss": 0.2405, + "loss": 0.236, "step": 1094 }, { "epoch": 0.14979480164158687, - "grad_norm": 1.629667140550357, + "grad_norm": 1.576222006186045, "learning_rate": 9.456495019240363e-06, - "loss": 0.2317, + "loss": 0.2329, "step": 1095 }, { "epoch": 0.14993160054719562, - "grad_norm": 1.3730173190784432, + "grad_norm": 1.3175084895671265, "learning_rate": 9.455520292781598e-06, - "loss": 0.1812, + "loss": 0.1789, "step": 1096 }, { "epoch": 0.15006839945280437, - "grad_norm": 1.6259220974533162, + "grad_norm": 1.6270938777214738, "learning_rate": 9.454544743391854e-06, - "loss": 0.2838, + "loss": 0.284, "step": 1097 }, { "epoch": 0.15020519835841314, - "grad_norm": 1.222599019438367, + "grad_norm": 1.1648487234343536, "learning_rate": 9.453568371251317e-06, - "loss": 0.1813, + "loss": 0.1799, "step": 1098 }, { "epoch": 0.1503419972640219, - "grad_norm": 1.77566895620792, + "grad_norm": 1.7422166203774876, "learning_rate": 9.452591176540318e-06, - "loss": 0.2398, + "loss": 0.2414, "step": 1099 }, { "epoch": 0.15047879616963064, - "grad_norm": 1.5517951835343442, + "grad_norm": 1.4873461564950203, "learning_rate": 9.45161315943935e-06, - "loss": 0.1941, + "loss": 0.1904, "step": 1100 }, { "epoch": 0.15047879616963064, - "eval_loss": 0.21709683537483215, - "eval_runtime": 5.9247, - "eval_samples_per_second": 5.064, - "eval_steps_per_second": 1.35, + "eval_loss": 0.21724645793437958, + "eval_runtime": 5.9153, + "eval_samples_per_second": 5.072, + "eval_steps_per_second": 1.352, "step": 1100 }, { "epoch": 0.1506155950752394, - "grad_norm": 1.3750763254859117, + "grad_norm": 1.3361381948443796, "learning_rate": 9.450634320129047e-06, - "loss": 0.1996, + "loss": 0.1972, "step": 1101 }, { "epoch": 0.15075239398084817, - "grad_norm": 1.385918216304398, + "grad_norm": 1.3465519159308557, "learning_rate": 9.449654658790201e-06, - "loss": 0.1935, + "loss": 0.1912, "step": 1102 }, { "epoch": 0.1508891928864569, - "grad_norm": 1.579180880587784, + "grad_norm": 1.5412452983800198, "learning_rate": 9.448674175603756e-06, - "loss": 0.2353, + "loss": 0.2342, "step": 1103 }, { "epoch": 0.15102599179206566, - "grad_norm": 1.3862803259984517, + "grad_norm": 1.3731257195721196, "learning_rate": 9.447692870750804e-06, - "loss": 0.2374, + "loss": 0.2377, "step": 1104 }, { "epoch": 0.1511627906976744, - "grad_norm": 1.6332897771118924, + "grad_norm": 1.591596532214332, "learning_rate": 9.446710744412595e-06, - "loss": 0.2068, + "loss": 0.2089, "step": 1105 }, { "epoch": 0.15129958960328319, - "grad_norm": 1.4227601553699611, + "grad_norm": 1.3618903500222994, "learning_rate": 9.445727796770524e-06, - "loss": 0.1977, + "loss": 0.1976, "step": 1106 }, { "epoch": 0.15143638850889193, - "grad_norm": 1.664800172678358, + "grad_norm": 1.6231473718996012, "learning_rate": 9.444744028006141e-06, - "loss": 0.2446, + "loss": 0.2424, "step": 1107 }, { "epoch": 0.15157318741450068, - "grad_norm": 1.5622112879991576, + "grad_norm": 1.5950312573606946, "learning_rate": 9.44375943830115e-06, - "loss": 0.2375, + "loss": 0.2437, "step": 1108 }, { "epoch": 0.15170998632010943, - "grad_norm": 1.566658488182113, + "grad_norm": 1.5536236373186472, "learning_rate": 9.442774027837398e-06, "loss": 0.2123, "step": 1109 }, { "epoch": 0.1518467852257182, - "grad_norm": 1.508712431715731, + "grad_norm": 1.4903841845220505, "learning_rate": 9.441787796796896e-06, - "loss": 0.2409, + "loss": 0.2374, "step": 1110 }, { "epoch": 0.15198358413132695, - "grad_norm": 1.5753177204226514, + "grad_norm": 1.5367563342476152, "learning_rate": 9.440800745361797e-06, - "loss": 0.249, + "loss": 0.2489, "step": 1111 }, { "epoch": 0.1521203830369357, - "grad_norm": 1.6469405391768006, + "grad_norm": 1.631643979665918, "learning_rate": 9.439812873714407e-06, - "loss": 0.2639, + "loss": 0.265, "step": 1112 }, { "epoch": 0.15225718194254445, - "grad_norm": 1.4596118276569963, + "grad_norm": 1.4091690813842228, "learning_rate": 9.438824182037188e-06, - "loss": 0.2327, + "loss": 0.2351, "step": 1113 }, { "epoch": 0.15239398084815323, - "grad_norm": 1.3747512745589623, + "grad_norm": 1.3078297268884207, "learning_rate": 9.437834670512749e-06, - "loss": 0.2249, + "loss": 0.2171, "step": 1114 }, { "epoch": 0.15253077975376197, - "grad_norm": 1.2876169259102728, + "grad_norm": 1.2468938583471014, "learning_rate": 9.436844339323855e-06, - "loss": 0.1919, + "loss": 0.1922, "step": 1115 }, { "epoch": 0.15266757865937072, - "grad_norm": 1.3833035924737438, + "grad_norm": 1.3536941964068576, "learning_rate": 9.435853188653413e-06, - "loss": 0.1758, + "loss": 0.1795, "step": 1116 }, { "epoch": 0.15280437756497947, - "grad_norm": 1.6156168625208633, + "grad_norm": 1.6198305003036926, "learning_rate": 9.434861218684493e-06, - "loss": 0.2432, + "loss": 0.2464, "step": 1117 }, { "epoch": 0.15294117647058825, - "grad_norm": 1.7693459958257711, + "grad_norm": 1.758616013629407, "learning_rate": 9.43386842960031e-06, - "loss": 0.2327, + "loss": 0.2361, "step": 1118 }, { "epoch": 0.153077975376197, - "grad_norm": 1.409205838397046, + "grad_norm": 1.381025687695733, "learning_rate": 9.43287482158423e-06, - "loss": 0.1816, + "loss": 0.1837, "step": 1119 }, { "epoch": 0.15321477428180574, - "grad_norm": 1.6780730277937528, + "grad_norm": 1.6160332460580842, "learning_rate": 9.431880394819774e-06, - "loss": 0.1984, + "loss": 0.1987, "step": 1120 }, { "epoch": 0.1533515731874145, - "grad_norm": 1.1324309219999729, + "grad_norm": 1.1329730673738716, "learning_rate": 9.430885149490609e-06, - "loss": 0.176, + "loss": 0.1793, "step": 1121 }, { "epoch": 0.15348837209302327, - "grad_norm": 1.6898986761036998, + "grad_norm": 1.6840536961034014, "learning_rate": 9.429889085780559e-06, - "loss": 0.2697, + "loss": 0.2706, "step": 1122 }, { "epoch": 0.15362517099863202, - "grad_norm": 1.2700720250242301, + "grad_norm": 1.2376656089323466, "learning_rate": 9.428892203873592e-06, - "loss": 0.195, + "loss": 0.1943, "step": 1123 }, { "epoch": 0.15376196990424076, - "grad_norm": 1.6238149368369126, + "grad_norm": 1.559326296973816, "learning_rate": 9.427894503953836e-06, - "loss": 0.2303, + "loss": 0.2316, "step": 1124 }, { "epoch": 0.1538987688098495, - "grad_norm": 1.7221301410150534, + "grad_norm": 1.6589195067144729, "learning_rate": 9.426895986205562e-06, - "loss": 0.2396, + "loss": 0.2385, "step": 1125 }, { "epoch": 0.1540355677154583, - "grad_norm": 1.3835557156002742, + "grad_norm": 1.3472138517275587, "learning_rate": 9.425896650813196e-06, - "loss": 0.2272, + "loss": 0.2282, "step": 1126 }, { "epoch": 0.15417236662106704, - "grad_norm": 1.7190108336935916, + "grad_norm": 1.6241478002790006, "learning_rate": 9.424896497961316e-06, - "loss": 0.2479, + "loss": 0.2471, "step": 1127 }, { "epoch": 0.15430916552667578, - "grad_norm": 1.535109945190163, + "grad_norm": 1.4833506083774155, "learning_rate": 9.423895527834649e-06, - "loss": 0.1938, + "loss": 0.1915, "step": 1128 }, { "epoch": 0.15444596443228453, - "grad_norm": 1.3704926324847335, + "grad_norm": 1.3218153091755405, "learning_rate": 9.42289374061807e-06, - "loss": 0.1834, + "loss": 0.1817, "step": 1129 }, { "epoch": 0.1545827633378933, - "grad_norm": 1.5141182347869901, + "grad_norm": 1.4763639490429428, "learning_rate": 9.421891136496612e-06, - "loss": 0.2273, + "loss": 0.2269, "step": 1130 }, { "epoch": 0.15471956224350206, - "grad_norm": 1.5511334611665175, + "grad_norm": 1.515191957850312, "learning_rate": 9.420887715655456e-06, - "loss": 0.2046, + "loss": 0.2038, "step": 1131 }, { "epoch": 0.1548563611491108, - "grad_norm": 1.3891328386263153, + "grad_norm": 1.3828810023609757, "learning_rate": 9.41988347827993e-06, - "loss": 0.1855, + "loss": 0.1876, "step": 1132 }, { "epoch": 0.15499316005471955, - "grad_norm": 1.8576090937933838, + "grad_norm": 1.7921116618328266, "learning_rate": 9.418878424555517e-06, - "loss": 0.2994, + "loss": 0.2996, "step": 1133 }, { "epoch": 0.15512995896032833, - "grad_norm": 1.4590086452567894, + "grad_norm": 1.406844684913585, "learning_rate": 9.41787255466785e-06, - "loss": 0.2108, + "loss": 0.2078, "step": 1134 }, { "epoch": 0.15526675786593708, - "grad_norm": 1.6814213585047322, + "grad_norm": 1.5958539622346544, "learning_rate": 9.416865868802713e-06, - "loss": 0.245, + "loss": 0.2412, "step": 1135 }, { "epoch": 0.15540355677154583, - "grad_norm": 1.2997657967557055, + "grad_norm": 1.2966137954082682, "learning_rate": 9.415858367146037e-06, - "loss": 0.1893, + "loss": 0.1924, "step": 1136 }, { "epoch": 0.15554035567715457, - "grad_norm": 1.8679769007763485, + "grad_norm": 1.8121837194106964, "learning_rate": 9.41485004988391e-06, - "loss": 0.2799, + "loss": 0.2797, "step": 1137 }, { "epoch": 0.15567715458276335, - "grad_norm": 1.5083582331674907, + "grad_norm": 1.4603256722095415, "learning_rate": 9.413840917202565e-06, - "loss": 0.1912, + "loss": 0.1906, "step": 1138 }, { "epoch": 0.1558139534883721, - "grad_norm": 1.7265371668452605, + "grad_norm": 1.7390807557673431, "learning_rate": 9.412830969288392e-06, - "loss": 0.2312, + "loss": 0.2365, "step": 1139 }, { "epoch": 0.15595075239398085, - "grad_norm": 1.6621031977124965, + "grad_norm": 1.63215789372762, "learning_rate": 9.411820206327922e-06, - "loss": 0.2751, + "loss": 0.2773, "step": 1140 }, { "epoch": 0.1560875512995896, - "grad_norm": 1.442075701766933, + "grad_norm": 1.4964879167836767, "learning_rate": 9.410808628507846e-06, - "loss": 0.2046, + "loss": 0.2015, "step": 1141 }, { "epoch": 0.15622435020519837, - "grad_norm": 1.5579799316181635, + "grad_norm": 1.5142073368184181, "learning_rate": 9.409796236015e-06, - "loss": 0.2409, + "loss": 0.2413, "step": 1142 }, { "epoch": 0.15636114911080712, - "grad_norm": 1.5389516698591916, + "grad_norm": 1.4842197884538553, "learning_rate": 9.408783029036373e-06, - "loss": 0.219, + "loss": 0.2178, "step": 1143 }, { "epoch": 0.15649794801641587, - "grad_norm": 1.6269143910857016, + "grad_norm": 1.5616910270057844, "learning_rate": 9.407769007759101e-06, - "loss": 0.2129, + "loss": 0.2108, "step": 1144 }, { "epoch": 0.15663474692202461, - "grad_norm": 1.8544948718831467, + "grad_norm": 1.7906124737970632, "learning_rate": 9.406754172370478e-06, - "loss": 0.2254, + "loss": 0.2215, "step": 1145 }, { "epoch": 0.1567715458276334, - "grad_norm": 1.5273045185466516, + "grad_norm": 1.4931373346304166, "learning_rate": 9.405738523057938e-06, - "loss": 0.2295, + "loss": 0.228, "step": 1146 }, { "epoch": 0.15690834473324214, - "grad_norm": 5.34099400284822, + "grad_norm": 1.5745745702841756, "learning_rate": 9.404722060009074e-06, - "loss": 0.2627, + "loss": 0.2436, "step": 1147 }, { "epoch": 0.1570451436388509, - "grad_norm": 1.4583840325115436, + "grad_norm": 1.4322229752208715, "learning_rate": 9.403704783411625e-06, - "loss": 0.1888, + "loss": 0.1926, "step": 1148 }, { "epoch": 0.15718194254445964, - "grad_norm": 1.517742427277904, + "grad_norm": 1.466783640029785, "learning_rate": 9.402686693453479e-06, - "loss": 0.2141, + "loss": 0.2109, "step": 1149 }, { "epoch": 0.1573187414500684, - "grad_norm": 1.4641113376130017, + "grad_norm": 1.3787664862757822, "learning_rate": 9.40166779032268e-06, - "loss": 0.2107, + "loss": 0.2067, "step": 1150 }, { "epoch": 0.15745554035567716, - "grad_norm": 1.724990530231924, + "grad_norm": 1.6909309542822455, "learning_rate": 9.400648074207415e-06, - "loss": 0.201, + "loss": 0.1979, "step": 1151 }, { "epoch": 0.1575923392612859, - "grad_norm": 1.6233705368040188, + "grad_norm": 1.6135509910573635, "learning_rate": 9.399627545296028e-06, - "loss": 0.2347, + "loss": 0.2361, "step": 1152 }, { "epoch": 0.15772913816689466, - "grad_norm": 1.6349895019675864, + "grad_norm": 1.6005299615170612, "learning_rate": 9.398606203777008e-06, - "loss": 0.2355, + "loss": 0.237, "step": 1153 }, { "epoch": 0.15786593707250343, - "grad_norm": 1.4894729122499792, + "grad_norm": 1.4838709428602788, "learning_rate": 9.397584049838996e-06, - "loss": 0.1786, + "loss": 0.1806, "step": 1154 }, { "epoch": 0.15800273597811218, - "grad_norm": 2.0367964535844827, + "grad_norm": 1.954652353582029, "learning_rate": 9.396561083670783e-06, - "loss": 0.2617, + "loss": 0.2576, "step": 1155 }, { "epoch": 0.15813953488372093, - "grad_norm": 1.5234786452689844, + "grad_norm": 1.4791303331553314, "learning_rate": 9.395537305461312e-06, - "loss": 0.184, + "loss": 0.1817, "step": 1156 }, { "epoch": 0.15827633378932968, - "grad_norm": 1.7750716539012223, + "grad_norm": 1.710446181519733, "learning_rate": 9.394512715399671e-06, - "loss": 0.2667, + "loss": 0.2578, "step": 1157 }, { "epoch": 0.15841313269493845, - "grad_norm": 1.4994859989162328, + "grad_norm": 1.4560392994655877, "learning_rate": 9.393487313675103e-06, - "loss": 0.2346, + "loss": 0.2343, "step": 1158 }, { "epoch": 0.1585499316005472, - "grad_norm": 1.4447108282052217, + "grad_norm": 1.4419467541619073, "learning_rate": 9.392461100476997e-06, - "loss": 0.1953, + "loss": 0.1965, "step": 1159 }, { "epoch": 0.15868673050615595, - "grad_norm": 1.5704076045599582, + "grad_norm": 1.5064961546812197, "learning_rate": 9.391434075994896e-06, - "loss": 0.2201, + "loss": 0.2187, "step": 1160 }, { "epoch": 0.1588235294117647, - "grad_norm": 1.3889412162431578, + "grad_norm": 1.3496798834596018, "learning_rate": 9.39040624041849e-06, - "loss": 0.2107, + "loss": 0.2082, "step": 1161 }, { "epoch": 0.15896032831737347, - "grad_norm": 1.4963872674397671, + "grad_norm": 1.480741126910405, "learning_rate": 9.389377593937618e-06, - "loss": 0.2061, + "loss": 0.2059, "step": 1162 }, { "epoch": 0.15909712722298222, - "grad_norm": 1.5621833657268624, + "grad_norm": 1.5436695572476788, "learning_rate": 9.388348136742272e-06, - "loss": 0.2383, + "loss": 0.2413, "step": 1163 }, { "epoch": 0.15923392612859097, - "grad_norm": 1.3525328658344795, + "grad_norm": 1.3524174857665041, "learning_rate": 9.38731786902259e-06, - "loss": 0.1813, + "loss": 0.184, "step": 1164 }, { "epoch": 0.15937072503419972, - "grad_norm": 1.7045651282814431, + "grad_norm": 1.6828822170750133, "learning_rate": 9.386286790968863e-06, - "loss": 0.2312, + "loss": 0.2325, "step": 1165 }, { "epoch": 0.1595075239398085, - "grad_norm": 1.3528645140991338, + "grad_norm": 1.3301929096252243, "learning_rate": 9.38525490277153e-06, - "loss": 0.2413, + "loss": 0.239, "step": 1166 }, { "epoch": 0.15964432284541724, - "grad_norm": 1.3422625031145343, + "grad_norm": 1.3301607784715828, "learning_rate": 9.38422220462118e-06, - "loss": 0.1943, + "loss": 0.1921, "step": 1167 }, { "epoch": 0.159781121751026, - "grad_norm": 1.2589975930959958, + "grad_norm": 1.2080909912772833, "learning_rate": 9.383188696708552e-06, - "loss": 0.1649, + "loss": 0.1629, "step": 1168 }, { "epoch": 0.15991792065663474, - "grad_norm": 1.6711797928254675, + "grad_norm": 1.444264236811869, "learning_rate": 9.382154379224534e-06, - "loss": 0.256, + "loss": 0.2582, "step": 1169 }, { "epoch": 0.1600547195622435, - "grad_norm": 1.499371626198514, + "grad_norm": 1.4857473895816773, "learning_rate": 9.38111925236016e-06, - "loss": 0.229, + "loss": 0.2289, "step": 1170 }, { "epoch": 0.16019151846785226, - "grad_norm": 1.523757726340922, + "grad_norm": 1.490768346021278, "learning_rate": 9.380083316306621e-06, - "loss": 0.2421, + "loss": 0.2424, "step": 1171 }, { "epoch": 0.160328317373461, - "grad_norm": 1.3896614803300034, + "grad_norm": 1.3629749495908403, "learning_rate": 9.379046571255254e-06, - "loss": 0.2131, + "loss": 0.2122, "step": 1172 }, { "epoch": 0.16046511627906976, - "grad_norm": 1.3538658496432188, + "grad_norm": 1.322017842393376, "learning_rate": 9.378009017397542e-06, - "loss": 0.2065, + "loss": 0.2063, "step": 1173 }, { "epoch": 0.16060191518467853, - "grad_norm": 1.5525034141722134, + "grad_norm": 1.5539292898083967, "learning_rate": 9.376970654925124e-06, - "loss": 0.2484, + "loss": 0.2525, "step": 1174 }, { "epoch": 0.16073871409028728, - "grad_norm": 1.7093441082998073, + "grad_norm": 1.6848715507028849, "learning_rate": 9.37593148402978e-06, - "loss": 0.2102, + "loss": 0.2124, "step": 1175 }, { "epoch": 0.16087551299589603, - "grad_norm": 1.6803954184375884, + "grad_norm": 1.6157273477809795, "learning_rate": 9.374891504903449e-06, - "loss": 0.2587, + "loss": 0.2544, "step": 1176 }, { "epoch": 0.16101231190150478, - "grad_norm": 1.3559005512947098, + "grad_norm": 1.334923247991742, "learning_rate": 9.373850717738211e-06, - "loss": 0.2331, + "loss": 0.2308, "step": 1177 }, { "epoch": 0.16114911080711355, - "grad_norm": 1.1734696312698594, + "grad_norm": 1.172780615805379, "learning_rate": 9.372809122726298e-06, - "loss": 0.201, + "loss": 0.2017, "step": 1178 }, { "epoch": 0.1612859097127223, - "grad_norm": 1.2357631943013063, + "grad_norm": 1.2305859161615467, "learning_rate": 9.371766720060095e-06, - "loss": 0.1858, + "loss": 0.1864, "step": 1179 }, { "epoch": 0.16142270861833105, - "grad_norm": 1.635597562472238, + "grad_norm": 1.6083493362245977, "learning_rate": 9.370723509932131e-06, - "loss": 0.2517, + "loss": 0.2524, "step": 1180 }, { "epoch": 0.1615595075239398, - "grad_norm": 1.389047252263227, + "grad_norm": 1.3753615720708616, "learning_rate": 9.369679492535087e-06, - "loss": 0.1988, + "loss": 0.201, "step": 1181 }, { "epoch": 0.16169630642954858, - "grad_norm": 1.572795399662335, + "grad_norm": 1.5431540451618406, "learning_rate": 9.36863466806179e-06, - "loss": 0.2399, + "loss": 0.2448, "step": 1182 }, { "epoch": 0.16183310533515732, - "grad_norm": 1.5122669743736075, + "grad_norm": 1.4915517482961171, "learning_rate": 9.36758903670522e-06, - "loss": 0.2475, + "loss": 0.2498, "step": 1183 }, { "epoch": 0.16196990424076607, - "grad_norm": 1.5824291540290918, + "grad_norm": 1.5494333566487302, "learning_rate": 9.366542598658505e-06, - "loss": 0.2944, + "loss": 0.2951, "step": 1184 }, { "epoch": 0.16210670314637482, - "grad_norm": 1.1844490978059832, + "grad_norm": 1.1614870765516698, "learning_rate": 9.365495354114919e-06, - "loss": 0.1846, + "loss": 0.1858, "step": 1185 }, { "epoch": 0.1622435020519836, - "grad_norm": 1.587171495015454, + "grad_norm": 1.519676635746227, "learning_rate": 9.36444730326789e-06, - "loss": 0.2611, + "loss": 0.2638, "step": 1186 }, { "epoch": 0.16238030095759234, - "grad_norm": 1.555800930287698, + "grad_norm": 1.5536315934820415, "learning_rate": 9.36339844631099e-06, - "loss": 0.1965, + "loss": 0.1964, "step": 1187 }, { "epoch": 0.1625170998632011, - "grad_norm": 1.2952733084347057, + "grad_norm": 1.2922699316402055, "learning_rate": 9.36234878343794e-06, - "loss": 0.1917, + "loss": 0.194, "step": 1188 }, { "epoch": 0.16265389876880984, - "grad_norm": 1.358522617114957, + "grad_norm": 1.776138088263764, "learning_rate": 9.361298314842617e-06, - "loss": 0.193, + "loss": 0.1946, "step": 1189 }, { "epoch": 0.16279069767441862, - "grad_norm": 1.4774478817968792, + "grad_norm": 1.426754319325227, "learning_rate": 9.36024704071904e-06, - "loss": 0.1965, + "loss": 0.1963, "step": 1190 }, { "epoch": 0.16292749658002736, - "grad_norm": 1.7881989141443895, + "grad_norm": 1.6877199296742016, "learning_rate": 9.359194961261375e-06, - "loss": 0.2527, + "loss": 0.2519, "step": 1191 }, { "epoch": 0.1630642954856361, - "grad_norm": 1.3600384953334956, + "grad_norm": 1.3517359720723356, "learning_rate": 9.358142076663943e-06, - "loss": 0.2124, + "loss": 0.2099, "step": 1192 }, { "epoch": 0.16320109439124486, - "grad_norm": 1.364579246130194, + "grad_norm": 1.4071727682997406, "learning_rate": 9.357088387121212e-06, - "loss": 0.2041, + "loss": 0.2072, "step": 1193 }, { "epoch": 0.16333789329685364, - "grad_norm": 1.2063781109961933, + "grad_norm": 1.222707119340245, "learning_rate": 9.356033892827796e-06, - "loss": 0.1564, + "loss": 0.1611, "step": 1194 }, { "epoch": 0.16347469220246238, - "grad_norm": 1.0951175172347, + "grad_norm": 1.161787341024978, "learning_rate": 9.35497859397846e-06, - "loss": 0.15, + "loss": 0.1478, "step": 1195 }, { "epoch": 0.16361149110807113, - "grad_norm": 1.5161583656966149, + "grad_norm": 1.4514368262226776, "learning_rate": 9.353922490768115e-06, - "loss": 0.207, + "loss": 0.2024, "step": 1196 }, { "epoch": 0.16374829001367988, - "grad_norm": 1.5098310521384721, + "grad_norm": 1.515536007283429, "learning_rate": 9.352865583391826e-06, - "loss": 0.2111, + "loss": 0.2121, "step": 1197 }, { "epoch": 0.16388508891928866, - "grad_norm": 1.5709399255745353, + "grad_norm": 1.5118680144849759, "learning_rate": 9.3518078720448e-06, - "loss": 0.2404, + "loss": 0.2344, "step": 1198 }, { "epoch": 0.1640218878248974, - "grad_norm": 1.5366270597335931, + "grad_norm": 1.4930155708168091, "learning_rate": 9.350749356922395e-06, - "loss": 0.2225, + "loss": 0.219, "step": 1199 }, { "epoch": 0.16415868673050615, - "grad_norm": 1.6190006136010542, + "grad_norm": 1.5118303535593365, "learning_rate": 9.349690038220121e-06, - "loss": 0.2056, + "loss": 0.2012, "step": 1200 }, { "epoch": 0.16415868673050615, - "eval_loss": 0.2129599004983902, - "eval_runtime": 5.921, - "eval_samples_per_second": 5.067, + "eval_loss": 0.2130243182182312, + "eval_runtime": 5.9235, + "eval_samples_per_second": 5.065, "eval_steps_per_second": 1.351, "step": 1200 }, { "epoch": 0.1642954856361149, - "grad_norm": 1.369991472016975, + "grad_norm": 1.362229449661951, "learning_rate": 9.348629916133629e-06, - "loss": 0.2041, + "loss": 0.202, "step": 1201 }, { "epoch": 0.16443228454172368, - "grad_norm": 1.4537359689697138, + "grad_norm": 1.4089523497762204, "learning_rate": 9.347568990858726e-06, - "loss": 0.2004, + "loss": 0.2015, "step": 1202 }, { "epoch": 0.16456908344733243, - "grad_norm": 1.167462778225301, + "grad_norm": 1.154626168768556, "learning_rate": 9.346507262591364e-06, - "loss": 0.1787, + "loss": 0.1785, "step": 1203 }, { "epoch": 0.16470588235294117, - "grad_norm": 1.5301484444986426, + "grad_norm": 1.5066296483266601, "learning_rate": 9.345444731527642e-06, - "loss": 0.2115, + "loss": 0.2095, "step": 1204 }, { "epoch": 0.16484268125854992, - "grad_norm": 1.4732285045785822, + "grad_norm": 1.4403852433257291, "learning_rate": 9.34438139786381e-06, - "loss": 0.2259, + "loss": 0.2302, "step": 1205 }, { "epoch": 0.1649794801641587, - "grad_norm": 1.694167349766416, + "grad_norm": 1.6811571055299919, "learning_rate": 9.343317261796262e-06, - "loss": 0.2219, + "loss": 0.2221, "step": 1206 }, { "epoch": 0.16511627906976745, - "grad_norm": 1.5198185817359062, + "grad_norm": 1.4893981698522485, "learning_rate": 9.342252323521546e-06, - "loss": 0.2322, + "loss": 0.2326, "step": 1207 }, { "epoch": 0.1652530779753762, - "grad_norm": 1.2211638793296447, + "grad_norm": 1.2106010373116693, "learning_rate": 9.341186583236355e-06, - "loss": 0.1674, + "loss": 0.1663, "step": 1208 }, { "epoch": 0.16538987688098494, - "grad_norm": 1.589790598511794, + "grad_norm": 1.5727512830841304, "learning_rate": 9.340120041137528e-06, - "loss": 0.271, + "loss": 0.2703, "step": 1209 }, { "epoch": 0.16552667578659372, - "grad_norm": 1.532650601017962, + "grad_norm": 1.4687246092076853, "learning_rate": 9.339052697422057e-06, - "loss": 0.2032, + "loss": 0.2013, "step": 1210 }, { "epoch": 0.16566347469220247, - "grad_norm": 1.2869884500408235, + "grad_norm": 1.29396691133198, "learning_rate": 9.337984552287078e-06, - "loss": 0.1953, + "loss": 0.1937, "step": 1211 }, { "epoch": 0.16580027359781122, - "grad_norm": 1.4605953079847251, + "grad_norm": 1.3886334062867798, "learning_rate": 9.336915605929879e-06, - "loss": 0.2161, + "loss": 0.2151, "step": 1212 }, { "epoch": 0.16593707250341996, - "grad_norm": 1.1826185008863999, + "grad_norm": 1.1615000461434175, "learning_rate": 9.33584585854789e-06, - "loss": 0.1846, + "loss": 0.1832, "step": 1213 }, { "epoch": 0.16607387140902874, - "grad_norm": 1.3046156854119635, + "grad_norm": 1.3110926148399886, "learning_rate": 9.334775310338695e-06, - "loss": 0.1854, + "loss": 0.1869, "step": 1214 }, { "epoch": 0.1662106703146375, - "grad_norm": 1.4146532466236097, + "grad_norm": 1.3869924045582065, "learning_rate": 9.333703961500021e-06, - "loss": 0.2453, + "loss": 0.2415, "step": 1215 }, { "epoch": 0.16634746922024624, - "grad_norm": 1.7291616402388645, + "grad_norm": 1.669875723959687, "learning_rate": 9.332631812229748e-06, - "loss": 0.2668, + "loss": 0.2592, "step": 1216 }, { "epoch": 0.16648426812585498, - "grad_norm": 1.3892016320692224, + "grad_norm": 1.324999637688209, "learning_rate": 9.3315588627259e-06, - "loss": 0.1967, + "loss": 0.1913, "step": 1217 }, { "epoch": 0.16662106703146376, - "grad_norm": 2.0892167370546653, + "grad_norm": 2.0087360475481146, "learning_rate": 9.33048511318665e-06, - "loss": 0.2646, + "loss": 0.2606, "step": 1218 }, { "epoch": 0.1667578659370725, - "grad_norm": 1.4728318061993115, + "grad_norm": 1.424862879918676, "learning_rate": 9.329410563810317e-06, - "loss": 0.2022, + "loss": 0.2009, "step": 1219 }, { "epoch": 0.16689466484268126, - "grad_norm": 1.436831551842231, + "grad_norm": 1.3832911826339107, "learning_rate": 9.328335214795373e-06, - "loss": 0.1931, + "loss": 0.1932, "step": 1220 }, { "epoch": 0.16703146374829, - "grad_norm": 1.8871314018220435, + "grad_norm": 1.8338786880976827, "learning_rate": 9.327259066340428e-06, - "loss": 0.2846, + "loss": 0.2811, "step": 1221 }, { "epoch": 0.16716826265389878, - "grad_norm": 1.4784139863667909, + "grad_norm": 1.4545935383909434, "learning_rate": 9.326182118644254e-06, - "loss": 0.1943, + "loss": 0.1971, "step": 1222 }, { "epoch": 0.16730506155950753, - "grad_norm": 2.0445272894552247, + "grad_norm": 2.1325927696893303, "learning_rate": 9.325104371905756e-06, - "loss": 0.3151, + "loss": 0.3166, "step": 1223 }, { "epoch": 0.16744186046511628, - "grad_norm": 1.2803209384462249, + "grad_norm": 1.247567127692154, "learning_rate": 9.324025826323995e-06, - "loss": 0.1855, + "loss": 0.1849, "step": 1224 }, { "epoch": 0.16757865937072502, - "grad_norm": 1.441421462206133, + "grad_norm": 1.4018525198757257, "learning_rate": 9.322946482098178e-06, - "loss": 0.2397, + "loss": 0.2363, "step": 1225 }, { "epoch": 0.1677154582763338, - "grad_norm": 1.5294379568778436, + "grad_norm": 1.5307953698001675, "learning_rate": 9.321866339427657e-06, - "loss": 0.24, + "loss": 0.2418, "step": 1226 }, { "epoch": 0.16785225718194255, - "grad_norm": 1.521627264663115, + "grad_norm": 1.4808376140010633, "learning_rate": 9.320785398511937e-06, - "loss": 0.226, + "loss": 0.2268, "step": 1227 }, { "epoch": 0.1679890560875513, - "grad_norm": 1.7065485215585066, + "grad_norm": 1.556370805845021, "learning_rate": 9.319703659550663e-06, - "loss": 0.2238, + "loss": 0.2172, "step": 1228 }, { "epoch": 0.16812585499316005, - "grad_norm": 1.5543063050923511, + "grad_norm": 1.5383241609638025, "learning_rate": 9.318621122743636e-06, - "loss": 0.2041, + "loss": 0.2034, "step": 1229 }, { "epoch": 0.16826265389876882, - "grad_norm": 1.5775260168008292, + "grad_norm": 1.5762653366892754, "learning_rate": 9.317537788290794e-06, - "loss": 0.2244, + "loss": 0.2272, "step": 1230 }, { "epoch": 0.16839945280437757, - "grad_norm": 1.1538620045383916, + "grad_norm": 1.1185689211651957, "learning_rate": 9.316453656392232e-06, - "loss": 0.1748, + "loss": 0.171, "step": 1231 }, { "epoch": 0.16853625170998632, - "grad_norm": 1.4701128434457025, + "grad_norm": 1.4333251050358493, "learning_rate": 9.315368727248187e-06, - "loss": 0.2199, + "loss": 0.2188, "step": 1232 }, { "epoch": 0.16867305061559507, - "grad_norm": 1.4826266367092908, + "grad_norm": 1.4288010238341475, "learning_rate": 9.314283001059044e-06, - "loss": 0.1963, + "loss": 0.1913, "step": 1233 }, { "epoch": 0.16880984952120384, - "grad_norm": 1.575547252799686, + "grad_norm": 1.5366257815035267, "learning_rate": 9.313196478025337e-06, - "loss": 0.227, + "loss": 0.2273, "step": 1234 }, { "epoch": 0.1689466484268126, - "grad_norm": 1.5455672446418298, + "grad_norm": 1.5328535905241356, "learning_rate": 9.312109158347746e-06, - "loss": 0.2443, + "loss": 0.2486, "step": 1235 }, { "epoch": 0.16908344733242134, - "grad_norm": 1.2075623539742768, + "grad_norm": 1.1911887174086175, "learning_rate": 9.311021042227095e-06, - "loss": 0.1734, + "loss": 0.1735, "step": 1236 }, { "epoch": 0.1692202462380301, - "grad_norm": 1.5940316371535754, + "grad_norm": 1.473442405105052, "learning_rate": 9.309932129864364e-06, - "loss": 0.2275, + "loss": 0.2263, "step": 1237 }, { "epoch": 0.16935704514363886, - "grad_norm": 1.4921772631681287, + "grad_norm": 1.4570979858930633, "learning_rate": 9.308842421460668e-06, - "loss": 0.1773, + "loss": 0.1735, "step": 1238 }, { "epoch": 0.1694938440492476, - "grad_norm": 1.526883483572539, + "grad_norm": 1.510661747907943, "learning_rate": 9.307751917217278e-06, - "loss": 0.2558, + "loss": 0.2556, "step": 1239 }, { "epoch": 0.16963064295485636, - "grad_norm": 1.7517149610668785, + "grad_norm": 1.6812419499762954, "learning_rate": 9.30666061733561e-06, - "loss": 0.2354, + "loss": 0.2322, "step": 1240 }, { "epoch": 0.1697674418604651, - "grad_norm": 1.3831568159696914, + "grad_norm": 1.311277412240786, "learning_rate": 9.305568522017227e-06, - "loss": 0.1875, + "loss": 0.1849, "step": 1241 }, { "epoch": 0.16990424076607388, - "grad_norm": 1.2877216156890552, + "grad_norm": 1.2619639284196074, "learning_rate": 9.304475631463833e-06, - "loss": 0.1644, + "loss": 0.1629, "step": 1242 }, { "epoch": 0.17004103967168263, - "grad_norm": 1.4711155665930737, + "grad_norm": 1.3878490222028608, "learning_rate": 9.30338194587729e-06, - "loss": 0.2257, + "loss": 0.2262, "step": 1243 }, { "epoch": 0.17017783857729138, - "grad_norm": 1.7579849060987653, + "grad_norm": 1.729002437707095, "learning_rate": 9.302287465459599e-06, - "loss": 0.2469, + "loss": 0.2531, "step": 1244 }, { "epoch": 0.17031463748290013, - "grad_norm": 1.519013534628901, + "grad_norm": 1.59169641936534, "learning_rate": 9.301192190412908e-06, - "loss": 0.2355, + "loss": 0.2438, "step": 1245 }, { "epoch": 0.1704514363885089, - "grad_norm": 1.7533615196888186, + "grad_norm": 1.735325529546744, "learning_rate": 9.300096120939515e-06, - "loss": 0.2921, + "loss": 0.2893, "step": 1246 }, { "epoch": 0.17058823529411765, - "grad_norm": 1.2759257695655797, + "grad_norm": 1.2565007158867265, "learning_rate": 9.298999257241862e-06, - "loss": 0.1744, + "loss": 0.1736, "step": 1247 }, { "epoch": 0.1707250341997264, - "grad_norm": 1.4417960452818919, + "grad_norm": 1.3766236008425974, "learning_rate": 9.297901599522541e-06, - "loss": 0.2275, + "loss": 0.2245, "step": 1248 }, { "epoch": 0.17086183310533515, - "grad_norm": 1.5700746070265177, + "grad_norm": 1.5033177557441604, "learning_rate": 9.296803147984286e-06, - "loss": 0.2006, + "loss": 0.1998, "step": 1249 }, { "epoch": 0.17099863201094392, - "grad_norm": 1.4554906081734649, + "grad_norm": 1.4009404952817817, "learning_rate": 9.29570390282998e-06, - "loss": 0.2, + "loss": 0.2009, "step": 1250 }, { "epoch": 0.17113543091655267, - "grad_norm": 1.4036693427508382, + "grad_norm": 1.3743723446153255, "learning_rate": 9.294603864262655e-06, - "loss": 0.1932, + "loss": 0.1935, "step": 1251 }, { "epoch": 0.17127222982216142, - "grad_norm": 1.5276250388391444, + "grad_norm": 1.5086269051995265, "learning_rate": 9.293503032485485e-06, - "loss": 0.2189, + "loss": 0.2188, "step": 1252 }, { "epoch": 0.17140902872777017, - "grad_norm": 1.5156115856719792, + "grad_norm": 1.4395200191660518, "learning_rate": 9.292401407701795e-06, - "loss": 0.2507, + "loss": 0.2449, "step": 1253 }, { "epoch": 0.17154582763337894, - "grad_norm": 1.5618507056696613, + "grad_norm": 1.5496366826430865, "learning_rate": 9.29129899011505e-06, - "loss": 0.2001, + "loss": 0.1982, "step": 1254 }, { "epoch": 0.1716826265389877, - "grad_norm": 1.6642861987271624, + "grad_norm": 1.6415705161619887, "learning_rate": 9.290195779928872e-06, - "loss": 0.252, + "loss": 0.2579, "step": 1255 }, { "epoch": 0.17181942544459644, - "grad_norm": 1.1829902464870172, + "grad_norm": 1.165004370033264, "learning_rate": 9.289091777347017e-06, - "loss": 0.1753, + "loss": 0.1756, "step": 1256 }, { "epoch": 0.1719562243502052, - "grad_norm": 1.4972937376320943, + "grad_norm": 1.4388234594570792, "learning_rate": 9.287986982573398e-06, - "loss": 0.2247, + "loss": 0.2271, "step": 1257 }, { "epoch": 0.17209302325581396, - "grad_norm": 1.5810909662502426, + "grad_norm": 1.5298216603640933, "learning_rate": 9.286881395812066e-06, - "loss": 0.2122, + "loss": 0.2115, "step": 1258 }, { "epoch": 0.1722298221614227, - "grad_norm": 1.5452339642109056, + "grad_norm": 1.5395939291815213, "learning_rate": 9.285775017267224e-06, - "loss": 0.2264, + "loss": 0.2311, "step": 1259 }, { "epoch": 0.17236662106703146, - "grad_norm": 1.6748951253232867, + "grad_norm": 1.5465611485198776, "learning_rate": 9.28466784714322e-06, - "loss": 0.2284, + "loss": 0.2241, "step": 1260 }, { "epoch": 0.1725034199726402, - "grad_norm": 1.5236171770746, + "grad_norm": 1.5047073074302229, "learning_rate": 9.283559885644546e-06, - "loss": 0.1904, + "loss": 0.1916, "step": 1261 }, { "epoch": 0.17264021887824899, - "grad_norm": 1.7570580804674794, + "grad_norm": 1.7445348524181152, "learning_rate": 9.28245113297584e-06, - "loss": 0.2121, + "loss": 0.2154, "step": 1262 }, { "epoch": 0.17277701778385773, - "grad_norm": 1.2395886272941887, + "grad_norm": 1.2213846541932478, "learning_rate": 9.28134158934189e-06, - "loss": 0.207, + "loss": 0.2049, "step": 1263 }, { "epoch": 0.17291381668946648, - "grad_norm": 1.8599300408748038, + "grad_norm": 1.2799664124305812, "learning_rate": 9.28023125494763e-06, - "loss": 0.1778, + "loss": 0.1746, "step": 1264 }, { "epoch": 0.17305061559507523, - "grad_norm": 1.516207673512894, + "grad_norm": 1.4960099735965946, "learning_rate": 9.279120129998131e-06, - "loss": 0.2097, + "loss": 0.2062, "step": 1265 }, { "epoch": 0.173187414500684, - "grad_norm": 1.4949076148482117, + "grad_norm": 1.4746024178436823, "learning_rate": 9.278008214698624e-06, - "loss": 0.1659, + "loss": 0.1681, "step": 1266 }, { "epoch": 0.17332421340629275, - "grad_norm": 1.77300691261497, + "grad_norm": 1.7256930569685953, "learning_rate": 9.276895509254476e-06, - "loss": 0.231, + "loss": 0.2326, "step": 1267 }, { "epoch": 0.1734610123119015, - "grad_norm": 1.8722152243727535, + "grad_norm": 1.8225342615337166, "learning_rate": 9.275782013871202e-06, - "loss": 0.2682, + "loss": 0.2662, "step": 1268 }, { "epoch": 0.17359781121751025, - "grad_norm": 1.6203959961602468, + "grad_norm": 1.5269589680886724, "learning_rate": 9.274667728754465e-06, - "loss": 0.1947, + "loss": 0.1916, "step": 1269 }, { "epoch": 0.17373461012311903, - "grad_norm": 1.4656202232567763, + "grad_norm": 1.4645580300925751, "learning_rate": 9.273552654110071e-06, - "loss": 0.1983, + "loss": 0.2034, "step": 1270 }, { "epoch": 0.17387140902872777, - "grad_norm": 1.3891947788628305, + "grad_norm": 1.3619761747399095, "learning_rate": 9.272436790143975e-06, - "loss": 0.2068, + "loss": 0.2047, "step": 1271 }, { "epoch": 0.17400820793433652, - "grad_norm": 1.1926006372042934, + "grad_norm": 1.1773301904207512, "learning_rate": 9.271320137062276e-06, - "loss": 0.1794, + "loss": 0.179, "step": 1272 }, { "epoch": 0.17414500683994527, - "grad_norm": 1.4262288538558823, + "grad_norm": 1.3890731906567364, "learning_rate": 9.270202695071218e-06, - "loss": 0.2096, + "loss": 0.2081, "step": 1273 }, { "epoch": 0.17428180574555405, - "grad_norm": 1.4330646223726673, + "grad_norm": 1.4135548615576239, "learning_rate": 9.26908446437719e-06, - "loss": 0.2281, + "loss": 0.2321, "step": 1274 }, { "epoch": 0.1744186046511628, - "grad_norm": 1.4452573134093012, + "grad_norm": 1.4278173928984061, "learning_rate": 9.267965445186733e-06, - "loss": 0.182, + "loss": 0.1813, "step": 1275 }, { "epoch": 0.17455540355677154, - "grad_norm": 1.2186249341508963, + "grad_norm": 1.2528898913996198, "learning_rate": 9.266845637706526e-06, - "loss": 0.1658, + "loss": 0.1689, "step": 1276 }, { "epoch": 0.1746922024623803, - "grad_norm": 1.4029176556297724, + "grad_norm": 1.404215973333751, "learning_rate": 9.265725042143396e-06, - "loss": 0.2072, + "loss": 0.2141, "step": 1277 }, { "epoch": 0.17482900136798907, - "grad_norm": 1.5391377463424558, + "grad_norm": 1.6801680309238216, "learning_rate": 9.264603658704317e-06, - "loss": 0.2058, + "loss": 0.2051, "step": 1278 }, { "epoch": 0.17496580027359782, - "grad_norm": 1.8371246421117193, + "grad_norm": 1.8478410030044852, "learning_rate": 9.263481487596408e-06, - "loss": 0.2507, + "loss": 0.2545, "step": 1279 }, { "epoch": 0.17510259917920656, - "grad_norm": 1.4985010385654283, + "grad_norm": 1.4356677069578567, "learning_rate": 9.262358529026934e-06, - "loss": 0.2409, + "loss": 0.2382, "step": 1280 }, { "epoch": 0.1752393980848153, - "grad_norm": 1.430710949177945, + "grad_norm": 1.4635675285418541, "learning_rate": 9.2612347832033e-06, - "loss": 0.2134, + "loss": 0.2122, "step": 1281 }, { "epoch": 0.1753761969904241, - "grad_norm": 1.3863932327767097, + "grad_norm": 1.3277359771591628, "learning_rate": 9.260110250333066e-06, - "loss": 0.1985, + "loss": 0.1962, "step": 1282 }, { "epoch": 0.17551299589603284, - "grad_norm": 1.5455076707211746, + "grad_norm": 1.522930288657675, "learning_rate": 9.25898493062393e-06, - "loss": 0.2222, + "loss": 0.2225, "step": 1283 }, { "epoch": 0.17564979480164158, - "grad_norm": 1.4573137442878301, + "grad_norm": 1.4452517251203314, "learning_rate": 9.257858824283738e-06, - "loss": 0.2197, + "loss": 0.2236, "step": 1284 }, { "epoch": 0.17578659370725033, - "grad_norm": 1.4458310340635079, + "grad_norm": 1.3936919648144983, "learning_rate": 9.256731931520481e-06, - "loss": 0.1739, + "loss": 0.1745, "step": 1285 }, { "epoch": 0.1759233926128591, - "grad_norm": 1.3352534247455103, + "grad_norm": 1.3084703357273655, "learning_rate": 9.255604252542296e-06, - "loss": 0.2004, + "loss": 0.1997, "step": 1286 }, { "epoch": 0.17606019151846786, - "grad_norm": 1.5693886092188, + "grad_norm": 1.5139756305541814, "learning_rate": 9.254475787557464e-06, - "loss": 0.213, + "loss": 0.2105, "step": 1287 }, { "epoch": 0.1761969904240766, - "grad_norm": 1.6286239589510798, + "grad_norm": 1.576269305867267, "learning_rate": 9.25334653677441e-06, - "loss": 0.229, + "loss": 0.2311, "step": 1288 }, { "epoch": 0.17633378932968535, - "grad_norm": 1.4938404997871821, + "grad_norm": 1.4549283003543176, "learning_rate": 9.252216500401706e-06, - "loss": 0.1994, + "loss": 0.2007, "step": 1289 }, { "epoch": 0.17647058823529413, - "grad_norm": 1.420379516035935, + "grad_norm": 1.3599931446666664, "learning_rate": 9.251085678648072e-06, - "loss": 0.2352, + "loss": 0.2345, "step": 1290 }, { "epoch": 0.17660738714090288, - "grad_norm": 1.6640842800362645, + "grad_norm": 1.6538807798202262, "learning_rate": 9.249954071722366e-06, - "loss": 0.2682, + "loss": 0.2679, "step": 1291 }, { "epoch": 0.17674418604651163, - "grad_norm": 1.360398914226448, + "grad_norm": 1.343998019896755, "learning_rate": 9.248821679833596e-06, - "loss": 0.2421, + "loss": 0.244, "step": 1292 }, { "epoch": 0.17688098495212037, - "grad_norm": 1.461676107576042, + "grad_norm": 1.435374239945721, "learning_rate": 9.247688503190915e-06, - "loss": 0.2343, + "loss": 0.2339, "step": 1293 }, { "epoch": 0.17701778385772915, - "grad_norm": 1.5607023300774965, + "grad_norm": 1.601686219975922, "learning_rate": 9.246554542003618e-06, - "loss": 0.2185, + "loss": 0.2114, "step": 1294 }, { "epoch": 0.1771545827633379, - "grad_norm": 1.3285206250395036, + "grad_norm": 1.296645154290606, "learning_rate": 9.245419796481148e-06, - "loss": 0.2122, + "loss": 0.2089, "step": 1295 }, { "epoch": 0.17729138166894665, - "grad_norm": 1.4707977142277768, + "grad_norm": 1.433946814706979, "learning_rate": 9.244284266833092e-06, - "loss": 0.2123, + "loss": 0.2109, "step": 1296 }, { "epoch": 0.1774281805745554, - "grad_norm": 1.5693301036469243, + "grad_norm": 1.574119514948042, "learning_rate": 9.24314795326918e-06, - "loss": 0.2554, + "loss": 0.2569, "step": 1297 }, { "epoch": 0.17756497948016417, - "grad_norm": 1.4881257550121272, + "grad_norm": 1.4680471313064416, "learning_rate": 9.24201085599929e-06, - "loss": 0.2123, + "loss": 0.2119, "step": 1298 }, { "epoch": 0.17770177838577292, - "grad_norm": 1.5608048254555529, + "grad_norm": 1.5511147590263743, "learning_rate": 9.240872975233442e-06, - "loss": 0.2432, + "loss": 0.2425, "step": 1299 }, { "epoch": 0.17783857729138167, - "grad_norm": 1.5829790829262222, + "grad_norm": 1.5535267581222727, "learning_rate": 9.2397343111818e-06, - "loss": 0.2045, + "loss": 0.2042, "step": 1300 }, { "epoch": 0.17783857729138167, - "eval_loss": 0.21455861628055573, - "eval_runtime": 5.932, - "eval_samples_per_second": 5.057, - "eval_steps_per_second": 1.349, + "eval_loss": 0.21434584259986877, + "eval_runtime": 5.9185, + "eval_samples_per_second": 5.069, + "eval_steps_per_second": 1.352, "step": 1300 }, { "epoch": 0.17797537619699041, - "grad_norm": 1.6926806352973232, + "grad_norm": 1.6695555737252685, "learning_rate": 9.238594864054676e-06, - "loss": 0.2201, + "loss": 0.2202, "step": 1301 }, { "epoch": 0.1781121751025992, - "grad_norm": 1.7093978214567809, + "grad_norm": 1.6817824196064823, "learning_rate": 9.237454634062527e-06, - "loss": 0.2294, + "loss": 0.226, "step": 1302 }, { "epoch": 0.17824897400820794, - "grad_norm": 1.1597349792566005, + "grad_norm": 1.161116240093073, "learning_rate": 9.236313621415946e-06, - "loss": 0.1785, + "loss": 0.1783, "step": 1303 }, { "epoch": 0.1783857729138167, - "grad_norm": 1.510045429366461, + "grad_norm": 1.5017811170585609, "learning_rate": 9.235171826325687e-06, - "loss": 0.2178, + "loss": 0.2171, "step": 1304 }, { "epoch": 0.17852257181942544, - "grad_norm": 1.507682144957636, + "grad_norm": 1.5029059814322339, "learning_rate": 9.234029249002629e-06, - "loss": 0.1902, + "loss": 0.1912, "step": 1305 }, { "epoch": 0.1786593707250342, - "grad_norm": 1.5379272753939626, + "grad_norm": 1.50921176240891, "learning_rate": 9.23288588965781e-06, - "loss": 0.2225, + "loss": 0.2218, "step": 1306 }, { "epoch": 0.17879616963064296, - "grad_norm": 1.3728082035050282, + "grad_norm": 1.371098127220154, "learning_rate": 9.231741748502408e-06, - "loss": 0.2054, + "loss": 0.2064, "step": 1307 }, { "epoch": 0.1789329685362517, - "grad_norm": 1.5045419986786726, + "grad_norm": 1.4904136167241941, "learning_rate": 9.230596825747741e-06, - "loss": 0.1953, + "loss": 0.1955, "step": 1308 }, { "epoch": 0.17906976744186046, - "grad_norm": 1.5399576854577013, + "grad_norm": 1.5311306793263353, "learning_rate": 9.229451121605279e-06, - "loss": 0.2441, + "loss": 0.2486, "step": 1309 }, { "epoch": 0.17920656634746923, - "grad_norm": 1.4475507113715083, + "grad_norm": 1.4269848442447182, "learning_rate": 9.228304636286634e-06, - "loss": 0.2444, + "loss": 0.2449, "step": 1310 }, { "epoch": 0.17934336525307798, - "grad_norm": 1.2601864982214985, + "grad_norm": 1.2578391642417956, "learning_rate": 9.227157370003557e-06, - "loss": 0.1665, + "loss": 0.1671, "step": 1311 }, { "epoch": 0.17948016415868673, - "grad_norm": 1.3403456653655308, + "grad_norm": 1.346550885554387, "learning_rate": 9.226009322967948e-06, - "loss": 0.2242, + "loss": 0.2277, "step": 1312 }, { "epoch": 0.17961696306429548, - "grad_norm": 1.3879291134851526, + "grad_norm": 1.3628935112439546, "learning_rate": 9.22486049539185e-06, - "loss": 0.2074, + "loss": 0.2077, "step": 1313 }, { "epoch": 0.17975376196990425, - "grad_norm": 1.8465173501803462, + "grad_norm": 1.8608589964939182, "learning_rate": 9.223710887487454e-06, - "loss": 0.2147, + "loss": 0.2162, "step": 1314 }, { "epoch": 0.179890560875513, - "grad_norm": 1.4567584821625144, + "grad_norm": 1.406293825629368, "learning_rate": 9.222560499467088e-06, - "loss": 0.1849, + "loss": 0.1848, "step": 1315 }, { "epoch": 0.18002735978112175, - "grad_norm": 1.4886012084476905, + "grad_norm": 1.4463831350787464, "learning_rate": 9.221409331543228e-06, - "loss": 0.1925, + "loss": 0.1923, "step": 1316 }, { "epoch": 0.1801641586867305, - "grad_norm": 1.4413809293406796, + "grad_norm": 1.4286398534737135, "learning_rate": 9.220257383928498e-06, - "loss": 0.1999, + "loss": 0.1968, "step": 1317 }, { "epoch": 0.18030095759233927, - "grad_norm": 1.464911710594197, + "grad_norm": 1.443652295450382, "learning_rate": 9.219104656835653e-06, - "loss": 0.2178, + "loss": 0.2184, "step": 1318 }, { "epoch": 0.18043775649794802, - "grad_norm": 1.5827004704292473, + "grad_norm": 1.52832188053187, "learning_rate": 9.21795115047761e-06, - "loss": 0.2261, + "loss": 0.2231, "step": 1319 }, { "epoch": 0.18057455540355677, - "grad_norm": 1.2800310745059411, + "grad_norm": 1.2403498380093492, "learning_rate": 9.216796865067417e-06, - "loss": 0.1781, + "loss": 0.1766, "step": 1320 }, { "epoch": 0.18071135430916552, - "grad_norm": 1.535953116668014, + "grad_norm": 1.511458641926071, "learning_rate": 9.21564180081827e-06, - "loss": 0.2169, + "loss": 0.219, "step": 1321 }, { "epoch": 0.1808481532147743, - "grad_norm": 1.5568770912121976, + "grad_norm": 1.5068512382925514, "learning_rate": 9.214485957943504e-06, - "loss": 0.2279, + "loss": 0.2278, "step": 1322 }, { "epoch": 0.18098495212038304, - "grad_norm": 1.342685817061193, + "grad_norm": 1.3046278310646704, "learning_rate": 9.213329336656609e-06, - "loss": 0.2189, + "loss": 0.2187, "step": 1323 }, { "epoch": 0.1811217510259918, - "grad_norm": 1.5590325194140526, + "grad_norm": 1.5538508034446346, "learning_rate": 9.21217193717121e-06, - "loss": 0.2369, + "loss": 0.2381, "step": 1324 }, { "epoch": 0.18125854993160054, - "grad_norm": 1.3503451611443638, + "grad_norm": 1.3284806175774435, "learning_rate": 9.211013759701075e-06, - "loss": 0.2364, + "loss": 0.2357, "step": 1325 }, { "epoch": 0.1813953488372093, - "grad_norm": 1.4068869469638758, + "grad_norm": 1.3564023532854121, "learning_rate": 9.209854804460121e-06, - "loss": 0.1979, + "loss": 0.1963, "step": 1326 }, { "epoch": 0.18153214774281806, - "grad_norm": 1.403953611693445, + "grad_norm": 1.3740786279425106, "learning_rate": 9.208695071662406e-06, - "loss": 0.1969, + "loss": 0.1948, "step": 1327 }, { "epoch": 0.1816689466484268, - "grad_norm": 1.5954749950139722, + "grad_norm": 1.562561287632486, "learning_rate": 9.207534561522131e-06, - "loss": 0.1956, + "loss": 0.1946, "step": 1328 }, { "epoch": 0.18180574555403556, - "grad_norm": 1.5089781163458322, + "grad_norm": 1.5230818663447805, "learning_rate": 9.206373274253642e-06, - "loss": 0.2403, + "loss": 0.2434, "step": 1329 }, { "epoch": 0.18194254445964433, - "grad_norm": 1.5621817892532215, + "grad_norm": 1.5553708723272615, "learning_rate": 9.205211210071425e-06, - "loss": 0.2498, + "loss": 0.2502, "step": 1330 }, { "epoch": 0.18207934336525308, - "grad_norm": 1.3191511430738645, + "grad_norm": 1.2800354556039255, "learning_rate": 9.204048369190116e-06, - "loss": 0.1953, + "loss": 0.1933, "step": 1331 }, { "epoch": 0.18221614227086183, - "grad_norm": 1.3225483944758218, + "grad_norm": 1.2858863815589716, "learning_rate": 9.20288475182449e-06, - "loss": 0.1937, + "loss": 0.1935, "step": 1332 }, { "epoch": 0.18235294117647058, - "grad_norm": 1.5413338025298715, + "grad_norm": 1.5421930019707306, "learning_rate": 9.201720358189464e-06, - "loss": 0.2353, + "loss": 0.235, "step": 1333 }, { "epoch": 0.18248974008207935, - "grad_norm": 1.2496145367606246, + "grad_norm": 1.2291746504741914, "learning_rate": 9.200555188500103e-06, - "loss": 0.1673, + "loss": 0.1675, "step": 1334 }, { "epoch": 0.1826265389876881, - "grad_norm": 1.46484673660515, + "grad_norm": 1.4310079213010756, "learning_rate": 9.199389242971613e-06, - "loss": 0.2027, + "loss": 0.2033, "step": 1335 }, { "epoch": 0.18276333789329685, - "grad_norm": 1.5376152787507018, + "grad_norm": 1.4671994522404406, "learning_rate": 9.19822252181934e-06, - "loss": 0.2637, + "loss": 0.2606, "step": 1336 }, { "epoch": 0.1829001367989056, - "grad_norm": 1.6528878063833476, + "grad_norm": 1.6219927639386647, "learning_rate": 9.197055025258779e-06, - "loss": 0.2808, + "loss": 0.2789, "step": 1337 }, { "epoch": 0.18303693570451438, - "grad_norm": 1.331586113622358, + "grad_norm": 1.2963432081753348, "learning_rate": 9.195886753505564e-06, - "loss": 0.21, + "loss": 0.2092, "step": 1338 }, { "epoch": 0.18317373461012312, - "grad_norm": 1.5867875226564663, + "grad_norm": 1.5454442512198319, "learning_rate": 9.194717706775479e-06, - "loss": 0.2306, + "loss": 0.2319, "step": 1339 }, { "epoch": 0.18331053351573187, - "grad_norm": 1.3830670826509655, + "grad_norm": 1.3585802254056165, "learning_rate": 9.19354788528444e-06, - "loss": 0.2175, + "loss": 0.2151, "step": 1340 }, { "epoch": 0.18344733242134062, - "grad_norm": 1.3851999959016967, + "grad_norm": 1.3628158789301417, "learning_rate": 9.192377289248513e-06, - "loss": 0.1769, + "loss": 0.1779, "step": 1341 }, { "epoch": 0.1835841313269494, - "grad_norm": 1.609974626110721, + "grad_norm": 1.5843058542254551, "learning_rate": 9.19120591888391e-06, - "loss": 0.2484, + "loss": 0.2508, "step": 1342 }, { "epoch": 0.18372093023255814, - "grad_norm": 1.4499099616429931, + "grad_norm": 1.436652512525615, "learning_rate": 9.190033774406977e-06, - "loss": 0.1895, + "loss": 0.1919, "step": 1343 }, { "epoch": 0.1838577291381669, - "grad_norm": 1.3075265592450716, + "grad_norm": 1.2911318858749656, "learning_rate": 9.188860856034212e-06, "loss": 0.197, "step": 1344 }, { "epoch": 0.18399452804377564, - "grad_norm": 1.775741265231514, + "grad_norm": 1.7337376682677594, "learning_rate": 9.18768716398225e-06, - "loss": 0.3004, + "loss": 0.2989, "step": 1345 }, { "epoch": 0.18413132694938442, - "grad_norm": 1.32187552955207, + "grad_norm": 1.2814091714026103, "learning_rate": 9.186512698467872e-06, - "loss": 0.2206, + "loss": 0.2188, "step": 1346 }, { "epoch": 0.18426812585499316, - "grad_norm": 1.3118723825984602, + "grad_norm": 1.2857119327998854, "learning_rate": 9.185337459707999e-06, - "loss": 0.1899, + "loss": 0.1905, "step": 1347 }, { "epoch": 0.1844049247606019, - "grad_norm": 1.575082384248103, + "grad_norm": 1.5094115232327356, "learning_rate": 9.184161447919699e-06, - "loss": 0.2432, + "loss": 0.2412, "step": 1348 }, { "epoch": 0.18454172366621066, - "grad_norm": 1.914917749196519, + "grad_norm": 1.9100630501971048, "learning_rate": 9.182984663320181e-06, - "loss": 0.2886, + "loss": 0.2916, "step": 1349 }, { "epoch": 0.18467852257181944, - "grad_norm": 1.6329707742576587, + "grad_norm": 1.5885181826455217, "learning_rate": 9.181807106126792e-06, - "loss": 0.2204, + "loss": 0.2212, "step": 1350 }, { "epoch": 0.18481532147742818, - "grad_norm": 1.2931680439799116, + "grad_norm": 1.2958571234898189, "learning_rate": 9.180628776557031e-06, - "loss": 0.1838, + "loss": 0.1869, "step": 1351 }, { "epoch": 0.18495212038303693, - "grad_norm": 1.5996192785421317, + "grad_norm": 1.5287644894236827, "learning_rate": 9.179449674828531e-06, - "loss": 0.2649, + "loss": 0.261, "step": 1352 }, { "epoch": 0.18508891928864568, - "grad_norm": 1.590910505434767, + "grad_norm": 1.5735556531073471, "learning_rate": 9.178269801159073e-06, - "loss": 0.2268, + "loss": 0.2276, "step": 1353 }, { "epoch": 0.18522571819425446, - "grad_norm": 1.3719547097944418, + "grad_norm": 1.3288330121008622, "learning_rate": 9.177089155766576e-06, - "loss": 0.1656, + "loss": 0.1665, "step": 1354 }, { "epoch": 0.1853625170998632, - "grad_norm": 1.3156226928809935, + "grad_norm": 1.2917754381625621, "learning_rate": 9.175907738869107e-06, - "loss": 0.2163, + "loss": 0.2161, "step": 1355 }, { "epoch": 0.18549931600547195, - "grad_norm": 1.248086241442087, + "grad_norm": 1.2203311494232938, "learning_rate": 9.174725550684871e-06, - "loss": 0.1758, + "loss": 0.1765, "step": 1356 }, { "epoch": 0.1856361149110807, - "grad_norm": 1.5028849561287239, + "grad_norm": 1.4798573090233, "learning_rate": 9.17354259143222e-06, - "loss": 0.2209, + "loss": 0.2208, "step": 1357 }, { "epoch": 0.18577291381668948, - "grad_norm": 1.5322214659120068, + "grad_norm": 1.5215732665642003, "learning_rate": 9.172358861329643e-06, - "loss": 0.2509, + "loss": 0.2518, "step": 1358 }, { "epoch": 0.18590971272229823, - "grad_norm": 1.2379708233095184, + "grad_norm": 1.2004672976536346, "learning_rate": 9.171174360595773e-06, - "loss": 0.1513, + "loss": 0.1514, "step": 1359 }, { "epoch": 0.18604651162790697, - "grad_norm": 1.3933168168901704, + "grad_norm": 1.3630838041098565, "learning_rate": 9.16998908944939e-06, - "loss": 0.2158, + "loss": 0.2161, "step": 1360 }, { "epoch": 0.18618331053351572, - "grad_norm": 1.1301641728006175, + "grad_norm": 1.1114706496870561, "learning_rate": 9.16880304810941e-06, - "loss": 0.1857, + "loss": 0.1839, "step": 1361 }, { "epoch": 0.1863201094391245, - "grad_norm": 1.6509084568051846, + "grad_norm": 1.7442134709698784, "learning_rate": 9.167616236794895e-06, - "loss": 0.234, + "loss": 0.2427, "step": 1362 }, { "epoch": 0.18645690834473325, - "grad_norm": 1.3277427522017173, + "grad_norm": 1.3315414350865138, "learning_rate": 9.166428655725048e-06, - "loss": 0.1859, + "loss": 0.1847, "step": 1363 }, { "epoch": 0.186593707250342, - "grad_norm": 1.2841307542200524, + "grad_norm": 1.2715133577852198, "learning_rate": 9.165240305119212e-06, - "loss": 0.211, + "loss": 0.2093, "step": 1364 }, { "epoch": 0.18673050615595074, - "grad_norm": 1.3712731757825407, + "grad_norm": 1.3473037605461178, "learning_rate": 9.164051185196878e-06, - "loss": 0.2161, + "loss": 0.2184, "step": 1365 }, { "epoch": 0.18686730506155952, - "grad_norm": 1.3129917459062501, + "grad_norm": 1.2884483844929948, "learning_rate": 9.162861296177672e-06, - "loss": 0.1902, + "loss": 0.1925, "step": 1366 }, { "epoch": 0.18700410396716827, - "grad_norm": 1.8033578861077257, + "grad_norm": 1.767164008005362, "learning_rate": 9.161670638281368e-06, - "loss": 0.2625, + "loss": 0.2634, "step": 1367 }, { "epoch": 0.18714090287277702, - "grad_norm": 1.3602018821704533, + "grad_norm": 1.329523612414503, "learning_rate": 9.160479211727878e-06, - "loss": 0.2314, + "loss": 0.2306, "step": 1368 }, { "epoch": 0.18727770177838576, - "grad_norm": 1.5434248340588643, + "grad_norm": 1.5171521725260506, "learning_rate": 9.15928701673726e-06, - "loss": 0.2223, + "loss": 0.2215, "step": 1369 }, { "epoch": 0.18741450068399454, - "grad_norm": 1.5676506496058273, + "grad_norm": 1.5250658256846994, "learning_rate": 9.158094053529709e-06, - "loss": 0.2633, + "loss": 0.2605, "step": 1370 }, { "epoch": 0.1875512995896033, - "grad_norm": 1.4418766437642825, + "grad_norm": 1.391725570406238, "learning_rate": 9.156900322325564e-06, - "loss": 0.1727, + "loss": 0.1679, "step": 1371 }, { "epoch": 0.18768809849521204, - "grad_norm": 1.5834282244511266, + "grad_norm": 1.5610980473127316, "learning_rate": 9.155705823345308e-06, - "loss": 0.2559, + "loss": 0.2568, "step": 1372 }, { "epoch": 0.18782489740082078, - "grad_norm": 1.0678690031386824, + "grad_norm": 1.0600158146340015, "learning_rate": 9.154510556809564e-06, - "loss": 0.1703, + "loss": 0.1725, "step": 1373 }, { "epoch": 0.18796169630642956, - "grad_norm": 1.265518252487199, + "grad_norm": 1.249577878573474, "learning_rate": 9.153314522939096e-06, - "loss": 0.2153, + "loss": 0.2173, "step": 1374 }, { "epoch": 0.1880984952120383, - "grad_norm": 1.7112025453059845, + "grad_norm": 1.6656393780584542, "learning_rate": 9.152117721954809e-06, - "loss": 0.2318, + "loss": 0.2309, "step": 1375 }, { "epoch": 0.18823529411764706, - "grad_norm": 1.5369699617331163, + "grad_norm": 1.4709742124674807, "learning_rate": 9.150920154077753e-06, - "loss": 0.2066, + "loss": 0.2081, "step": 1376 }, { "epoch": 0.1883720930232558, - "grad_norm": 1.3724054832808479, + "grad_norm": 1.373407454568827, "learning_rate": 9.149721819529119e-06, - "loss": 0.2401, + "loss": 0.2329, "step": 1377 }, { "epoch": 0.18850889192886458, - "grad_norm": 1.3212460320608868, + "grad_norm": 1.2898466945468707, "learning_rate": 9.148522718530237e-06, - "loss": 0.1886, + "loss": 0.1867, "step": 1378 }, { "epoch": 0.18864569083447333, - "grad_norm": 1.3708671665400325, + "grad_norm": 1.3326023252334829, "learning_rate": 9.147322851302578e-06, - "loss": 0.1556, + "loss": 0.1558, "step": 1379 }, { "epoch": 0.18878248974008208, - "grad_norm": 1.0442939157429159, + "grad_norm": 1.0285032861898264, "learning_rate": 9.146122218067759e-06, - "loss": 0.1764, + "loss": 0.1754, "step": 1380 }, { "epoch": 0.18891928864569082, - "grad_norm": 1.1347516339257875, + "grad_norm": 1.1141728099560353, "learning_rate": 9.144920819047535e-06, - "loss": 0.173, + "loss": 0.1724, "step": 1381 }, { "epoch": 0.1890560875512996, - "grad_norm": 1.4246361960901628, + "grad_norm": 1.4090321663650018, "learning_rate": 9.143718654463804e-06, - "loss": 0.2095, + "loss": 0.209, "step": 1382 }, { "epoch": 0.18919288645690835, - "grad_norm": 1.3965301627925892, + "grad_norm": 1.3687005151955602, "learning_rate": 9.142515724538605e-06, - "loss": 0.1955, + "loss": 0.1953, "step": 1383 }, { "epoch": 0.1893296853625171, - "grad_norm": 1.525526878679336, + "grad_norm": 1.5085773549350887, "learning_rate": 9.141312029494119e-06, - "loss": 0.2367, + "loss": 0.2381, "step": 1384 }, { "epoch": 0.18946648426812585, - "grad_norm": 1.6005385217173007, + "grad_norm": 1.600248085522, "learning_rate": 9.140107569552665e-06, - "loss": 0.2595, + "loss": 0.262, "step": 1385 }, { "epoch": 0.18960328317373462, - "grad_norm": 1.493026925200361, + "grad_norm": 1.4812740855399689, "learning_rate": 9.138902344936706e-06, - "loss": 0.2215, + "loss": 0.2244, "step": 1386 }, { "epoch": 0.18974008207934337, - "grad_norm": 1.4088844090041768, + "grad_norm": 1.3869283459342212, "learning_rate": 9.137696355868848e-06, - "loss": 0.1981, + "loss": 0.1956, "step": 1387 }, { "epoch": 0.18987688098495212, - "grad_norm": 1.499392267745771, + "grad_norm": 1.4649463893312162, "learning_rate": 9.136489602571837e-06, - "loss": 0.2466, + "loss": 0.2382, "step": 1388 }, { "epoch": 0.19001367989056087, - "grad_norm": 1.3039057217304102, + "grad_norm": 1.2710642610298406, "learning_rate": 9.135282085268555e-06, - "loss": 0.1859, + "loss": 0.1839, "step": 1389 }, { "epoch": 0.19015047879616964, - "grad_norm": 1.2477044055437985, + "grad_norm": 1.2352883063321605, "learning_rate": 9.134073804182034e-06, - "loss": 0.2079, + "loss": 0.2109, "step": 1390 }, { "epoch": 0.1902872777017784, - "grad_norm": 1.2465735301916028, + "grad_norm": 1.2336838748540593, "learning_rate": 9.132864759535438e-06, - "loss": 0.2039, + "loss": 0.2005, "step": 1391 }, { "epoch": 0.19042407660738714, - "grad_norm": 1.321678538600098, + "grad_norm": 1.2717866597563856, "learning_rate": 9.131654951552082e-06, - "loss": 0.226, + "loss": 0.2239, "step": 1392 }, { "epoch": 0.1905608755129959, - "grad_norm": 1.1860676178159426, + "grad_norm": 1.1872899905176413, "learning_rate": 9.130444380455413e-06, - "loss": 0.1653, + "loss": 0.1636, "step": 1393 }, { "epoch": 0.19069767441860466, - "grad_norm": 1.473866091159487, + "grad_norm": 1.4547062063835927, "learning_rate": 9.129233046469021e-06, - "loss": 0.2542, + "loss": 0.2528, "step": 1394 }, { "epoch": 0.1908344733242134, - "grad_norm": 1.3667487843724835, + "grad_norm": 1.3472466800018834, "learning_rate": 9.128020949816642e-06, - "loss": 0.2217, + "loss": 0.2207, "step": 1395 }, { "epoch": 0.19097127222982216, - "grad_norm": 1.392650824646126, + "grad_norm": 1.3879491692367527, "learning_rate": 9.12680809072215e-06, - "loss": 0.2151, + "loss": 0.2177, "step": 1396 }, { "epoch": 0.1911080711354309, - "grad_norm": 1.6107359262651126, + "grad_norm": 1.5923662548901951, "learning_rate": 9.125594469409555e-06, - "loss": 0.2551, + "loss": 0.2565, "step": 1397 }, { "epoch": 0.19124487004103968, - "grad_norm": 1.5853056156940672, + "grad_norm": 1.525120423662321, "learning_rate": 9.124380086103012e-06, - "loss": 0.1817, + "loss": 0.1819, "step": 1398 }, { "epoch": 0.19138166894664843, - "grad_norm": 1.3031195974211702, + "grad_norm": 1.3799813642324912, "learning_rate": 9.123164941026824e-06, - "loss": 0.2013, + "loss": 0.2039, "step": 1399 }, { "epoch": 0.19151846785225718, - "grad_norm": 1.7852638740967042, + "grad_norm": 1.7462962905081032, "learning_rate": 9.121949034405417e-06, - "loss": 0.2526, + "loss": 0.2516, "step": 1400 }, { "epoch": 0.19151846785225718, - "eval_loss": 0.2155861258506775, - "eval_runtime": 5.921, - "eval_samples_per_second": 5.067, - "eval_steps_per_second": 1.351, + "eval_loss": 0.21611760556697845, + "eval_runtime": 5.9151, + "eval_samples_per_second": 5.072, + "eval_steps_per_second": 1.352, "step": 1400 }, { "epoch": 0.19165526675786593, - "grad_norm": 1.2661580061288815, + "grad_norm": 1.2171275052632509, "learning_rate": 9.120732366463373e-06, - "loss": 0.2004, + "loss": 0.2, "step": 1401 }, { "epoch": 0.1917920656634747, - "grad_norm": 1.7150660269439113, + "grad_norm": 1.7840694686080496, "learning_rate": 9.119514937425411e-06, - "loss": 0.1873, + "loss": 0.1883, "step": 1402 }, { "epoch": 0.19192886456908345, - "grad_norm": 1.239217613284651, + "grad_norm": 1.2080525052040327, "learning_rate": 9.118296747516387e-06, - "loss": 0.1954, + "loss": 0.1944, "step": 1403 }, { "epoch": 0.1920656634746922, - "grad_norm": 1.325323256862925, + "grad_norm": 1.3066138913271328, "learning_rate": 9.1170777969613e-06, - "loss": 0.1821, + "loss": 0.1823, "step": 1404 }, { "epoch": 0.19220246238030095, - "grad_norm": 1.3061725882035953, + "grad_norm": 1.2794092175551883, "learning_rate": 9.11585808598529e-06, - "loss": 0.205, + "loss": 0.2028, "step": 1405 }, { "epoch": 0.19233926128590972, - "grad_norm": 1.3880571571562235, + "grad_norm": 1.36773225373261, "learning_rate": 9.114637614813636e-06, - "loss": 0.2003, + "loss": 0.2014, "step": 1406 }, { "epoch": 0.19247606019151847, - "grad_norm": 1.6130938254360265, + "grad_norm": 1.5606293523522905, "learning_rate": 9.113416383671756e-06, - "loss": 0.2126, + "loss": 0.2101, "step": 1407 }, { "epoch": 0.19261285909712722, - "grad_norm": 1.4372950401441498, + "grad_norm": 1.4006378269302813, "learning_rate": 9.112194392785215e-06, - "loss": 0.1891, + "loss": 0.1889, "step": 1408 }, { "epoch": 0.19274965800273597, - "grad_norm": 1.4132817202392942, + "grad_norm": 1.3843194657513507, "learning_rate": 9.11097164237971e-06, - "loss": 0.2097, + "loss": 0.2103, "step": 1409 }, { "epoch": 0.19288645690834474, - "grad_norm": 1.373844433246324, + "grad_norm": 1.3341370084446194, "learning_rate": 9.109748132681082e-06, - "loss": 0.2064, + "loss": 0.2035, "step": 1410 }, { "epoch": 0.1930232558139535, - "grad_norm": 1.5381526401961285, + "grad_norm": 1.4803260779907679, "learning_rate": 9.108523863915316e-06, - "loss": 0.2672, + "loss": 0.2671, "step": 1411 }, { "epoch": 0.19316005471956224, - "grad_norm": 1.393739567279012, + "grad_norm": 1.345949725041744, "learning_rate": 9.107298836308527e-06, - "loss": 0.2014, + "loss": 0.1998, "step": 1412 }, { "epoch": 0.193296853625171, - "grad_norm": 1.4680100492718091, + "grad_norm": 1.3460403718148115, "learning_rate": 9.106073050086983e-06, - "loss": 0.1888, + "loss": 0.1902, "step": 1413 }, { "epoch": 0.19343365253077976, - "grad_norm": 1.3373447872871562, + "grad_norm": 1.3063368654004635, "learning_rate": 9.104846505477083e-06, - "loss": 0.1929, + "loss": 0.1907, "step": 1414 }, { "epoch": 0.1935704514363885, - "grad_norm": 1.3525936467120823, + "grad_norm": 1.3366831546222975, "learning_rate": 9.103619202705368e-06, - "loss": 0.1962, + "loss": 0.1984, "step": 1415 }, { "epoch": 0.19370725034199726, - "grad_norm": 1.3689710419762267, + "grad_norm": 1.3344759996989113, "learning_rate": 9.10239114199852e-06, - "loss": 0.1861, + "loss": 0.1866, "step": 1416 }, { "epoch": 0.193844049247606, - "grad_norm": 1.395541866543735, + "grad_norm": 1.4084942500148372, "learning_rate": 9.101162323583365e-06, - "loss": 0.2016, + "loss": 0.205, "step": 1417 }, { "epoch": 0.19398084815321479, - "grad_norm": 1.2498124504812071, + "grad_norm": 1.2540605032126348, "learning_rate": 9.09993274768686e-06, - "loss": 0.2159, + "loss": 0.2151, "step": 1418 }, { "epoch": 0.19411764705882353, - "grad_norm": 1.4782175770044386, + "grad_norm": 1.4461279426062694, "learning_rate": 9.098702414536107e-06, - "loss": 0.2267, + "loss": 0.2272, "step": 1419 }, { "epoch": 0.19425444596443228, - "grad_norm": 1.6339744790477186, + "grad_norm": 1.5235464768705285, "learning_rate": 9.09747132435835e-06, - "loss": 0.1934, + "loss": 0.1904, "step": 1420 }, { "epoch": 0.19439124487004103, - "grad_norm": 1.4278965055945234, + "grad_norm": 1.383884159762789, "learning_rate": 9.096239477380965e-06, - "loss": 0.1709, + "loss": 0.1693, "step": 1421 }, { "epoch": 0.1945280437756498, - "grad_norm": 1.024856336372032, + "grad_norm": 1.0152345160758698, "learning_rate": 9.09500687383148e-06, - "loss": 0.1653, + "loss": 0.1672, "step": 1422 }, { "epoch": 0.19466484268125855, - "grad_norm": 1.3048912243456532, + "grad_norm": 1.268483558503627, "learning_rate": 9.093773513937555e-06, - "loss": 0.1955, + "loss": 0.197, "step": 1423 }, { "epoch": 0.1948016415868673, - "grad_norm": 1.5443915059416495, + "grad_norm": 1.528869588422459, "learning_rate": 9.092539397926985e-06, - "loss": 0.1898, + "loss": 0.1908, "step": 1424 }, { "epoch": 0.19493844049247605, - "grad_norm": 1.3939499225762229, + "grad_norm": 1.3390848616133117, "learning_rate": 9.091304526027714e-06, - "loss": 0.1808, + "loss": 0.1774, "step": 1425 }, { "epoch": 0.19507523939808483, - "grad_norm": 1.4363697973267793, + "grad_norm": 1.3985067346394582, "learning_rate": 9.090068898467824e-06, - "loss": 0.1943, + "loss": 0.1945, "step": 1426 }, { "epoch": 0.19521203830369357, - "grad_norm": 1.360560727420577, + "grad_norm": 1.3430242200132478, "learning_rate": 9.08883251547553e-06, - "loss": 0.2436, + "loss": 0.2433, "step": 1427 }, { "epoch": 0.19534883720930232, - "grad_norm": 1.488014992336903, + "grad_norm": 1.4533920181032234, "learning_rate": 9.087595377279192e-06, - "loss": 0.231, + "loss": 0.2306, "step": 1428 }, { "epoch": 0.19548563611491107, - "grad_norm": 1.2715959882738572, + "grad_norm": 1.270019379103238, "learning_rate": 9.086357484107311e-06, - "loss": 0.1952, + "loss": 0.1961, "step": 1429 }, { "epoch": 0.19562243502051985, - "grad_norm": 1.255778449174611, + "grad_norm": 1.2440105065678904, "learning_rate": 9.085118836188523e-06, - "loss": 0.1938, + "loss": 0.1954, "step": 1430 }, { "epoch": 0.1957592339261286, - "grad_norm": 1.555178509083719, + "grad_norm": 1.519636432146527, "learning_rate": 9.083879433751605e-06, - "loss": 0.2132, + "loss": 0.2114, "step": 1431 }, { "epoch": 0.19589603283173734, - "grad_norm": 1.1507129967216612, + "grad_norm": 1.0842121479547475, "learning_rate": 9.082639277025473e-06, - "loss": 0.1685, + "loss": 0.1652, "step": 1432 }, { "epoch": 0.1960328317373461, - "grad_norm": 1.899145005764797, + "grad_norm": 1.8565705187742032, "learning_rate": 9.081398366239186e-06, - "loss": 0.2318, + "loss": 0.233, "step": 1433 }, { "epoch": 0.19616963064295487, - "grad_norm": 1.6728517800082374, + "grad_norm": 1.65569968344286, "learning_rate": 9.080156701621937e-06, - "loss": 0.2224, + "loss": 0.2248, "step": 1434 }, { "epoch": 0.19630642954856362, - "grad_norm": 1.456151714916658, + "grad_norm": 1.4188874129817906, "learning_rate": 9.07891428340306e-06, - "loss": 0.2434, + "loss": 0.2438, "step": 1435 }, { "epoch": 0.19644322845417236, - "grad_norm": 1.530320728285694, + "grad_norm": 1.4926560644750402, "learning_rate": 9.077671111812027e-06, - "loss": 0.2167, + "loss": 0.2173, "step": 1436 }, { "epoch": 0.1965800273597811, - "grad_norm": 1.1385387304480215, + "grad_norm": 1.1054946701618196, "learning_rate": 9.076427187078455e-06, "loss": 0.2316, "step": 1437 }, { "epoch": 0.1967168262653899, - "grad_norm": 1.6444880906237134, + "grad_norm": 1.5970782749439343, "learning_rate": 9.075182509432096e-06, - "loss": 0.2249, + "loss": 0.2217, "step": 1438 }, { "epoch": 0.19685362517099864, - "grad_norm": 1.4494569059884541, + "grad_norm": 1.4434043079265297, "learning_rate": 9.073937079102837e-06, - "loss": 0.2535, + "loss": 0.2565, "step": 1439 }, { "epoch": 0.19699042407660738, - "grad_norm": 1.3204646662450097, + "grad_norm": 1.3005819155664837, "learning_rate": 9.07269089632071e-06, - "loss": 0.2234, + "loss": 0.2247, "step": 1440 }, { "epoch": 0.19712722298221613, - "grad_norm": 1.3633848671596935, + "grad_norm": 1.3369281412717413, "learning_rate": 9.071443961315883e-06, - "loss": 0.1777, + "loss": 0.1749, "step": 1441 }, { "epoch": 0.1972640218878249, - "grad_norm": 1.4555011993746187, + "grad_norm": 1.4361996592514334, "learning_rate": 9.070196274318666e-06, - "loss": 0.2332, + "loss": 0.2348, "step": 1442 }, { "epoch": 0.19740082079343366, - "grad_norm": 1.3704477247639364, + "grad_norm": 1.3625982670476189, "learning_rate": 9.068947835559504e-06, - "loss": 0.2293, + "loss": 0.2284, "step": 1443 }, { "epoch": 0.1975376196990424, - "grad_norm": 1.335985861605916, + "grad_norm": 1.3210909478154715, "learning_rate": 9.067698645268983e-06, - "loss": 0.2115, + "loss": 0.2144, "step": 1444 }, { "epoch": 0.19767441860465115, - "grad_norm": 1.409404002527944, + "grad_norm": 1.378691042800875, "learning_rate": 9.066448703677828e-06, - "loss": 0.2107, + "loss": 0.2105, "step": 1445 }, { "epoch": 0.19781121751025993, - "grad_norm": 1.4358019040474115, + "grad_norm": 1.393719181353408, "learning_rate": 9.0651980110169e-06, - "loss": 0.2084, + "loss": 0.2068, "step": 1446 }, { "epoch": 0.19794801641586868, - "grad_norm": 1.5640745993280605, + "grad_norm": 1.5282465617948022, "learning_rate": 9.063946567517205e-06, - "loss": 0.2561, + "loss": 0.2544, "step": 1447 }, { "epoch": 0.19808481532147743, - "grad_norm": 1.7078146299682764, + "grad_norm": 1.6384298533684893, "learning_rate": 9.06269437340988e-06, - "loss": 0.2882, + "loss": 0.2875, "step": 1448 }, { "epoch": 0.19822161422708617, - "grad_norm": 1.3535407764744316, + "grad_norm": 1.3083601577390935, "learning_rate": 9.061441428926205e-06, - "loss": 0.2011, + "loss": 0.1992, "step": 1449 }, { "epoch": 0.19835841313269495, - "grad_norm": 1.2987285994119178, + "grad_norm": 1.267474085153699, "learning_rate": 9.0601877342976e-06, - "loss": 0.2171, + "loss": 0.2147, "step": 1450 }, { "epoch": 0.1984952120383037, - "grad_norm": 1.4567606433633904, + "grad_norm": 1.4829694565877802, "learning_rate": 9.058933289755618e-06, - "loss": 0.1815, + "loss": 0.1831, "step": 1451 }, { "epoch": 0.19863201094391245, - "grad_norm": 1.2850538196547274, + "grad_norm": 1.2600862911802888, "learning_rate": 9.057678095531957e-06, - "loss": 0.1884, + "loss": 0.1916, "step": 1452 }, { "epoch": 0.1987688098495212, - "grad_norm": 1.247532440463193, + "grad_norm": 1.2172191057009167, "learning_rate": 9.056422151858449e-06, - "loss": 0.1969, + "loss": 0.197, "step": 1453 }, { "epoch": 0.19890560875512997, - "grad_norm": 1.5326962427468662, + "grad_norm": 1.4987609187360522, "learning_rate": 9.055165458967064e-06, - "loss": 0.2696, + "loss": 0.2687, "step": 1454 }, { "epoch": 0.19904240766073872, - "grad_norm": 1.2057118346453055, + "grad_norm": 1.1746546761585177, "learning_rate": 9.053908017089915e-06, - "loss": 0.2115, + "loss": 0.2103, "step": 1455 }, { "epoch": 0.19917920656634747, - "grad_norm": 1.6731373397409797, + "grad_norm": 1.6501538410028223, "learning_rate": 9.052649826459248e-06, - "loss": 0.2568, + "loss": 0.2529, "step": 1456 }, { "epoch": 0.19931600547195621, - "grad_norm": 1.2286558319056216, + "grad_norm": 1.2158748143541136, "learning_rate": 9.051390887307452e-06, - "loss": 0.2072, + "loss": 0.2084, "step": 1457 }, { "epoch": 0.199452804377565, - "grad_norm": 1.3260864356691349, + "grad_norm": 1.307648451247919, "learning_rate": 9.050131199867051e-06, - "loss": 0.179, + "loss": 0.1787, "step": 1458 }, { "epoch": 0.19958960328317374, - "grad_norm": 1.33690428377077, + "grad_norm": 1.31510584584757, "learning_rate": 9.04887076437071e-06, - "loss": 0.1749, + "loss": 0.1755, "step": 1459 }, { "epoch": 0.1997264021887825, - "grad_norm": 1.4724489645837358, + "grad_norm": 1.4250914764041016, "learning_rate": 9.047609581051226e-06, - "loss": 0.2318, + "loss": 0.2314, "step": 1460 }, { "epoch": 0.19986320109439124, - "grad_norm": 1.4189905556576938, + "grad_norm": 1.3871672390936278, "learning_rate": 9.046347650141545e-06, - "loss": 0.2402, + "loss": 0.2384, "step": 1461 }, { "epoch": 0.2, - "grad_norm": 1.1185970345478848, + "grad_norm": 1.0741446804761936, "learning_rate": 9.045084971874738e-06, - "loss": 0.1821, + "loss": 0.1803, "step": 1462 }, { "epoch": 0.20013679890560876, - "grad_norm": 1.297622268122839, + "grad_norm": 1.2791880843807157, "learning_rate": 9.043821546484023e-06, - "loss": 0.1793, + "loss": 0.1781, "step": 1463 }, { "epoch": 0.2002735978112175, - "grad_norm": 1.349682907182022, + "grad_norm": 1.3160236408711816, "learning_rate": 9.042557374202757e-06, - "loss": 0.2181, + "loss": 0.2158, "step": 1464 }, { "epoch": 0.20041039671682626, - "grad_norm": 1.685636409309613, + "grad_norm": 1.6458046091132406, "learning_rate": 9.041292455264428e-06, - "loss": 0.2342, + "loss": 0.2339, "step": 1465 }, { "epoch": 0.20054719562243503, - "grad_norm": 1.5601084117511916, + "grad_norm": 1.5185253690101288, "learning_rate": 9.040026789902666e-06, - "loss": 0.26, + "loss": 0.2561, "step": 1466 }, { "epoch": 0.20068399452804378, - "grad_norm": 1.3251130606135524, + "grad_norm": 1.2333142785089326, "learning_rate": 9.03876037835124e-06, - "loss": 0.2008, + "loss": 0.195, "step": 1467 }, { "epoch": 0.20082079343365253, - "grad_norm": 1.1554907464269164, + "grad_norm": 1.1479247090670726, "learning_rate": 9.037493220844051e-06, - "loss": 0.1762, + "loss": 0.1795, "step": 1468 }, { "epoch": 0.20095759233926128, - "grad_norm": 1.9815121437742418, + "grad_norm": 1.8866802657541384, "learning_rate": 9.036225317615148e-06, - "loss": 0.2436, + "loss": 0.2371, "step": 1469 }, { "epoch": 0.20109439124487005, - "grad_norm": 1.6875380998523997, + "grad_norm": 1.6838029724854653, "learning_rate": 9.034956668898707e-06, - "loss": 0.2541, + "loss": 0.2583, "step": 1470 }, { "epoch": 0.2012311901504788, - "grad_norm": 1.3455936831801596, + "grad_norm": 1.3071882328668571, "learning_rate": 9.033687274929047e-06, - "loss": 0.2237, + "loss": 0.2215, "step": 1471 }, { "epoch": 0.20136798905608755, - "grad_norm": 1.4202887183046633, + "grad_norm": 1.384288727570497, "learning_rate": 9.032417135940626e-06, - "loss": 0.2006, + "loss": 0.1998, "step": 1472 }, { "epoch": 0.2015047879616963, - "grad_norm": 1.5573229899628698, + "grad_norm": 1.51640558234013, "learning_rate": 9.031146252168037e-06, - "loss": 0.2439, + "loss": 0.2464, "step": 1473 }, { "epoch": 0.20164158686730507, - "grad_norm": 1.1854844933352167, + "grad_norm": 1.1661882565412818, "learning_rate": 9.02987462384601e-06, - "loss": 0.1618, + "loss": 0.1594, "step": 1474 }, { "epoch": 0.20177838577291382, - "grad_norm": 1.258922200827958, + "grad_norm": 1.228242610756491, "learning_rate": 9.028602251209415e-06, - "loss": 0.1955, + "loss": 0.1935, "step": 1475 }, { "epoch": 0.20191518467852257, - "grad_norm": 1.6652250560802129, + "grad_norm": 1.661370257950201, "learning_rate": 9.027329134493259e-06, - "loss": 0.2619, + "loss": 0.2621, "step": 1476 }, { "epoch": 0.20205198358413132, - "grad_norm": 1.1573429264253885, + "grad_norm": 1.1383179971022317, "learning_rate": 9.026055273932683e-06, - "loss": 0.2291, + "loss": 0.2303, "step": 1477 }, { "epoch": 0.2021887824897401, - "grad_norm": 1.4007516473146528, + "grad_norm": 1.4219875178332089, "learning_rate": 9.024780669762969e-06, - "loss": 0.2187, + "loss": 0.2216, "step": 1478 }, { "epoch": 0.20232558139534884, - "grad_norm": 1.3798709044115627, + "grad_norm": 1.4336350697545754, "learning_rate": 9.023505322219537e-06, - "loss": 0.2316, + "loss": 0.2388, "step": 1479 }, { "epoch": 0.2024623803009576, - "grad_norm": 1.5114726238900886, + "grad_norm": 1.4996216104369822, "learning_rate": 9.02222923153794e-06, - "loss": 0.2091, + "loss": 0.2122, "step": 1480 }, { "epoch": 0.20259917920656634, - "grad_norm": 1.3388607507649277, + "grad_norm": 1.2957989392800804, "learning_rate": 9.020952397953874e-06, - "loss": 0.2378, + "loss": 0.237, "step": 1481 }, { "epoch": 0.2027359781121751, - "grad_norm": 1.4462844022293107, + "grad_norm": 1.4233987184057366, "learning_rate": 9.019674821703166e-06, - "loss": 0.2583, + "loss": 0.2565, "step": 1482 }, { "epoch": 0.20287277701778386, - "grad_norm": 1.3380894148438685, + "grad_norm": 1.291226966831767, "learning_rate": 9.018396503021785e-06, - "loss": 0.2208, + "loss": 0.2186, "step": 1483 }, { "epoch": 0.2030095759233926, - "grad_norm": 1.6017607234586153, + "grad_norm": 1.5617735183033798, "learning_rate": 9.017117442145835e-06, - "loss": 0.2228, + "loss": 0.2237, "step": 1484 }, { "epoch": 0.20314637482900136, - "grad_norm": 1.3715302459676413, + "grad_norm": 1.339248193018829, "learning_rate": 9.015837639311558e-06, - "loss": 0.201, + "loss": 0.2016, "step": 1485 }, { "epoch": 0.20328317373461013, - "grad_norm": 1.3812765713046113, + "grad_norm": 1.325055281746842, "learning_rate": 9.014557094755332e-06, - "loss": 0.2096, + "loss": 0.2078, "step": 1486 }, { "epoch": 0.20341997264021888, - "grad_norm": 1.297116501887473, + "grad_norm": 1.2551028537339775, "learning_rate": 9.013275808713674e-06, - "loss": 0.1807, + "loss": 0.1793, "step": 1487 }, { "epoch": 0.20355677154582763, - "grad_norm": 1.3836273110199229, + "grad_norm": 1.4382672210552119, "learning_rate": 9.011993781423234e-06, - "loss": 0.2129, + "loss": 0.211, "step": 1488 }, { "epoch": 0.20369357045143638, - "grad_norm": 1.3227083064450866, + "grad_norm": 1.28529152477462, "learning_rate": 9.010711013120801e-06, - "loss": 0.1777, + "loss": 0.1788, "step": 1489 }, { "epoch": 0.20383036935704515, - "grad_norm": 1.5010269606216773, + "grad_norm": 1.5413661990310161, "learning_rate": 9.009427504043306e-06, - "loss": 0.2008, + "loss": 0.2007, "step": 1490 }, { "epoch": 0.2039671682626539, - "grad_norm": 1.3247805815434366, + "grad_norm": 1.2888716645600822, "learning_rate": 9.008143254427806e-06, - "loss": 0.1917, + "loss": 0.1904, "step": 1491 }, { "epoch": 0.20410396716826265, - "grad_norm": 1.4701968198839013, + "grad_norm": 1.4174037281819813, "learning_rate": 9.006858264511507e-06, "loss": 0.2189, "step": 1492 }, { "epoch": 0.2042407660738714, - "grad_norm": 1.3481835889929161, + "grad_norm": 1.3149176418536481, "learning_rate": 9.00557253453174e-06, "loss": 0.2115, "step": 1493 }, { "epoch": 0.20437756497948018, - "grad_norm": 1.3139327779718795, + "grad_norm": 1.2896561014305754, "learning_rate": 9.004286064725982e-06, - "loss": 0.197, + "loss": 0.196, "step": 1494 }, { "epoch": 0.20451436388508892, - "grad_norm": 1.528968394761537, + "grad_norm": 1.4637798181156132, "learning_rate": 9.002998855331842e-06, - "loss": 0.2275, + "loss": 0.2262, "step": 1495 }, { "epoch": 0.20465116279069767, - "grad_norm": 1.3599986902580028, + "grad_norm": 1.3367447507355448, "learning_rate": 9.001710906587064e-06, - "loss": 0.1839, + "loss": 0.1808, "step": 1496 }, { "epoch": 0.20478796169630642, - "grad_norm": 1.2167867801655956, + "grad_norm": 1.122625110821329, "learning_rate": 9.000422218729534e-06, - "loss": 0.177, + "loss": 0.1763, "step": 1497 }, { "epoch": 0.2049247606019152, - "grad_norm": 1.4997461638715521, + "grad_norm": 1.5084953512490538, "learning_rate": 8.999132791997272e-06, - "loss": 0.1927, + "loss": 0.1943, "step": 1498 }, { "epoch": 0.20506155950752394, - "grad_norm": 1.4137451559445122, + "grad_norm": 1.4180288879517917, "learning_rate": 8.997842626628432e-06, - "loss": 0.1957, + "loss": 0.1949, "step": 1499 }, { "epoch": 0.2051983584131327, - "grad_norm": 1.6259350590876838, + "grad_norm": 1.5754392559387005, "learning_rate": 8.996551722861308e-06, - "loss": 0.2603, + "loss": 0.2591, "step": 1500 }, { "epoch": 0.2051983584131327, - "eval_loss": 0.21266235411167145, - "eval_runtime": 5.9241, - "eval_samples_per_second": 5.064, - "eval_steps_per_second": 1.35, + "eval_loss": 0.21220174431800842, + "eval_runtime": 5.9078, + "eval_samples_per_second": 5.078, + "eval_steps_per_second": 1.354, "step": 1500 }, { "epoch": 0.20533515731874144, - "grad_norm": 1.193743939993514, + "grad_norm": 1.1522256693986042, "learning_rate": 8.995260080934327e-06, - "loss": 0.1612, + "loss": 0.1581, "step": 1501 }, { "epoch": 0.20547195622435022, - "grad_norm": 1.5010423289994035, + "grad_norm": 1.470236169561487, "learning_rate": 8.993967701086057e-06, - "loss": 0.253, + "loss": 0.2514, "step": 1502 }, { "epoch": 0.20560875512995896, - "grad_norm": 1.3237121893899793, + "grad_norm": 1.2044485635051616, "learning_rate": 8.992674583555197e-06, - "loss": 0.1868, + "loss": 0.1832, "step": 1503 }, { "epoch": 0.2057455540355677, - "grad_norm": 1.529770570985996, + "grad_norm": 1.4958650515612657, "learning_rate": 8.991380728580587e-06, "loss": 0.2412, "step": 1504 }, { "epoch": 0.20588235294117646, - "grad_norm": 1.7092909194469548, + "grad_norm": 1.4039722578467524, "learning_rate": 8.990086136401199e-06, - "loss": 0.2414, + "loss": 0.2386, "step": 1505 }, { "epoch": 0.20601915184678524, - "grad_norm": 1.298144533412473, + "grad_norm": 1.253654576109873, "learning_rate": 8.988790807256143e-06, - "loss": 0.2033, + "loss": 0.2012, "step": 1506 }, { "epoch": 0.20615595075239398, - "grad_norm": 1.5310317250014083, + "grad_norm": 1.4990257206027846, "learning_rate": 8.987494741384668e-06, - "loss": 0.2031, + "loss": 0.2034, "step": 1507 }, { "epoch": 0.20629274965800273, - "grad_norm": 1.3628017357883142, + "grad_norm": 1.2982053624886005, "learning_rate": 8.986197939026152e-06, - "loss": 0.2075, + "loss": 0.2011, "step": 1508 }, { "epoch": 0.20642954856361148, - "grad_norm": 1.8134566856240109, + "grad_norm": 1.8001363528264533, "learning_rate": 8.984900400420117e-06, - "loss": 0.2467, + "loss": 0.2488, "step": 1509 }, { "epoch": 0.20656634746922026, - "grad_norm": 1.577615871400182, + "grad_norm": 1.5240451092849463, "learning_rate": 8.983602125806216e-06, - "loss": 0.2239, + "loss": 0.223, "step": 1510 }, { "epoch": 0.206703146374829, - "grad_norm": 1.5140576605445415, + "grad_norm": 1.4996091367730566, "learning_rate": 8.98230311542424e-06, - "loss": 0.2073, + "loss": 0.2107, "step": 1511 }, { "epoch": 0.20683994528043775, - "grad_norm": 1.6458829604628045, + "grad_norm": 1.6247036810865056, "learning_rate": 8.981003369514114e-06, - "loss": 0.2007, + "loss": 0.2003, "step": 1512 }, { "epoch": 0.2069767441860465, - "grad_norm": 1.314136259869521, + "grad_norm": 1.3143567609204443, "learning_rate": 8.9797028883159e-06, - "loss": 0.1706, + "loss": 0.171, "step": 1513 }, { "epoch": 0.20711354309165528, - "grad_norm": 1.5242382270229144, + "grad_norm": 1.520772488505142, "learning_rate": 8.978401672069796e-06, - "loss": 0.2236, + "loss": 0.2237, "step": 1514 }, { "epoch": 0.20725034199726403, - "grad_norm": 1.4139574542026059, + "grad_norm": 1.4267472202715292, "learning_rate": 8.977099721016139e-06, - "loss": 0.2146, + "loss": 0.2168, "step": 1515 }, { "epoch": 0.20738714090287277, - "grad_norm": 1.4088968905136339, + "grad_norm": 1.3800480834518662, "learning_rate": 8.97579703539539e-06, - "loss": 0.2125, + "loss": 0.2141, "step": 1516 }, { "epoch": 0.20752393980848152, - "grad_norm": 1.509842942412011, + "grad_norm": 3.6082656635368013, "learning_rate": 8.974493615448164e-06, - "loss": 0.2022, + "loss": 0.2113, "step": 1517 }, { "epoch": 0.2076607387140903, - "grad_norm": 1.2694775571420929, + "grad_norm": 1.260319864246519, "learning_rate": 8.973189461415193e-06, - "loss": 0.1617, + "loss": 0.1616, "step": 1518 }, { "epoch": 0.20779753761969905, - "grad_norm": 1.350346679651421, + "grad_norm": 1.3367173361902296, "learning_rate": 8.97188457353736e-06, - "loss": 0.219, + "loss": 0.2127, "step": 1519 }, { "epoch": 0.2079343365253078, - "grad_norm": 1.2320924394430885, + "grad_norm": 1.2287100250161362, "learning_rate": 8.970578952055673e-06, - "loss": 0.1933, + "loss": 0.1937, "step": 1520 }, { "epoch": 0.20807113543091654, - "grad_norm": 1.419645640548191, + "grad_norm": 1.189823996313407, "learning_rate": 8.96927259721128e-06, - "loss": 0.2056, + "loss": 0.2091, "step": 1521 }, { "epoch": 0.20820793433652532, - "grad_norm": 1.319012534466381, + "grad_norm": 1.3755382305587087, "learning_rate": 8.967965509245461e-06, - "loss": 0.1937, + "loss": 0.1968, "step": 1522 }, { "epoch": 0.20834473324213407, - "grad_norm": 1.4657338856471223, + "grad_norm": 1.3775144083301543, "learning_rate": 8.96665768839964e-06, - "loss": 0.1708, + "loss": 0.1685, "step": 1523 }, { "epoch": 0.20848153214774282, - "grad_norm": 1.5933190161406612, + "grad_norm": 1.569573246124923, "learning_rate": 8.965349134915367e-06, - "loss": 0.2222, + "loss": 0.2239, "step": 1524 }, { "epoch": 0.20861833105335156, - "grad_norm": 1.343837203651613, + "grad_norm": 1.3027291551665434, "learning_rate": 8.96403984903433e-06, - "loss": 0.1904, + "loss": 0.1897, "step": 1525 }, { "epoch": 0.20875512995896034, - "grad_norm": 1.6028849923381308, + "grad_norm": 1.6022332286504852, "learning_rate": 8.962729830998353e-06, - "loss": 0.2488, + "loss": 0.2492, "step": 1526 }, { "epoch": 0.2088919288645691, - "grad_norm": 1.4960332214882837, + "grad_norm": 1.445802871536364, "learning_rate": 8.961419081049397e-06, - "loss": 0.192, + "loss": 0.1974, "step": 1527 }, { "epoch": 0.20902872777017784, - "grad_norm": 1.5350969088820654, + "grad_norm": 1.501455939312962, "learning_rate": 8.960107599429557e-06, - "loss": 0.2108, + "loss": 0.2101, "step": 1528 }, { "epoch": 0.20916552667578658, - "grad_norm": 1.278146405226061, + "grad_norm": 1.255856499805468, "learning_rate": 8.95879538638106e-06, - "loss": 0.2013, + "loss": 0.1996, "step": 1529 }, { "epoch": 0.20930232558139536, - "grad_norm": 1.8587291722694184, + "grad_norm": 1.7932200959787719, "learning_rate": 8.957482442146271e-06, - "loss": 0.2686, + "loss": 0.2694, "step": 1530 }, { "epoch": 0.2094391244870041, - "grad_norm": 1.4282016459018156, + "grad_norm": 1.3727796714585692, "learning_rate": 8.956168766967691e-06, - "loss": 0.2233, + "loss": 0.222, "step": 1531 }, { "epoch": 0.20957592339261286, - "grad_norm": 1.5673060082984684, + "grad_norm": 1.5641715244047216, "learning_rate": 8.954854361087957e-06, - "loss": 0.2421, + "loss": 0.2431, "step": 1532 }, { "epoch": 0.2097127222982216, - "grad_norm": 1.2914602931828725, + "grad_norm": 1.28234250735355, "learning_rate": 8.953539224749835e-06, - "loss": 0.2182, + "loss": 0.2228, "step": 1533 }, { "epoch": 0.20984952120383038, - "grad_norm": 1.354377246453534, + "grad_norm": 1.3335560900894758, "learning_rate": 8.952223358196229e-06, - "loss": 0.2239, + "loss": 0.2243, "step": 1534 }, { "epoch": 0.20998632010943913, - "grad_norm": 1.3358659710477234, + "grad_norm": 1.302924630608084, "learning_rate": 8.95090676167018e-06, - "loss": 0.2267, + "loss": 0.2241, "step": 1535 }, { "epoch": 0.21012311901504788, - "grad_norm": 1.5244043394504845, + "grad_norm": 1.4729220354369172, "learning_rate": 8.949589435414865e-06, - "loss": 0.1982, + "loss": 0.1981, "step": 1536 }, { "epoch": 0.21025991792065662, - "grad_norm": 1.2465296002741404, + "grad_norm": 1.22353287191692, "learning_rate": 8.948271379673589e-06, - "loss": 0.2024, + "loss": 0.2003, "step": 1537 }, { "epoch": 0.2103967168262654, - "grad_norm": 1.5229383955532185, + "grad_norm": 1.4604748536099261, "learning_rate": 8.946952594689797e-06, - "loss": 0.1973, + "loss": 0.1911, "step": 1538 }, { "epoch": 0.21053351573187415, - "grad_norm": 1.6722904862639127, + "grad_norm": 1.6641299259900528, "learning_rate": 8.94563308070707e-06, - "loss": 0.2465, + "loss": 0.2485, "step": 1539 }, { "epoch": 0.2106703146374829, - "grad_norm": 1.3600574994487713, + "grad_norm": 1.3450307946604239, "learning_rate": 8.944312837969118e-06, - "loss": 0.1927, + "loss": 0.1936, "step": 1540 }, { "epoch": 0.21080711354309165, - "grad_norm": 1.673559791027053, + "grad_norm": 1.567766870831462, "learning_rate": 8.94299186671979e-06, - "loss": 0.2285, + "loss": 0.2238, "step": 1541 }, { "epoch": 0.21094391244870042, - "grad_norm": 1.6977243344657411, + "grad_norm": 1.672396615242598, "learning_rate": 8.941670167203068e-06, - "loss": 0.2306, + "loss": 0.2283, "step": 1542 }, { "epoch": 0.21108071135430917, - "grad_norm": 1.624165551977334, + "grad_norm": 1.6043461269536972, "learning_rate": 8.94034773966307e-06, - "loss": 0.2162, + "loss": 0.2133, "step": 1543 }, { "epoch": 0.21121751025991792, - "grad_norm": 1.1999912294771682, + "grad_norm": 1.2258291974155562, "learning_rate": 8.939024584344043e-06, - "loss": 0.2072, + "loss": 0.2055, "step": 1544 }, { "epoch": 0.21135430916552667, - "grad_norm": 1.6840092696390614, + "grad_norm": 19.411027689739928, "learning_rate": 8.937700701490379e-06, - "loss": 0.225, + "loss": 0.2465, "step": 1545 }, { "epoch": 0.21149110807113544, - "grad_norm": 1.4342466278396542, + "grad_norm": 1.4130000308518345, "learning_rate": 8.936376091346595e-06, - "loss": 0.2296, + "loss": 0.2299, "step": 1546 }, { "epoch": 0.2116279069767442, - "grad_norm": 1.3865016499689808, + "grad_norm": 1.374867982921766, "learning_rate": 8.935050754157343e-06, - "loss": 0.1786, + "loss": 0.1778, "step": 1547 }, { "epoch": 0.21176470588235294, - "grad_norm": 1.538704417636126, + "grad_norm": 1.5444173402650345, "learning_rate": 8.933724690167417e-06, - "loss": 0.2402, + "loss": 0.2413, "step": 1548 }, { "epoch": 0.2119015047879617, - "grad_norm": 1.231676849847987, + "grad_norm": 1.1883557842081909, "learning_rate": 8.932397899621736e-06, "loss": 0.2066, "step": 1549 }, { "epoch": 0.21203830369357046, - "grad_norm": 1.1679550459949661, + "grad_norm": 1.1413639293895215, "learning_rate": 8.931070382765359e-06, - "loss": 0.1711, + "loss": 0.1686, "step": 1550 }, { "epoch": 0.2121751025991792, - "grad_norm": 1.504890455858763, + "grad_norm": 1.4747439313605142, "learning_rate": 8.929742139843476e-06, - "loss": 0.2302, + "loss": 0.2331, "step": 1551 }, { "epoch": 0.21231190150478796, - "grad_norm": 1.3067326250161257, + "grad_norm": 1.2694047861439661, "learning_rate": 8.928413171101414e-06, - "loss": 0.1902, + "loss": 0.1897, "step": 1552 }, { "epoch": 0.2124487004103967, - "grad_norm": 1.4035613267825744, + "grad_norm": 1.3444155553442165, "learning_rate": 8.92708347678463e-06, - "loss": 0.2187, + "loss": 0.2141, "step": 1553 }, { "epoch": 0.21258549931600548, - "grad_norm": 1.2384815170979095, + "grad_norm": 1.2154487022298617, "learning_rate": 8.92575305713872e-06, - "loss": 0.1688, + "loss": 0.1703, "step": 1554 }, { "epoch": 0.21272229822161423, - "grad_norm": 1.6460368057908648, + "grad_norm": 2.011789351563379, "learning_rate": 8.92442191240941e-06, - "loss": 0.2393, + "loss": 0.2411, "step": 1555 }, { "epoch": 0.21285909712722298, - "grad_norm": 1.173315458909631, + "grad_norm": 1.160843329888176, "learning_rate": 8.923090042842561e-06, - "loss": 0.2088, + "loss": 0.2089, "step": 1556 }, { "epoch": 0.21299589603283173, - "grad_norm": 1.7854686395403085, + "grad_norm": 1.7918873899014056, "learning_rate": 8.92175744868417e-06, - "loss": 0.2367, + "loss": 0.2378, "step": 1557 }, { "epoch": 0.2131326949384405, - "grad_norm": 1.4010794074759414, + "grad_norm": 1.3432549633753514, "learning_rate": 8.920424130180364e-06, - "loss": 0.2032, + "loss": 0.2028, "step": 1558 }, { "epoch": 0.21326949384404925, - "grad_norm": 1.2132765559780396, + "grad_norm": 1.1487913422891947, "learning_rate": 8.919090087577406e-06, - "loss": 0.1599, + "loss": 0.1595, "step": 1559 }, { "epoch": 0.213406292749658, - "grad_norm": 1.5348329071268103, + "grad_norm": 1.4635219147351064, "learning_rate": 8.917755321121695e-06, - "loss": 0.2133, + "loss": 0.2131, "step": 1560 }, { "epoch": 0.21354309165526675, - "grad_norm": 1.268683412727991, + "grad_norm": 1.2567984994102326, "learning_rate": 8.91641983105976e-06, - "loss": 0.2118, + "loss": 0.2153, "step": 1561 }, { "epoch": 0.21367989056087552, - "grad_norm": 1.690134216439318, + "grad_norm": 1.60260101504497, "learning_rate": 8.915083617638262e-06, - "loss": 0.3196, + "loss": 0.3168, "step": 1562 }, { "epoch": 0.21381668946648427, - "grad_norm": 1.5364529445302166, + "grad_norm": 1.479208520714773, "learning_rate": 8.913746681104004e-06, - "loss": 0.2148, + "loss": 0.2119, "step": 1563 }, { "epoch": 0.21395348837209302, - "grad_norm": 1.3707558590927178, + "grad_norm": 1.339806652749052, "learning_rate": 8.912409021703914e-06, - "loss": 0.1999, + "loss": 0.1971, "step": 1564 }, { "epoch": 0.21409028727770177, - "grad_norm": 1.4330221808141936, + "grad_norm": 1.4280430091221368, "learning_rate": 8.911070639685055e-06, - "loss": 0.2183, + "loss": 0.2177, "step": 1565 }, { "epoch": 0.21422708618331054, - "grad_norm": 1.6533611141017752, + "grad_norm": 1.6208035789108197, "learning_rate": 8.909731535294628e-06, - "loss": 0.2411, + "loss": 0.2426, "step": 1566 }, { "epoch": 0.2143638850889193, - "grad_norm": 1.310806437108173, + "grad_norm": 1.288072260813132, "learning_rate": 8.908391708779963e-06, - "loss": 0.1873, + "loss": 0.1902, "step": 1567 }, { "epoch": 0.21450068399452804, - "grad_norm": 1.2994755168051813, + "grad_norm": 1.3721304017608127, "learning_rate": 8.907051160388525e-06, - "loss": 0.1993, + "loss": 0.2043, "step": 1568 }, { "epoch": 0.2146374829001368, - "grad_norm": 1.5213324870295732, + "grad_norm": 1.4049345216736653, "learning_rate": 8.905709890367913e-06, - "loss": 0.2125, + "loss": 0.21, "step": 1569 }, { "epoch": 0.21477428180574556, - "grad_norm": 1.2898164316804284, + "grad_norm": 1.316265079963348, "learning_rate": 8.904367898965857e-06, - "loss": 0.213, + "loss": 0.2177, "step": 1570 }, { "epoch": 0.2149110807113543, - "grad_norm": 1.6076978466383705, + "grad_norm": 1.5955917957140653, "learning_rate": 8.903025186430225e-06, - "loss": 0.2459, + "loss": 0.2466, "step": 1571 }, { "epoch": 0.21504787961696306, - "grad_norm": 1.1778354419534895, + "grad_norm": 1.1530577465286145, "learning_rate": 8.90168175300901e-06, - "loss": 0.1737, + "loss": 0.1742, "step": 1572 }, { "epoch": 0.2151846785225718, - "grad_norm": 1.441593389410067, + "grad_norm": 1.3982792173928085, "learning_rate": 8.900337598950348e-06, - "loss": 0.2281, + "loss": 0.2274, "step": 1573 }, { "epoch": 0.21532147742818059, - "grad_norm": 1.0286627964455937, + "grad_norm": 0.9885199829914427, "learning_rate": 8.898992724502498e-06, - "loss": 0.1753, + "loss": 0.175, "step": 1574 }, { "epoch": 0.21545827633378933, - "grad_norm": 1.2943089616936743, + "grad_norm": 1.264263596230031, "learning_rate": 8.897647129913862e-06, - "loss": 0.2068, + "loss": 0.2039, "step": 1575 }, { "epoch": 0.21559507523939808, - "grad_norm": 1.1690215797962915, + "grad_norm": 1.1834341376287758, "learning_rate": 8.896300815432968e-06, - "loss": 0.1852, + "loss": 0.1882, "step": 1576 }, { "epoch": 0.21573187414500683, - "grad_norm": 1.256989691532794, + "grad_norm": 1.248938496492723, "learning_rate": 8.89495378130848e-06, - "loss": 0.1609, + "loss": 0.1641, "step": 1577 }, { "epoch": 0.2158686730506156, - "grad_norm": 1.6467832890038776, + "grad_norm": 1.5692328966818296, "learning_rate": 8.893606027789191e-06, - "loss": 0.2323, + "loss": 0.2263, "step": 1578 }, { "epoch": 0.21600547195622435, - "grad_norm": 1.494006867852853, + "grad_norm": 1.4405254011220119, "learning_rate": 8.892257555124035e-06, - "loss": 0.2079, + "loss": 0.2058, "step": 1579 }, { "epoch": 0.2161422708618331, - "grad_norm": 1.312008559210357, + "grad_norm": 1.3098154763936727, "learning_rate": 8.890908363562069e-06, - "loss": 0.1982, + "loss": 0.2007, "step": 1580 }, { "epoch": 0.21627906976744185, - "grad_norm": 1.4467254157958165, + "grad_norm": 1.419583350090061, "learning_rate": 8.889558453352493e-06, - "loss": 0.2179, + "loss": 0.2188, "step": 1581 }, { "epoch": 0.21641586867305063, - "grad_norm": 1.2593652061973213, + "grad_norm": 1.2645361531734658, "learning_rate": 8.888207824744629e-06, - "loss": 0.1989, + "loss": 0.1998, "step": 1582 }, { "epoch": 0.21655266757865937, - "grad_norm": 1.1743973524739753, + "grad_norm": 1.1537458928455155, "learning_rate": 8.88685647798794e-06, - "loss": 0.1823, + "loss": 0.1814, "step": 1583 }, { "epoch": 0.21668946648426812, - "grad_norm": 1.3172570016960736, + "grad_norm": 1.293245334535528, "learning_rate": 8.885504413332018e-06, - "loss": 0.2058, + "loss": 0.2063, "step": 1584 }, { "epoch": 0.21682626538987687, - "grad_norm": 1.526863839552232, + "grad_norm": 1.5642657668482993, "learning_rate": 8.884151631026587e-06, - "loss": 0.2119, + "loss": 0.2162, "step": 1585 }, { "epoch": 0.21696306429548565, - "grad_norm": 1.4099917720089756, + "grad_norm": 1.3740481727719351, "learning_rate": 8.882798131321507e-06, - "loss": 0.2126, + "loss": 0.2104, "step": 1586 }, { "epoch": 0.2170998632010944, - "grad_norm": 1.4680073322108336, + "grad_norm": 1.4522525652359688, "learning_rate": 8.881443914466767e-06, - "loss": 0.2163, + "loss": 0.2155, "step": 1587 }, { "epoch": 0.21723666210670314, - "grad_norm": 1.3201855855999491, + "grad_norm": 1.3024722876657873, "learning_rate": 8.88008898071249e-06, - "loss": 0.1934, + "loss": 0.1936, "step": 1588 }, { "epoch": 0.2173734610123119, - "grad_norm": 1.4539026533823831, + "grad_norm": 1.3916813397782744, "learning_rate": 8.878733330308931e-06, - "loss": 0.2109, + "loss": 0.211, "step": 1589 }, { "epoch": 0.21751025991792067, - "grad_norm": 1.258991727931609, + "grad_norm": 1.2535184143828326, "learning_rate": 8.877376963506478e-06, - "loss": 0.2021, + "loss": 0.2025, "step": 1590 }, { "epoch": 0.21764705882352942, - "grad_norm": 1.4544658533623127, + "grad_norm": 1.4213649397543668, "learning_rate": 8.87601988055565e-06, - "loss": 0.201, + "loss": 0.2002, "step": 1591 }, { "epoch": 0.21778385772913816, - "grad_norm": 1.2747090795573939, + "grad_norm": 1.2540786770438133, "learning_rate": 8.8746620817071e-06, - "loss": 0.1749, + "loss": 0.1734, "step": 1592 }, { "epoch": 0.2179206566347469, - "grad_norm": 1.3365206564485292, + "grad_norm": 1.3467068044372215, "learning_rate": 8.87330356721161e-06, - "loss": 0.2032, + "loss": 0.2072, "step": 1593 }, { "epoch": 0.2180574555403557, - "grad_norm": 1.5590206274603609, + "grad_norm": 1.526420843949106, "learning_rate": 8.871944337320101e-06, - "loss": 0.2333, + "loss": 0.2316, "step": 1594 }, { "epoch": 0.21819425444596444, - "grad_norm": 1.2806554120881928, + "grad_norm": 1.267278274514648, "learning_rate": 8.870584392283618e-06, - "loss": 0.1824, + "loss": 0.184, "step": 1595 }, { "epoch": 0.21833105335157318, - "grad_norm": 1.476282468746172, + "grad_norm": 1.4558058885552365, "learning_rate": 8.869223732353346e-06, - "loss": 0.2563, + "loss": 0.256, "step": 1596 }, { "epoch": 0.21846785225718193, - "grad_norm": 1.1679231421063598, + "grad_norm": 1.1564414915948704, "learning_rate": 8.86786235778059e-06, - "loss": 0.185, + "loss": 0.1849, "step": 1597 }, { "epoch": 0.2186046511627907, - "grad_norm": 1.6507750049093464, + "grad_norm": 1.6282481712892982, "learning_rate": 8.866500268816803e-06, - "loss": 0.2578, + "loss": 0.2571, "step": 1598 }, { "epoch": 0.21874145006839946, - "grad_norm": 1.26607837935887, + "grad_norm": 1.24476381196565, "learning_rate": 8.865137465713555e-06, - "loss": 0.2098, + "loss": 0.2107, "step": 1599 }, { "epoch": 0.2188782489740082, - "grad_norm": 1.2553686855318489, + "grad_norm": 1.2133266708450516, "learning_rate": 8.863773948722559e-06, - "loss": 0.2054, + "loss": 0.2037, "step": 1600 }, { "epoch": 0.2188782489740082, - "eval_loss": 0.2095678746700287, - "eval_runtime": 5.9273, + "eval_loss": 0.20971006155014038, + "eval_runtime": 5.9274, "eval_samples_per_second": 5.061, "eval_steps_per_second": 1.35, "step": 1600 }, { "epoch": 0.21901504787961695, - "grad_norm": 1.2883472163955845, + "grad_norm": 1.2862772929248731, "learning_rate": 8.862409718095655e-06, - "loss": 0.1761, + "loss": 0.1795, "step": 1601 }, { "epoch": 0.21915184678522573, - "grad_norm": 1.3473931873308342, + "grad_norm": 1.315690580389825, "learning_rate": 8.861044774084815e-06, - "loss": 0.214, + "loss": 0.2133, "step": 1602 }, { "epoch": 0.21928864569083448, - "grad_norm": 1.234712849630236, + "grad_norm": 1.2130406072201954, "learning_rate": 8.859679116942141e-06, - "loss": 0.229, + "loss": 0.2318, "step": 1603 }, { "epoch": 0.21942544459644323, - "grad_norm": 1.1418992866281295, + "grad_norm": 1.1185252107536967, "learning_rate": 8.85831274691987e-06, - "loss": 0.1876, + "loss": 0.1853, "step": 1604 }, { "epoch": 0.21956224350205197, - "grad_norm": 0.9286458415523685, + "grad_norm": 0.9134638941593666, "learning_rate": 8.856945664270372e-06, - "loss": 0.1637, + "loss": 0.1624, "step": 1605 }, { "epoch": 0.21969904240766075, - "grad_norm": 1.351526697132518, + "grad_norm": 1.3280528311341864, "learning_rate": 8.855577869246143e-06, - "loss": 0.2189, + "loss": 0.2149, "step": 1606 }, { "epoch": 0.2198358413132695, - "grad_norm": 1.200542925646252, + "grad_norm": 1.1844597332700677, "learning_rate": 8.854209362099814e-06, - "loss": 0.1618, + "loss": 0.1604, "step": 1607 }, { "epoch": 0.21997264021887825, - "grad_norm": 1.2347046080760713, + "grad_norm": 1.2386045477749223, "learning_rate": 8.852840143084148e-06, - "loss": 0.2085, + "loss": 0.214, "step": 1608 }, { "epoch": 0.220109439124487, - "grad_norm": 1.4835615969977576, + "grad_norm": 1.478344814269661, "learning_rate": 8.851470212452038e-06, - "loss": 0.2198, + "loss": 0.2217, "step": 1609 }, { "epoch": 0.22024623803009577, - "grad_norm": 1.4331340501725174, + "grad_norm": 1.4325770074945012, "learning_rate": 8.85009957045651e-06, - "loss": 0.2063, + "loss": 0.2068, "step": 1610 }, { "epoch": 0.22038303693570452, - "grad_norm": 1.2552240231280218, + "grad_norm": 1.2058200295785182, "learning_rate": 8.84872821735072e-06, - "loss": 0.192, + "loss": 0.1889, "step": 1611 }, { "epoch": 0.22051983584131327, - "grad_norm": 1.3246397263888758, + "grad_norm": 1.3056070467271756, "learning_rate": 8.847356153387955e-06, - "loss": 0.1861, + "loss": 0.1874, "step": 1612 }, { "epoch": 0.22065663474692201, - "grad_norm": 1.2640058549618478, + "grad_norm": 1.2263464877123325, "learning_rate": 8.845983378821635e-06, - "loss": 0.1925, + "loss": 0.191, "step": 1613 }, { "epoch": 0.2207934336525308, - "grad_norm": 1.5375024716103673, + "grad_norm": 1.555288384214261, "learning_rate": 8.844609893905309e-06, - "loss": 0.2413, + "loss": 0.242, "step": 1614 }, { "epoch": 0.22093023255813954, - "grad_norm": 1.3283370189574395, + "grad_norm": 1.3016469149042653, "learning_rate": 8.843235698892661e-06, - "loss": 0.1815, + "loss": 0.1797, "step": 1615 }, { "epoch": 0.2210670314637483, - "grad_norm": 1.251586531290721, + "grad_norm": 1.2364585041506169, "learning_rate": 8.841860794037502e-06, - "loss": 0.1807, + "loss": 0.1788, "step": 1616 }, { "epoch": 0.22120383036935704, - "grad_norm": 1.5047215256957207, + "grad_norm": 1.4874386141974862, "learning_rate": 8.840485179593777e-06, - "loss": 0.1914, + "loss": 0.1945, "step": 1617 }, { "epoch": 0.2213406292749658, - "grad_norm": 1.2598792715450708, + "grad_norm": 1.2353647887569887, "learning_rate": 8.839108855815557e-06, - "loss": 0.205, + "loss": 0.2039, "step": 1618 }, { "epoch": 0.22147742818057456, - "grad_norm": 1.3144635772697557, + "grad_norm": 1.2583005839138977, "learning_rate": 8.837731822957054e-06, - "loss": 0.2272, + "loss": 0.2216, "step": 1619 }, { "epoch": 0.2216142270861833, - "grad_norm": 1.2539823862970285, + "grad_norm": 1.2365576922730714, "learning_rate": 8.8363540812726e-06, - "loss": 0.2003, + "loss": 0.2002, "step": 1620 }, { "epoch": 0.22175102599179206, - "grad_norm": 1.5203126494589638, + "grad_norm": 1.4950529998376672, "learning_rate": 8.834975631016665e-06, - "loss": 0.2515, + "loss": 0.2506, "step": 1621 }, { "epoch": 0.22188782489740083, - "grad_norm": 1.6213600158598984, + "grad_norm": 1.550358979201286, "learning_rate": 8.833596472443848e-06, - "loss": 0.1952, + "loss": 0.1968, "step": 1622 }, { "epoch": 0.22202462380300958, - "grad_norm": 1.309667427643188, + "grad_norm": 1.2635593135269017, "learning_rate": 8.832216605808876e-06, - "loss": 0.2085, + "loss": 0.2087, "step": 1623 }, { "epoch": 0.22216142270861833, - "grad_norm": 1.3484473565941613, + "grad_norm": 1.307040074134342, "learning_rate": 8.830836031366613e-06, - "loss": 0.2355, + "loss": 0.2344, "step": 1624 }, { "epoch": 0.22229822161422708, - "grad_norm": 1.6154068110252033, + "grad_norm": 1.5819324034541347, "learning_rate": 8.829454749372046e-06, - "loss": 0.2334, + "loss": 0.2307, "step": 1625 }, { "epoch": 0.22243502051983585, - "grad_norm": 1.3115226217882947, + "grad_norm": 1.296883743674118, "learning_rate": 8.8280727600803e-06, - "loss": 0.2223, + "loss": 0.2206, "step": 1626 }, { "epoch": 0.2225718194254446, - "grad_norm": 1.1931030103856806, + "grad_norm": 1.1596529933873572, "learning_rate": 8.826690063746626e-06, - "loss": 0.1601, + "loss": 0.1584, "step": 1627 }, { "epoch": 0.22270861833105335, - "grad_norm": 1.598491735169963, + "grad_norm": 1.5721239786631143, "learning_rate": 8.825306660626405e-06, - "loss": 0.2658, + "loss": 0.2667, "step": 1628 }, { "epoch": 0.2228454172366621, - "grad_norm": 1.3790725407594324, + "grad_norm": 1.3556522530837543, "learning_rate": 8.823922550975153e-06, - "loss": 0.2247, + "loss": 0.2258, "step": 1629 }, { "epoch": 0.22298221614227087, - "grad_norm": 1.7409971275817715, + "grad_norm": 1.6844549042942067, "learning_rate": 8.822537735048512e-06, - "loss": 0.2402, + "loss": 0.2397, "step": 1630 }, { "epoch": 0.22311901504787962, - "grad_norm": 1.5720632998856958, + "grad_norm": 1.5282888259255154, "learning_rate": 8.82115221310226e-06, - "loss": 0.236, + "loss": 0.2341, "step": 1631 }, { "epoch": 0.22325581395348837, - "grad_norm": 1.0955866175693951, + "grad_norm": 1.0852186984588386, "learning_rate": 8.819765985392297e-06, - "loss": 0.154, + "loss": 0.1574, "step": 1632 }, { "epoch": 0.22339261285909712, - "grad_norm": 1.3049863543304907, + "grad_norm": 1.2936281452503213, "learning_rate": 8.818379052174659e-06, - "loss": 0.1956, + "loss": 0.1959, "step": 1633 }, { "epoch": 0.2235294117647059, - "grad_norm": 1.0871881750427643, + "grad_norm": 1.0756293776169135, "learning_rate": 8.816991413705515e-06, - "loss": 0.1363, + "loss": 0.1365, "step": 1634 }, { "epoch": 0.22366621067031464, - "grad_norm": 1.2367191955828098, + "grad_norm": 1.2107043677935978, "learning_rate": 8.815603070241158e-06, - "loss": 0.2099, + "loss": 0.211, "step": 1635 }, { "epoch": 0.2238030095759234, - "grad_norm": 1.700496555683887, + "grad_norm": 1.701574800127789, "learning_rate": 8.814214022038014e-06, - "loss": 0.2764, + "loss": 0.2818, "step": 1636 }, { "epoch": 0.22393980848153214, - "grad_norm": 1.3799709618414346, + "grad_norm": 1.3415572099494988, "learning_rate": 8.812824269352642e-06, - "loss": 0.1921, + "loss": 0.1918, "step": 1637 }, { "epoch": 0.2240766073871409, - "grad_norm": 1.2435199814493247, + "grad_norm": 1.2222201045002163, "learning_rate": 8.811433812441723e-06, - "loss": 0.189, + "loss": 0.19, "step": 1638 }, { "epoch": 0.22421340629274966, - "grad_norm": 1.7462361549215681, + "grad_norm": 1.6800094445482603, "learning_rate": 8.810042651562077e-06, - "loss": 0.2645, + "loss": 0.2694, "step": 1639 }, { "epoch": 0.2243502051983584, - "grad_norm": 1.4886545194422534, + "grad_norm": 1.4884319810729216, "learning_rate": 8.808650786970648e-06, - "loss": 0.2187, + "loss": 0.2158, "step": 1640 }, { "epoch": 0.22448700410396716, - "grad_norm": 1.372623272578931, + "grad_norm": 1.326150026528784, "learning_rate": 8.807258218924515e-06, - "loss": 0.1942, + "loss": 0.1911, "step": 1641 }, { "epoch": 0.22462380300957593, - "grad_norm": 1.4138605351123275, + "grad_norm": 1.4069708447589573, "learning_rate": 8.805864947680881e-06, - "loss": 0.2091, + "loss": 0.2057, "step": 1642 }, { "epoch": 0.22476060191518468, - "grad_norm": 1.5057482617973983, + "grad_norm": 1.4454039839981812, "learning_rate": 8.804470973497085e-06, - "loss": 0.2515, + "loss": 0.2491, "step": 1643 }, { "epoch": 0.22489740082079343, - "grad_norm": 1.3256064943647223, + "grad_norm": 1.2976347557074415, "learning_rate": 8.80307629663059e-06, - "loss": 0.1931, + "loss": 0.1899, "step": 1644 }, { "epoch": 0.22503419972640218, - "grad_norm": 1.3324785525753784, + "grad_norm": 1.2897371529891524, "learning_rate": 8.801680917338995e-06, - "loss": 0.1785, + "loss": 0.1779, "step": 1645 }, { "epoch": 0.22517099863201095, - "grad_norm": 1.7097520445949894, + "grad_norm": 1.6833904016260088, "learning_rate": 8.800284835880024e-06, - "loss": 0.2049, + "loss": 0.2089, "step": 1646 }, { "epoch": 0.2253077975376197, - "grad_norm": 1.4512887730332678, + "grad_norm": 1.3984543879236064, "learning_rate": 8.79888805251153e-06, - "loss": 0.2651, + "loss": 0.2613, "step": 1647 }, { "epoch": 0.22544459644322845, - "grad_norm": 1.8606277970143654, + "grad_norm": 1.8304565407114917, "learning_rate": 8.797490567491501e-06, - "loss": 0.2252, + "loss": 0.2235, "step": 1648 }, { "epoch": 0.2255813953488372, - "grad_norm": 1.3370012962777609, + "grad_norm": 1.308941116153925, "learning_rate": 8.79609238107805e-06, - "loss": 0.2108, + "loss": 0.2082, "step": 1649 }, { "epoch": 0.22571819425444598, - "grad_norm": 1.4259868575823342, + "grad_norm": 1.399291060295035, "learning_rate": 8.79469349352942e-06, - "loss": 0.2467, + "loss": 0.2453, "step": 1650 }, { "epoch": 0.22585499316005472, - "grad_norm": 1.2815014483786216, + "grad_norm": 1.2526415322626145, "learning_rate": 8.793293905103986e-06, - "loss": 0.2276, + "loss": 0.2261, "step": 1651 }, { "epoch": 0.22599179206566347, - "grad_norm": 1.3499910655497551, + "grad_norm": 1.3099437334650614, "learning_rate": 8.79189361606025e-06, - "loss": 0.2064, + "loss": 0.2075, "step": 1652 }, { "epoch": 0.22612859097127222, - "grad_norm": 1.4549423879870447, + "grad_norm": 1.4198971070390884, "learning_rate": 8.790492626656843e-06, - "loss": 0.2575, + "loss": 0.2583, "step": 1653 }, { "epoch": 0.226265389876881, - "grad_norm": 1.5637872204661512, + "grad_norm": 1.519626777601102, "learning_rate": 8.78909093715253e-06, - "loss": 0.2581, + "loss": 0.2568, "step": 1654 }, { "epoch": 0.22640218878248974, - "grad_norm": 1.3613986745016144, + "grad_norm": 1.3523737844010735, "learning_rate": 8.787688547806198e-06, - "loss": 0.2259, + "loss": 0.2277, "step": 1655 }, { "epoch": 0.2265389876880985, - "grad_norm": 1.265238986357645, + "grad_norm": 1.235802419475333, "learning_rate": 8.78628545887687e-06, "loss": 0.2249, "step": 1656 }, { "epoch": 0.22667578659370724, - "grad_norm": 1.5385508915407144, + "grad_norm": 1.4638053907159174, "learning_rate": 8.784881670623693e-06, - "loss": 0.2143, + "loss": 0.2076, "step": 1657 }, { "epoch": 0.22681258549931602, - "grad_norm": 1.6871643085282075, + "grad_norm": 1.7076411169269816, "learning_rate": 8.783477183305948e-06, - "loss": 0.291, + "loss": 0.2901, "step": 1658 }, { "epoch": 0.22694938440492476, - "grad_norm": 1.4267537748649473, + "grad_norm": 1.3826906379686337, "learning_rate": 8.782071997183041e-06, - "loss": 0.2169, + "loss": 0.215, "step": 1659 }, { "epoch": 0.2270861833105335, - "grad_norm": 1.2410398060657717, + "grad_norm": 1.2386181240075094, "learning_rate": 8.780666112514511e-06, - "loss": 0.2046, + "loss": 0.206, "step": 1660 }, { "epoch": 0.22722298221614226, - "grad_norm": 1.3908903354586684, + "grad_norm": 1.3741594817704859, "learning_rate": 8.779259529560022e-06, - "loss": 0.1979, + "loss": 0.1993, "step": 1661 }, { "epoch": 0.22735978112175104, - "grad_norm": 1.2646458606308197, + "grad_norm": 1.2007505180632698, "learning_rate": 8.777852248579367e-06, - "loss": 0.205, + "loss": 0.204, "step": 1662 }, { "epoch": 0.22749658002735978, - "grad_norm": 1.1556826217678315, + "grad_norm": 1.1416901707295986, "learning_rate": 8.776444269832472e-06, - "loss": 0.2023, + "loss": 0.2033, "step": 1663 }, { "epoch": 0.22763337893296853, - "grad_norm": 1.5814750081851932, + "grad_norm": 1.5403234353563795, "learning_rate": 8.77503559357939e-06, - "loss": 0.2236, + "loss": 0.224, "step": 1664 }, { "epoch": 0.22777017783857728, - "grad_norm": 1.4444433964246939, + "grad_norm": 1.4002203217226996, "learning_rate": 8.7736262200803e-06, - "loss": 0.2304, + "loss": 0.229, "step": 1665 }, { "epoch": 0.22790697674418606, - "grad_norm": 1.2623774114307094, + "grad_norm": 1.2406150302389907, "learning_rate": 8.772216149595515e-06, - "loss": 0.183, + "loss": 0.1807, "step": 1666 }, { "epoch": 0.2280437756497948, - "grad_norm": 1.2603818465948693, + "grad_norm": 1.2290027886273673, "learning_rate": 8.770805382385471e-06, - "loss": 0.1914, + "loss": 0.1904, "step": 1667 }, { "epoch": 0.22818057455540355, - "grad_norm": 1.4138615633530929, + "grad_norm": 1.3864284836675693, "learning_rate": 8.769393918710737e-06, - "loss": 0.1949, + "loss": 0.1937, "step": 1668 }, { "epoch": 0.2283173734610123, - "grad_norm": 1.4498837470780122, + "grad_norm": 1.398543385327912, "learning_rate": 8.76798175883201e-06, - "loss": 0.2258, + "loss": 0.2269, "step": 1669 }, { "epoch": 0.22845417236662108, - "grad_norm": 1.45517614133694, + "grad_norm": 1.4209774449567405, "learning_rate": 8.766568903010113e-06, - "loss": 0.2225, + "loss": 0.2206, "step": 1670 }, { "epoch": 0.22859097127222983, - "grad_norm": 1.39932882262201, + "grad_norm": 1.3605554220646028, "learning_rate": 8.765155351506002e-06, - "loss": 0.2078, + "loss": 0.2086, "step": 1671 }, { "epoch": 0.22872777017783857, - "grad_norm": 1.220117283249115, + "grad_norm": 1.195442900633287, "learning_rate": 8.763741104580755e-06, - "loss": 0.1696, + "loss": 0.1698, "step": 1672 }, { "epoch": 0.22886456908344732, - "grad_norm": 1.114346975417978, + "grad_norm": 1.0940344954014505, "learning_rate": 8.762326162495585e-06, - "loss": 0.1762, + "loss": 0.1752, "step": 1673 }, { "epoch": 0.2290013679890561, - "grad_norm": 1.0513915280623867, + "grad_norm": 1.0332935368071487, "learning_rate": 8.76091052551183e-06, - "loss": 0.1887, + "loss": 0.1878, "step": 1674 }, { "epoch": 0.22913816689466485, - "grad_norm": 1.5162882539384597, + "grad_norm": 1.495660847544595, "learning_rate": 8.759494193890958e-06, - "loss": 0.2733, + "loss": 0.274, "step": 1675 }, { "epoch": 0.2292749658002736, - "grad_norm": 1.404022525592945, + "grad_norm": 1.388145097098709, "learning_rate": 8.758077167894562e-06, - "loss": 0.2313, + "loss": 0.2327, "step": 1676 }, { "epoch": 0.22941176470588234, - "grad_norm": 1.673448636248197, + "grad_norm": 1.664220400020878, "learning_rate": 8.756659447784367e-06, - "loss": 0.2076, + "loss": 0.2092, "step": 1677 }, { "epoch": 0.22954856361149112, - "grad_norm": 1.373309785012038, + "grad_norm": 1.3481008411135573, "learning_rate": 8.755241033822225e-06, - "loss": 0.226, + "loss": 0.2284, "step": 1678 }, { "epoch": 0.22968536251709987, - "grad_norm": 1.4397225220295007, + "grad_norm": 1.4054910284966935, "learning_rate": 8.753821926270115e-06, - "loss": 0.1972, + "loss": 0.1988, "step": 1679 }, { "epoch": 0.22982216142270862, - "grad_norm": 1.5107436027782986, + "grad_norm": 1.4812239753585132, "learning_rate": 8.752402125390146e-06, - "loss": 0.2586, + "loss": 0.2579, "step": 1680 }, { "epoch": 0.22995896032831736, - "grad_norm": 1.2713558818709625, + "grad_norm": 1.2329542811259768, "learning_rate": 8.750981631444552e-06, - "loss": 0.1958, + "loss": 0.1951, "step": 1681 }, { "epoch": 0.23009575923392614, - "grad_norm": 1.026478643329133, + "grad_norm": 1.014291599651125, "learning_rate": 8.749560444695701e-06, - "loss": 0.1832, + "loss": 0.1844, "step": 1682 }, { "epoch": 0.2302325581395349, - "grad_norm": 1.3874701492431492, + "grad_norm": 1.3842271113818043, "learning_rate": 8.74813856540608e-06, - "loss": 0.2188, + "loss": 0.2202, "step": 1683 }, { "epoch": 0.23036935704514364, - "grad_norm": 1.4247049185253517, + "grad_norm": 1.386007541932447, "learning_rate": 8.746715993838314e-06, - "loss": 0.2279, + "loss": 0.228, "step": 1684 }, { "epoch": 0.23050615595075238, - "grad_norm": 1.6599864674181937, + "grad_norm": 1.6538512994045953, "learning_rate": 8.745292730255148e-06, - "loss": 0.2244, + "loss": 0.2284, "step": 1685 }, { "epoch": 0.23064295485636116, - "grad_norm": 1.4460841121385126, + "grad_norm": 1.4230764692208748, "learning_rate": 8.743868774919458e-06, - "loss": 0.2165, + "loss": 0.2175, "step": 1686 }, { "epoch": 0.2307797537619699, - "grad_norm": 1.3834498639825887, + "grad_norm": 1.3599148209230698, "learning_rate": 8.742444128094246e-06, - "loss": 0.1772, + "loss": 0.1797, "step": 1687 }, { "epoch": 0.23091655266757866, - "grad_norm": 1.4599484723033145, + "grad_norm": 1.4251075992055684, "learning_rate": 8.741018790042645e-06, - "loss": 0.2275, + "loss": 0.2277, "step": 1688 }, { "epoch": 0.2310533515731874, - "grad_norm": 1.5492916498665665, + "grad_norm": 1.5056303748292887, "learning_rate": 8.739592761027915e-06, - "loss": 0.2848, + "loss": 0.2856, "step": 1689 }, { "epoch": 0.23119015047879618, - "grad_norm": 1.525646259000192, + "grad_norm": 1.5221405233837768, "learning_rate": 8.73816604131344e-06, - "loss": 0.2328, + "loss": 0.233, "step": 1690 }, { "epoch": 0.23132694938440493, - "grad_norm": 1.4588561325130058, + "grad_norm": 1.4061397693722988, "learning_rate": 8.736738631162733e-06, - "loss": 0.2525, + "loss": 0.2449, "step": 1691 }, { "epoch": 0.23146374829001368, - "grad_norm": 1.4392122526669835, + "grad_norm": 1.4048181828890285, "learning_rate": 8.73531053083944e-06, - "loss": 0.2245, + "loss": 0.2296, "step": 1692 }, { "epoch": 0.23160054719562242, - "grad_norm": 1.3268706195654527, + "grad_norm": 1.3124560653474944, "learning_rate": 8.733881740607326e-06, - "loss": 0.1843, + "loss": 0.1854, "step": 1693 }, { "epoch": 0.2317373461012312, - "grad_norm": 1.2395646976159458, + "grad_norm": 1.2247906958991766, "learning_rate": 8.732452260730287e-06, - "loss": 0.1996, + "loss": 0.2013, "step": 1694 }, { "epoch": 0.23187414500683995, - "grad_norm": 1.306878674404851, + "grad_norm": 1.2895980381254917, "learning_rate": 8.73102209147235e-06, - "loss": 0.1869, + "loss": 0.1875, "step": 1695 }, { "epoch": 0.2320109439124487, - "grad_norm": 1.4740113873787717, + "grad_norm": 1.4360963825544844, "learning_rate": 8.729591233097664e-06, - "loss": 0.1714, + "loss": 0.1703, "step": 1696 }, { "epoch": 0.23214774281805745, - "grad_norm": 1.2131178002571232, + "grad_norm": 1.1639269331308904, "learning_rate": 8.728159685870508e-06, - "loss": 0.1755, + "loss": 0.1719, "step": 1697 }, { "epoch": 0.23228454172366622, - "grad_norm": 1.3899872393653883, + "grad_norm": 1.3716975471230988, "learning_rate": 8.726727450055288e-06, - "loss": 0.2173, + "loss": 0.2155, "step": 1698 }, { "epoch": 0.23242134062927497, - "grad_norm": 1.6404057382684512, + "grad_norm": 1.5972279249199979, "learning_rate": 8.725294525916534e-06, - "loss": 0.2787, + "loss": 0.2768, "step": 1699 }, { "epoch": 0.23255813953488372, - "grad_norm": 1.7120620532691977, + "grad_norm": 1.6687390332862462, "learning_rate": 8.72386091371891e-06, - "loss": 0.2339, + "loss": 0.2319, "step": 1700 }, { "epoch": 0.23255813953488372, - "eval_loss": 0.2102040946483612, - "eval_runtime": 5.8987, - "eval_samples_per_second": 5.086, - "eval_steps_per_second": 1.356, + "eval_loss": 0.2108483761548996, + "eval_runtime": 5.9158, + "eval_samples_per_second": 5.071, + "eval_steps_per_second": 1.352, "step": 1700 }, { "epoch": 0.23269493844049247, - "grad_norm": 1.560025847139737, + "grad_norm": 1.5658755318898596, "learning_rate": 8.722426613727201e-06, - "loss": 0.2189, + "loss": 0.2243, "step": 1701 }, { "epoch": 0.23283173734610124, - "grad_norm": 1.2351644250685365, + "grad_norm": 1.2104232053581576, "learning_rate": 8.72099162620632e-06, - "loss": 0.1696, + "loss": 0.1685, "step": 1702 }, { "epoch": 0.23296853625171, - "grad_norm": 1.5187103986314356, + "grad_norm": 1.511063141713176, "learning_rate": 8.719555951421312e-06, - "loss": 0.2255, + "loss": 0.2236, "step": 1703 }, { "epoch": 0.23310533515731874, - "grad_norm": 1.4722134435519147, + "grad_norm": 1.448257121915469, "learning_rate": 8.71811958963734e-06, - "loss": 0.2458, + "loss": 0.2473, "step": 1704 }, { "epoch": 0.2332421340629275, - "grad_norm": 1.3937932271576874, + "grad_norm": 1.368992052079563, "learning_rate": 8.7166825411197e-06, - "loss": 0.2086, + "loss": 0.2081, "step": 1705 }, { "epoch": 0.23337893296853626, - "grad_norm": 1.181314781482352, + "grad_norm": 1.17611947672189, "learning_rate": 8.715244806133817e-06, - "loss": 0.1759, + "loss": 0.1758, "step": 1706 }, { "epoch": 0.233515731874145, - "grad_norm": 1.086935466744372, + "grad_norm": 1.0724208860390692, "learning_rate": 8.713806384945235e-06, - "loss": 0.1755, + "loss": 0.1789, "step": 1707 }, { "epoch": 0.23365253077975376, - "grad_norm": 1.5582857621662283, + "grad_norm": 1.4795118343289813, "learning_rate": 8.712367277819635e-06, - "loss": 0.2245, + "loss": 0.2256, "step": 1708 }, { "epoch": 0.2337893296853625, - "grad_norm": 1.3847738044995697, + "grad_norm": 1.369981966860982, "learning_rate": 8.710927485022813e-06, - "loss": 0.2151, + "loss": 0.2149, "step": 1709 }, { "epoch": 0.23392612859097128, - "grad_norm": 1.3495547718710994, + "grad_norm": 1.3461390916192437, "learning_rate": 8.7094870068207e-06, - "loss": 0.1879, + "loss": 0.1896, "step": 1710 }, { "epoch": 0.23406292749658003, - "grad_norm": 1.521031168057449, + "grad_norm": 1.4563265244139927, "learning_rate": 8.708045843479351e-06, - "loss": 0.1891, + "loss": 0.1864, "step": 1711 }, { "epoch": 0.23419972640218878, - "grad_norm": 1.6906753770639737, + "grad_norm": 1.6881714508603898, "learning_rate": 8.706603995264949e-06, - "loss": 0.2591, + "loss": 0.2578, "step": 1712 }, { "epoch": 0.23433652530779753, - "grad_norm": 1.6425731704620399, + "grad_norm": 1.5965896510472415, "learning_rate": 8.705161462443798e-06, - "loss": 0.2575, + "loss": 0.2549, "step": 1713 }, { "epoch": 0.2344733242134063, - "grad_norm": 1.3111390410238195, + "grad_norm": 1.2664252687151847, "learning_rate": 8.703718245282337e-06, - "loss": 0.2121, + "loss": 0.2099, "step": 1714 }, { "epoch": 0.23461012311901505, - "grad_norm": 1.4160763073357487, + "grad_norm": 1.384994354073245, "learning_rate": 8.702274344047128e-06, - "loss": 0.2336, + "loss": 0.2322, "step": 1715 }, { "epoch": 0.2347469220246238, - "grad_norm": 1.010798897638915, + "grad_norm": 0.9987693879339531, "learning_rate": 8.700829759004854e-06, - "loss": 0.164, + "loss": 0.1658, "step": 1716 }, { "epoch": 0.23488372093023255, - "grad_norm": 1.1262080729959327, + "grad_norm": 1.1117847990590914, "learning_rate": 8.699384490422332e-06, - "loss": 0.1669, + "loss": 0.166, "step": 1717 }, { "epoch": 0.23502051983584132, - "grad_norm": 1.3507862865342166, + "grad_norm": 1.2244504879658336, "learning_rate": 8.697938538566498e-06, - "loss": 0.2033, + "loss": 0.2034, "step": 1718 }, { "epoch": 0.23515731874145007, - "grad_norm": 1.265553128288898, + "grad_norm": 1.2356737250696888, "learning_rate": 8.696491903704423e-06, - "loss": 0.1891, + "loss": 0.1874, "step": 1719 }, { "epoch": 0.23529411764705882, - "grad_norm": 1.4944376018764904, + "grad_norm": 1.4943729575182056, "learning_rate": 8.695044586103297e-06, - "loss": 0.2013, + "loss": 0.2028, "step": 1720 }, { "epoch": 0.23543091655266757, - "grad_norm": 1.2461157893232282, + "grad_norm": 1.2324925690371866, "learning_rate": 8.693596586030437e-06, - "loss": 0.2068, + "loss": 0.2062, "step": 1721 }, { "epoch": 0.23556771545827634, - "grad_norm": 1.4980486813499678, + "grad_norm": 1.4232895758558157, "learning_rate": 8.69214790375329e-06, - "loss": 0.2021, + "loss": 0.1979, "step": 1722 }, { "epoch": 0.2357045143638851, - "grad_norm": 1.1831563923935735, + "grad_norm": 1.168471252786152, "learning_rate": 8.690698539539426e-06, - "loss": 0.1952, + "loss": 0.1922, "step": 1723 }, { "epoch": 0.23584131326949384, - "grad_norm": 1.6091227212411385, + "grad_norm": 1.6494905772765032, "learning_rate": 8.689248493656539e-06, - "loss": 0.2218, + "loss": 0.2234, "step": 1724 }, { "epoch": 0.2359781121751026, - "grad_norm": 1.466044204445715, + "grad_norm": 1.441518060962051, "learning_rate": 8.687797766372455e-06, - "loss": 0.2331, + "loss": 0.2304, "step": 1725 }, { "epoch": 0.23611491108071136, - "grad_norm": 1.3034583220418634, + "grad_norm": 1.3023728635951883, "learning_rate": 8.686346357955118e-06, - "loss": 0.1689, + "loss": 0.1704, "step": 1726 }, { "epoch": 0.2362517099863201, - "grad_norm": 1.174272993978214, + "grad_norm": 1.1592652621155608, "learning_rate": 8.684894268672604e-06, - "loss": 0.1776, + "loss": 0.1788, "step": 1727 }, { "epoch": 0.23638850889192886, - "grad_norm": 1.3645197539828968, + "grad_norm": 1.3319755887074882, "learning_rate": 8.683441498793114e-06, - "loss": 0.1872, + "loss": 0.186, "step": 1728 }, { "epoch": 0.2365253077975376, - "grad_norm": 1.553565947223099, + "grad_norm": 1.5164062460577474, "learning_rate": 8.681988048584973e-06, - "loss": 0.242, + "loss": 0.2415, "step": 1729 }, { "epoch": 0.23666210670314639, - "grad_norm": 1.1189135027625372, + "grad_norm": 1.1462908216861336, "learning_rate": 8.68053391831663e-06, - "loss": 0.2005, + "loss": 0.2022, "step": 1730 }, { "epoch": 0.23679890560875513, - "grad_norm": 1.3865306141132407, + "grad_norm": 1.3449010517726432, "learning_rate": 8.679079108256664e-06, - "loss": 0.2029, + "loss": 0.2004, "step": 1731 }, { "epoch": 0.23693570451436388, - "grad_norm": 1.2721422276722965, + "grad_norm": 1.23643419538652, "learning_rate": 8.677623618673775e-06, - "loss": 0.2202, + "loss": 0.2201, "step": 1732 }, { "epoch": 0.23707250341997263, - "grad_norm": 1.5178445336143904, + "grad_norm": 1.4914825068352395, "learning_rate": 8.676167449836794e-06, - "loss": 0.2076, + "loss": 0.204, "step": 1733 }, { "epoch": 0.2372093023255814, - "grad_norm": 1.27371948348964, + "grad_norm": 1.2640079901939685, "learning_rate": 8.674710602014672e-06, - "loss": 0.2015, + "loss": 0.2007, "step": 1734 }, { "epoch": 0.23734610123119015, - "grad_norm": 1.2612405009975913, + "grad_norm": 1.2198432914208701, "learning_rate": 8.673253075476484e-06, - "loss": 0.1947, + "loss": 0.1961, "step": 1735 }, { "epoch": 0.2374829001367989, - "grad_norm": 1.286855085704633, + "grad_norm": 1.2829643571555251, "learning_rate": 8.67179487049144e-06, - "loss": 0.2013, + "loss": 0.2027, "step": 1736 }, { "epoch": 0.23761969904240765, - "grad_norm": 1.1971895618584785, + "grad_norm": 1.16449669161053, "learning_rate": 8.670335987328869e-06, - "loss": 0.2188, + "loss": 0.2174, "step": 1737 }, { "epoch": 0.23775649794801643, - "grad_norm": 1.7222843355090875, + "grad_norm": 1.3173932217828577, "learning_rate": 8.668876426258221e-06, - "loss": 0.2006, + "loss": 0.2019, "step": 1738 }, { "epoch": 0.23789329685362517, - "grad_norm": 1.3704648446381955, + "grad_norm": 1.3625092679963393, "learning_rate": 8.66741618754908e-06, - "loss": 0.2155, + "loss": 0.2173, "step": 1739 }, { "epoch": 0.23803009575923392, - "grad_norm": 1.2319297888259766, + "grad_norm": 1.2191541491951634, "learning_rate": 8.665955271471149e-06, - "loss": 0.1767, + "loss": 0.1728, "step": 1740 }, { "epoch": 0.23816689466484267, - "grad_norm": 1.3776013258327884, + "grad_norm": 1.2693214845891727, "learning_rate": 8.664493678294257e-06, - "loss": 0.1963, + "loss": 0.1931, "step": 1741 }, { "epoch": 0.23830369357045145, - "grad_norm": 1.413039202262535, + "grad_norm": 1.3701899247301634, "learning_rate": 8.66303140828836e-06, "loss": 0.2206, "step": 1742 }, { "epoch": 0.2384404924760602, - "grad_norm": 1.3888933748825696, + "grad_norm": 1.3948122455323708, "learning_rate": 8.66156846172354e-06, - "loss": 0.2058, + "loss": 0.2073, "step": 1743 }, { "epoch": 0.23857729138166894, - "grad_norm": 1.5315055457624995, + "grad_norm": 1.5066133376901711, "learning_rate": 8.66010483887e-06, - "loss": 0.238, + "loss": 0.2344, "step": 1744 }, { "epoch": 0.2387140902872777, - "grad_norm": 1.3693401031707138, + "grad_norm": 1.3407385701286374, "learning_rate": 8.658640539998071e-06, - "loss": 0.1759, + "loss": 0.1721, "step": 1745 }, { "epoch": 0.23885088919288647, - "grad_norm": 1.4842334810301576, + "grad_norm": 1.4502041324730401, "learning_rate": 8.657175565378206e-06, - "loss": 0.2188, + "loss": 0.2163, "step": 1746 }, { "epoch": 0.23898768809849522, - "grad_norm": 1.5237580649122913, + "grad_norm": 1.5294637613674111, "learning_rate": 8.655709915280986e-06, - "loss": 0.2248, + "loss": 0.2241, "step": 1747 }, { "epoch": 0.23912448700410396, - "grad_norm": 1.1606517296376997, + "grad_norm": 1.2122075883869596, "learning_rate": 8.654243589977117e-06, - "loss": 0.1644, + "loss": 0.1653, "step": 1748 }, { "epoch": 0.2392612859097127, - "grad_norm": 1.2910237367213127, + "grad_norm": 1.2563833015050865, "learning_rate": 8.652776589737424e-06, - "loss": 0.1878, + "loss": 0.1871, "step": 1749 }, { "epoch": 0.2393980848153215, - "grad_norm": 1.4220981237078474, + "grad_norm": 1.4135731803372764, "learning_rate": 8.651308914832863e-06, - "loss": 0.2143, + "loss": 0.2164, "step": 1750 }, { "epoch": 0.23953488372093024, - "grad_norm": 1.3829120673863933, + "grad_norm": 1.3614550377839354, "learning_rate": 8.649840565534513e-06, - "loss": 0.1863, + "loss": 0.1867, "step": 1751 }, { "epoch": 0.23967168262653898, - "grad_norm": 1.5422353463543292, + "grad_norm": 1.4924674443533037, "learning_rate": 8.648371542113576e-06, - "loss": 0.2565, + "loss": 0.2515, "step": 1752 }, { "epoch": 0.23980848153214773, - "grad_norm": 1.0724191877155629, + "grad_norm": 1.0155690602520486, "learning_rate": 8.64690184484138e-06, - "loss": 0.1466, + "loss": 0.144, "step": 1753 }, { "epoch": 0.2399452804377565, - "grad_norm": 1.4136820897915578, + "grad_norm": 1.4116223460752142, "learning_rate": 8.645431473989376e-06, - "loss": 0.2517, + "loss": 0.2502, "step": 1754 }, { "epoch": 0.24008207934336526, - "grad_norm": 1.2160562663365724, + "grad_norm": 1.1809935950037, "learning_rate": 8.643960429829142e-06, - "loss": 0.196, + "loss": 0.195, "step": 1755 }, { "epoch": 0.240218878248974, - "grad_norm": 1.1930693558641423, + "grad_norm": 1.2508987508006837, "learning_rate": 8.642488712632377e-06, - "loss": 0.1814, + "loss": 0.1823, "step": 1756 }, { "epoch": 0.24035567715458275, - "grad_norm": 1.1776237288187956, + "grad_norm": 1.1674555922899017, "learning_rate": 8.641016322670907e-06, - "loss": 0.2201, + "loss": 0.2185, "step": 1757 }, { "epoch": 0.24049247606019153, - "grad_norm": 1.413982339763288, + "grad_norm": 1.4031657972938243, "learning_rate": 8.63954326021668e-06, - "loss": 0.2342, + "loss": 0.2336, "step": 1758 }, { "epoch": 0.24062927496580028, - "grad_norm": 1.5792777286899877, + "grad_norm": 1.6431787867693122, "learning_rate": 8.638069525541773e-06, - "loss": 0.223, + "loss": 0.2259, "step": 1759 }, { "epoch": 0.24076607387140903, - "grad_norm": 1.1703947912053942, + "grad_norm": 1.141150559781098, "learning_rate": 8.636595118918378e-06, - "loss": 0.1692, + "loss": 0.1672, "step": 1760 }, { "epoch": 0.24090287277701777, - "grad_norm": 1.0526862475788576, + "grad_norm": 1.070607922404379, "learning_rate": 8.63512004061882e-06, - "loss": 0.1895, + "loss": 0.189, "step": 1761 }, { "epoch": 0.24103967168262655, - "grad_norm": 1.5270916564565062, + "grad_norm": 1.5661876013451361, "learning_rate": 8.633644290915545e-06, - "loss": 0.2348, + "loss": 0.2397, "step": 1762 }, { "epoch": 0.2411764705882353, - "grad_norm": 1.5339599502686982, + "grad_norm": 1.5080503380757828, "learning_rate": 8.632167870081122e-06, - "loss": 0.2081, + "loss": 0.207, "step": 1763 }, { "epoch": 0.24131326949384405, - "grad_norm": 1.3509071256930156, + "grad_norm": 1.3335936124791627, "learning_rate": 8.630690778388245e-06, - "loss": 0.1907, + "loss": 0.1892, "step": 1764 }, { "epoch": 0.2414500683994528, - "grad_norm": 1.2252881932597988, + "grad_norm": 1.1784313554263193, "learning_rate": 8.62921301610973e-06, - "loss": 0.2205, + "loss": 0.222, "step": 1765 }, { "epoch": 0.24158686730506157, - "grad_norm": 1.2676274137504775, + "grad_norm": 1.2382192361612347, "learning_rate": 8.62773458351852e-06, - "loss": 0.2145, + "loss": 0.2178, "step": 1766 }, { "epoch": 0.24172366621067032, - "grad_norm": 1.4451462003549294, + "grad_norm": 1.4091084704845769, "learning_rate": 8.626255480887681e-06, - "loss": 0.2201, + "loss": 0.2162, "step": 1767 }, { "epoch": 0.24186046511627907, - "grad_norm": 1.6662611267715306, + "grad_norm": 1.6651186337435062, "learning_rate": 8.624775708490403e-06, - "loss": 0.2279, + "loss": 0.2308, "step": 1768 }, { "epoch": 0.24199726402188781, - "grad_norm": 1.583535879885267, + "grad_norm": 1.5568224161315187, "learning_rate": 8.623295266599995e-06, - "loss": 0.1995, + "loss": 0.1951, "step": 1769 }, { "epoch": 0.2421340629274966, - "grad_norm": 1.4191547480526003, + "grad_norm": 1.3937246618475614, "learning_rate": 8.621814155489897e-06, - "loss": 0.1895, + "loss": 0.1832, "step": 1770 }, { "epoch": 0.24227086183310534, - "grad_norm": 1.4426793686896289, + "grad_norm": 1.4505821668741, "learning_rate": 8.620332375433664e-06, - "loss": 0.2244, + "loss": 0.2255, "step": 1771 }, { "epoch": 0.2424076607387141, - "grad_norm": 1.4883151264486754, + "grad_norm": 1.4931469397985584, "learning_rate": 8.618849926704987e-06, - "loss": 0.2022, + "loss": 0.2072, "step": 1772 }, { "epoch": 0.24254445964432284, - "grad_norm": 1.6177877453840885, + "grad_norm": 1.6060691415951682, "learning_rate": 8.617366809577666e-06, - "loss": 0.2793, + "loss": 0.2754, "step": 1773 }, { "epoch": 0.2426812585499316, - "grad_norm": 1.265145183964269, + "grad_norm": 1.2749297548810818, "learning_rate": 8.615883024325636e-06, - "loss": 0.1968, + "loss": 0.1957, "step": 1774 }, { "epoch": 0.24281805745554036, - "grad_norm": 1.0143643457901421, + "grad_norm": 1.0210229557054007, "learning_rate": 8.614398571222951e-06, - "loss": 0.1444, + "loss": 0.1445, "step": 1775 }, { "epoch": 0.2429548563611491, - "grad_norm": 1.1534766865989048, + "grad_norm": 1.1228549338600304, "learning_rate": 8.612913450543785e-06, - "loss": 0.1691, + "loss": 0.1684, "step": 1776 }, { "epoch": 0.24309165526675786, - "grad_norm": 1.3811357165625096, + "grad_norm": 1.3754695121744176, "learning_rate": 8.611427662562442e-06, - "loss": 0.2089, + "loss": 0.2092, "step": 1777 }, { "epoch": 0.24322845417236663, - "grad_norm": 1.5577359722876136, + "grad_norm": 1.5577735455891817, "learning_rate": 8.609941207553342e-06, - "loss": 0.24, + "loss": 0.2397, "step": 1778 }, { "epoch": 0.24336525307797538, - "grad_norm": 1.2726548926626187, + "grad_norm": 1.2253782836194325, "learning_rate": 8.608454085791036e-06, - "loss": 0.1983, + "loss": 0.1942, "step": 1779 }, { "epoch": 0.24350205198358413, - "grad_norm": 1.0668081402810137, + "grad_norm": 1.0602704434424381, "learning_rate": 8.606966297550192e-06, "loss": 0.1683, "step": 1780 }, { "epoch": 0.24363885088919288, - "grad_norm": 1.3296569581241153, + "grad_norm": 1.3066854466112559, "learning_rate": 8.605477843105604e-06, - "loss": 0.2145, + "loss": 0.2097, "step": 1781 }, { "epoch": 0.24377564979480165, - "grad_norm": 1.2913131933518465, + "grad_norm": 1.2540362290390283, "learning_rate": 8.603988722732187e-06, - "loss": 0.1656, + "loss": 0.1635, "step": 1782 }, { "epoch": 0.2439124487004104, - "grad_norm": 1.2205457129059543, + "grad_norm": 1.2166251993356876, "learning_rate": 8.602498936704982e-06, - "loss": 0.1687, + "loss": 0.1699, "step": 1783 }, { "epoch": 0.24404924760601915, - "grad_norm": 1.3170622667202698, + "grad_norm": 1.2852160627532905, "learning_rate": 8.601008485299151e-06, - "loss": 0.202, + "loss": 0.1985, "step": 1784 }, { "epoch": 0.2441860465116279, - "grad_norm": 1.2513559440663142, + "grad_norm": 1.2561353613139055, "learning_rate": 8.599517368789981e-06, - "loss": 0.1874, + "loss": 0.1901, "step": 1785 }, { "epoch": 0.24432284541723667, - "grad_norm": 1.3608137075658353, + "grad_norm": 1.362473446170697, "learning_rate": 8.598025587452873e-06, - "loss": 0.2115, + "loss": 0.212, "step": 1786 }, { "epoch": 0.24445964432284542, - "grad_norm": 1.6857591834759695, + "grad_norm": 1.711793288519279, "learning_rate": 8.596533141563368e-06, - "loss": 0.2489, + "loss": 0.2535, "step": 1787 }, { "epoch": 0.24459644322845417, - "grad_norm": 1.4317188188626997, + "grad_norm": 1.4509802661299958, "learning_rate": 8.59504003139711e-06, - "loss": 0.1984, + "loss": 0.1989, "step": 1788 }, { "epoch": 0.24473324213406292, - "grad_norm": 1.2653386258405697, + "grad_norm": 1.2236078455485393, "learning_rate": 8.593546257229882e-06, - "loss": 0.1998, + "loss": 0.1971, "step": 1789 }, { "epoch": 0.2448700410396717, - "grad_norm": 1.1692608611326365, + "grad_norm": 1.1593641036894475, "learning_rate": 8.592051819337578e-06, - "loss": 0.1938, + "loss": 0.1949, "step": 1790 }, { "epoch": 0.24500683994528044, - "grad_norm": 1.634112188358528, + "grad_norm": 1.6482455139098497, "learning_rate": 8.590556717996224e-06, - "loss": 0.2362, + "loss": 0.2376, "step": 1791 }, { "epoch": 0.2451436388508892, - "grad_norm": 1.5608695860287107, + "grad_norm": 1.5268027220625406, "learning_rate": 8.589060953481962e-06, - "loss": 0.2454, + "loss": 0.2427, "step": 1792 }, { "epoch": 0.24528043775649794, - "grad_norm": 1.1537118949166711, + "grad_norm": 1.1503779040441384, "learning_rate": 8.587564526071058e-06, - "loss": 0.1642, + "loss": 0.1638, "step": 1793 }, { "epoch": 0.2454172366621067, - "grad_norm": 1.1772668281116252, + "grad_norm": 1.1505545910028168, "learning_rate": 8.5860674360399e-06, - "loss": 0.1977, + "loss": 0.1958, "step": 1794 }, { "epoch": 0.24555403556771546, - "grad_norm": 1.518472057792923, + "grad_norm": 1.5018789061502924, "learning_rate": 8.584569683665e-06, - "loss": 0.2361, + "loss": 0.2316, "step": 1795 }, { "epoch": 0.2456908344733242, - "grad_norm": 1.6400782247421544, + "grad_norm": 1.6402156211538788, "learning_rate": 8.583071269222993e-06, - "loss": 0.2535, + "loss": 0.2539, "step": 1796 }, { "epoch": 0.24582763337893296, - "grad_norm": 1.409579071596375, + "grad_norm": 1.3954937726151146, "learning_rate": 8.581572192990636e-06, - "loss": 0.1821, + "loss": 0.1845, "step": 1797 }, { "epoch": 0.24596443228454173, - "grad_norm": 1.5119593675667653, + "grad_norm": 1.505768491627887, "learning_rate": 8.580072455244801e-06, - "loss": 0.2365, + "loss": 0.2398, "step": 1798 }, { "epoch": 0.24610123119015048, - "grad_norm": 1.4997078128657872, + "grad_norm": 1.5008992755535275, "learning_rate": 8.578572056262496e-06, - "loss": 0.2522, + "loss": 0.2497, "step": 1799 }, { "epoch": 0.24623803009575923, - "grad_norm": 1.5894780450709032, + "grad_norm": 1.5935449602828162, "learning_rate": 8.57707099632084e-06, - "loss": 0.2152, + "loss": 0.2162, "step": 1800 }, { "epoch": 0.24623803009575923, - "eval_loss": 0.21002759039402008, - "eval_runtime": 5.9106, - "eval_samples_per_second": 5.076, + "eval_loss": 0.20931176841259003, + "eval_runtime": 5.909, + "eval_samples_per_second": 5.077, "eval_steps_per_second": 1.354, "step": 1800 }, { "epoch": 0.24637482900136798, - "grad_norm": 1.4285962434340302, + "grad_norm": 1.4158135680452972, "learning_rate": 8.575569275697076e-06, - "loss": 0.2164, + "loss": 0.2116, "step": 1801 }, { "epoch": 0.24651162790697675, - "grad_norm": 1.5048313752913354, + "grad_norm": 1.5258184493588063, "learning_rate": 8.574066894668573e-06, - "loss": 0.2029, + "loss": 0.2045, "step": 1802 }, { "epoch": 0.2466484268125855, - "grad_norm": 1.5254982293422568, + "grad_norm": 1.4679167401805782, "learning_rate": 8.572563853512819e-06, - "loss": 0.2151, + "loss": 0.2133, "step": 1803 }, { "epoch": 0.24678522571819425, - "grad_norm": 1.5933074502219908, + "grad_norm": 1.5877035045105476, "learning_rate": 8.571060152507424e-06, - "loss": 0.2082, + "loss": 0.2069, "step": 1804 }, { "epoch": 0.246922024623803, - "grad_norm": 1.6432573013560132, + "grad_norm": 1.6342496873547592, "learning_rate": 8.56955579193012e-06, - "loss": 0.2165, + "loss": 0.2169, "step": 1805 }, { "epoch": 0.24705882352941178, - "grad_norm": 1.233169701581751, + "grad_norm": 1.2364648590097642, "learning_rate": 8.568050772058763e-06, - "loss": 0.2051, + "loss": 0.2062, "step": 1806 }, { "epoch": 0.24719562243502052, - "grad_norm": 1.2698985846547695, + "grad_norm": 1.2584155559422576, "learning_rate": 8.566545093171325e-06, - "loss": 0.1957, + "loss": 0.1964, "step": 1807 }, { "epoch": 0.24733242134062927, - "grad_norm": 1.506392809667106, + "grad_norm": 1.4932324762617637, "learning_rate": 8.565038755545909e-06, - "loss": 0.2237, + "loss": 0.2242, "step": 1808 }, { "epoch": 0.24746922024623802, - "grad_norm": 1.1218457462908362, + "grad_norm": 1.10397875922097, "learning_rate": 8.563531759460733e-06, - "loss": 0.1772, + "loss": 0.1762, "step": 1809 }, { "epoch": 0.2476060191518468, - "grad_norm": 1.5098328564056025, + "grad_norm": 1.440586341494243, "learning_rate": 8.562024105194134e-06, - "loss": 0.2058, + "loss": 0.206, "step": 1810 }, { "epoch": 0.24774281805745554, - "grad_norm": 1.4268426552876958, + "grad_norm": 1.5764516714214374, "learning_rate": 8.560515793024578e-06, "loss": 0.2078, "step": 1811 }, { "epoch": 0.2478796169630643, - "grad_norm": 1.3163265355671645, + "grad_norm": 1.3077929229045429, "learning_rate": 8.559006823230647e-06, - "loss": 0.2294, + "loss": 0.2314, "step": 1812 }, { "epoch": 0.24801641586867304, - "grad_norm": 1.4633059377240192, + "grad_norm": 1.4190118518371453, "learning_rate": 8.557497196091049e-06, - "loss": 0.2414, + "loss": 0.242, "step": 1813 }, { "epoch": 0.24815321477428182, - "grad_norm": 1.2879664684788974, + "grad_norm": 1.2823292711856338, "learning_rate": 8.55598691188461e-06, - "loss": 0.1855, + "loss": 0.1874, "step": 1814 }, { "epoch": 0.24829001367989056, - "grad_norm": 1.3646751695399282, + "grad_norm": 1.3322528723430938, "learning_rate": 8.554475970890276e-06, - "loss": 0.2156, + "loss": 0.2161, "step": 1815 }, { "epoch": 0.2484268125854993, - "grad_norm": 1.1422177976108376, + "grad_norm": 1.139786789785628, "learning_rate": 8.55296437338712e-06, - "loss": 0.168, + "loss": 0.17, "step": 1816 }, { "epoch": 0.24856361149110806, - "grad_norm": 1.431185889345538, + "grad_norm": 1.3957845199178487, "learning_rate": 8.551452119654332e-06, - "loss": 0.2272, + "loss": 0.2248, "step": 1817 }, { "epoch": 0.24870041039671684, - "grad_norm": 1.3586534251992346, + "grad_norm": 1.342769432525293, "learning_rate": 8.549939209971222e-06, - "loss": 0.1666, + "loss": 0.1647, "step": 1818 }, { "epoch": 0.24883720930232558, - "grad_norm": 1.3224719315481734, + "grad_norm": 1.360835180882336, "learning_rate": 8.548425644617224e-06, - "loss": 0.2047, + "loss": 0.2084, "step": 1819 }, { "epoch": 0.24897400820793433, - "grad_norm": 1.369405708336576, + "grad_norm": 1.3494392640405224, "learning_rate": 8.546911423871895e-06, - "loss": 0.2202, + "loss": 0.2176, "step": 1820 }, { "epoch": 0.24911080711354308, - "grad_norm": 1.5935702017251479, + "grad_norm": 1.5447887071893307, "learning_rate": 8.545396548014906e-06, - "loss": 0.2259, + "loss": 0.2247, "step": 1821 }, { "epoch": 0.24924760601915186, - "grad_norm": 1.3274904611702036, + "grad_norm": 1.3173280206793396, "learning_rate": 8.543881017326057e-06, - "loss": 0.1979, + "loss": 0.1991, "step": 1822 }, { "epoch": 0.2493844049247606, - "grad_norm": 1.6318677043361225, + "grad_norm": 1.3950957362951972, "learning_rate": 8.542364832085263e-06, - "loss": 0.1945, + "loss": 0.1932, "step": 1823 }, { "epoch": 0.24952120383036935, - "grad_norm": 1.3066606031520356, + "grad_norm": 1.204672853549283, "learning_rate": 8.540847992572562e-06, - "loss": 0.1977, + "loss": 0.1944, "step": 1824 }, { "epoch": 0.2496580027359781, - "grad_norm": 1.2702726315129207, + "grad_norm": 1.2342666278449699, "learning_rate": 8.539330499068116e-06, - "loss": 0.1923, + "loss": 0.1903, "step": 1825 }, { "epoch": 0.24979480164158688, - "grad_norm": 1.5842951985971623, + "grad_norm": 1.56454031865287, "learning_rate": 8.537812351852202e-06, - "loss": 0.2329, + "loss": 0.2338, "step": 1826 }, { "epoch": 0.24993160054719563, - "grad_norm": 1.415395120924099, + "grad_norm": 1.3728635840785757, "learning_rate": 8.53629355120522e-06, - "loss": 0.2092, + "loss": 0.2049, "step": 1827 }, { "epoch": 0.2500683994528044, - "grad_norm": 1.4704129633667882, + "grad_norm": 1.4590253740396302, "learning_rate": 8.534774097407693e-06, - "loss": 0.1763, + "loss": 0.1774, "step": 1828 }, { "epoch": 0.25020519835841315, - "grad_norm": 1.3030086134322034, + "grad_norm": 1.2329306070107522, "learning_rate": 8.533253990740264e-06, - "loss": 0.2023, + "loss": 0.2019, "step": 1829 }, { "epoch": 0.2503419972640219, - "grad_norm": 1.3873171821826833, + "grad_norm": 1.383248014761093, "learning_rate": 8.531733231483695e-06, - "loss": 0.2075, + "loss": 0.2072, "step": 1830 }, { "epoch": 0.25047879616963065, - "grad_norm": 1.5525594557072573, + "grad_norm": 1.5223621076386695, "learning_rate": 8.530211819918865e-06, - "loss": 0.263, + "loss": 0.2611, "step": 1831 }, { "epoch": 0.2506155950752394, - "grad_norm": 1.3032104162590794, + "grad_norm": 1.281284549719799, "learning_rate": 8.528689756326783e-06, - "loss": 0.1934, + "loss": 0.1929, "step": 1832 }, { "epoch": 0.25075239398084814, - "grad_norm": 1.3768489263613557, + "grad_norm": 1.4066853845649918, "learning_rate": 8.527167040988566e-06, - "loss": 0.2538, + "loss": 0.2567, "step": 1833 }, { "epoch": 0.2508891928864569, - "grad_norm": 1.1325685129052048, + "grad_norm": 1.1282921747163308, "learning_rate": 8.525643674185467e-06, - "loss": 0.1968, + "loss": 0.1944, "step": 1834 }, { "epoch": 0.25102599179206564, - "grad_norm": 1.8890675480943975, + "grad_norm": 1.8638029463974313, "learning_rate": 8.524119656198846e-06, - "loss": 0.2763, + "loss": 0.2757, "step": 1835 }, { "epoch": 0.25116279069767444, - "grad_norm": 1.2187851972036896, + "grad_norm": 1.2112066204012428, "learning_rate": 8.522594987310184e-06, - "loss": 0.1866, + "loss": 0.1885, "step": 1836 }, { "epoch": 0.2512995896032832, - "grad_norm": 1.3748101636140497, + "grad_norm": 1.3921457112992124, "learning_rate": 8.521069667801095e-06, - "loss": 0.2207, + "loss": 0.2211, "step": 1837 }, { "epoch": 0.25143638850889194, - "grad_norm": 1.1620537570700846, + "grad_norm": 1.1887295235670017, "learning_rate": 8.519543697953297e-06, - "loss": 0.1682, + "loss": 0.1731, "step": 1838 }, { "epoch": 0.2515731874145007, - "grad_norm": 1.5647461562056721, + "grad_norm": 1.5475842868857517, "learning_rate": 8.518017078048638e-06, - "loss": 0.2497, + "loss": 0.2519, "step": 1839 }, { "epoch": 0.25170998632010944, - "grad_norm": 1.3529410247055909, + "grad_norm": 1.3503888501308157, "learning_rate": 8.516489808369084e-06, - "loss": 0.1991, + "loss": 0.1978, "step": 1840 }, { "epoch": 0.2518467852257182, - "grad_norm": 1.345572120597622, + "grad_norm": 1.3210044642695138, "learning_rate": 8.514961889196718e-06, - "loss": 0.2196, + "loss": 0.2173, "step": 1841 }, { "epoch": 0.25198358413132693, - "grad_norm": 1.60382778197095, + "grad_norm": 1.5949272955491989, "learning_rate": 8.513433320813752e-06, - "loss": 0.2088, + "loss": 0.2079, "step": 1842 }, { "epoch": 0.2521203830369357, - "grad_norm": 1.5276522749303791, + "grad_norm": 1.5002201505433792, "learning_rate": 8.511904103502501e-06, - "loss": 0.2223, + "loss": 0.2189, "step": 1843 }, { "epoch": 0.2522571819425445, - "grad_norm": 1.0707984312607057, + "grad_norm": 1.0543507142817512, "learning_rate": 8.510374237545418e-06, - "loss": 0.151, + "loss": 0.1508, "step": 1844 }, { "epoch": 0.25239398084815323, - "grad_norm": 1.2733131187925752, + "grad_norm": 1.2485752365646035, "learning_rate": 8.508843723225067e-06, - "loss": 0.2047, + "loss": 0.2021, "step": 1845 }, { "epoch": 0.252530779753762, - "grad_norm": 1.256526091440415, + "grad_norm": 1.1932283211627734, "learning_rate": 8.50731256082413e-06, "loss": 0.208, "step": 1846 }, { "epoch": 0.25266757865937073, - "grad_norm": 1.1358330808274717, + "grad_norm": 1.1481601190810822, "learning_rate": 8.505780750625413e-06, - "loss": 0.1814, + "loss": 0.1815, "step": 1847 }, { "epoch": 0.2528043775649795, - "grad_norm": 1.5098168191550105, + "grad_norm": 1.4769730773480116, "learning_rate": 8.50424829291184e-06, - "loss": 0.2261, + "loss": 0.2229, "step": 1848 }, { "epoch": 0.2529411764705882, - "grad_norm": 1.467281232968536, + "grad_norm": 1.4622374608015294, "learning_rate": 8.502715187966455e-06, - "loss": 0.196, + "loss": 0.1951, "step": 1849 }, { "epoch": 0.253077975376197, - "grad_norm": 1.4336772652858967, + "grad_norm": 1.4495238962732684, "learning_rate": 8.501181436072421e-06, - "loss": 0.2329, + "loss": 0.2358, "step": 1850 }, { "epoch": 0.2532147742818057, - "grad_norm": 1.251606777412045, + "grad_norm": 1.2378532930444035, "learning_rate": 8.499647037513022e-06, - "loss": 0.2137, + "loss": 0.215, "step": 1851 }, { "epoch": 0.2533515731874145, - "grad_norm": 1.2971517345631833, + "grad_norm": 1.2635847747613647, "learning_rate": 8.498111992571657e-06, - "loss": 0.1685, + "loss": 0.1664, "step": 1852 }, { "epoch": 0.2534883720930233, - "grad_norm": 1.4432421101321578, + "grad_norm": 1.4403664151566793, "learning_rate": 8.49657630153185e-06, - "loss": 0.1896, + "loss": 0.1914, "step": 1853 }, { "epoch": 0.253625170998632, - "grad_norm": 1.297367800340404, + "grad_norm": 1.300937220710698, "learning_rate": 8.495039964677242e-06, - "loss": 0.1861, + "loss": 0.1863, "step": 1854 }, { "epoch": 0.25376196990424077, - "grad_norm": 1.2508239071112792, + "grad_norm": 1.2314765852260012, "learning_rate": 8.493502982291593e-06, - "loss": 0.1952, + "loss": 0.1918, "step": 1855 }, { "epoch": 0.2538987688098495, - "grad_norm": 1.3410073134524543, + "grad_norm": 1.3133003905623828, "learning_rate": 8.49196535465878e-06, - "loss": 0.1865, + "loss": 0.187, "step": 1856 }, { "epoch": 0.25403556771545827, - "grad_norm": 1.2541158484987163, + "grad_norm": 1.228904461872831, "learning_rate": 8.490427082062805e-06, - "loss": 0.199, + "loss": 0.1996, "step": 1857 }, { "epoch": 0.254172366621067, - "grad_norm": 1.4994801140720135, + "grad_norm": 1.4660228278088574, "learning_rate": 8.488888164787783e-06, - "loss": 0.2108, + "loss": 0.2064, "step": 1858 }, { "epoch": 0.25430916552667576, - "grad_norm": 1.0812479120866596, + "grad_norm": 1.037390599552056, "learning_rate": 8.487348603117954e-06, - "loss": 0.198, + "loss": 0.1971, "step": 1859 }, { "epoch": 0.25444596443228457, - "grad_norm": 1.3100059054541047, + "grad_norm": 1.288796520868918, "learning_rate": 8.48580839733767e-06, - "loss": 0.1814, + "loss": 0.1836, "step": 1860 }, { "epoch": 0.2545827633378933, - "grad_norm": 1.5397568915510444, + "grad_norm": 1.5222591288092209, "learning_rate": 8.484267547731407e-06, "loss": 0.2495, "step": 1861 }, { "epoch": 0.25471956224350206, - "grad_norm": 1.542551996132586, + "grad_norm": 1.5148796657076995, "learning_rate": 8.48272605458376e-06, - "loss": 0.2068, + "loss": 0.2032, "step": 1862 }, { "epoch": 0.2548563611491108, - "grad_norm": 1.5369406046032046, + "grad_norm": 1.5390765655005667, "learning_rate": 8.48118391817944e-06, - "loss": 0.2383, + "loss": 0.2389, "step": 1863 }, { "epoch": 0.25499316005471956, - "grad_norm": 1.4551450488949809, + "grad_norm": 1.43659798343383, "learning_rate": 8.47964113880328e-06, - "loss": 0.2186, + "loss": 0.2176, "step": 1864 }, { "epoch": 0.2551299589603283, - "grad_norm": 1.443573325824817, + "grad_norm": 1.4383615121485607, "learning_rate": 8.47809771674023e-06, - "loss": 0.2112, + "loss": 0.2084, "step": 1865 }, { "epoch": 0.25526675786593706, - "grad_norm": 1.407555970606144, + "grad_norm": 1.4144402263252582, "learning_rate": 8.476553652275357e-06, - "loss": 0.243, + "loss": 0.2415, "step": 1866 }, { "epoch": 0.2554035567715458, - "grad_norm": 1.432847312483504, + "grad_norm": 1.4379221570309701, "learning_rate": 8.475008945693849e-06, - "loss": 0.1754, + "loss": 0.1734, "step": 1867 }, { "epoch": 0.2555403556771546, - "grad_norm": 1.3119876096974687, + "grad_norm": 1.303234021661508, "learning_rate": 8.47346359728101e-06, - "loss": 0.2161, + "loss": 0.2152, "step": 1868 }, { "epoch": 0.25567715458276336, - "grad_norm": 1.2707551736710696, + "grad_norm": 1.2847213640233233, "learning_rate": 8.471917607322272e-06, - "loss": 0.1854, + "loss": 0.1896, "step": 1869 }, { "epoch": 0.2558139534883721, - "grad_norm": 1.4189801342024388, + "grad_norm": 1.419265276266336, "learning_rate": 8.470370976103171e-06, - "loss": 0.2293, + "loss": 0.231, "step": 1870 }, { "epoch": 0.25595075239398085, - "grad_norm": 1.228076367810574, + "grad_norm": 1.2345652557975206, "learning_rate": 8.46882370390937e-06, - "loss": 0.1882, + "loss": 0.1897, "step": 1871 }, { "epoch": 0.2560875512995896, - "grad_norm": 1.4526757886990522, + "grad_norm": 1.4441102013795313, "learning_rate": 8.46727579102665e-06, - "loss": 0.2092, + "loss": 0.2098, "step": 1872 }, { "epoch": 0.25622435020519835, - "grad_norm": 1.1018977801767402, + "grad_norm": 1.1030801564902877, "learning_rate": 8.465727237740908e-06, - "loss": 0.2067, + "loss": 0.207, "step": 1873 }, { "epoch": 0.2563611491108071, - "grad_norm": 1.5480027367122993, + "grad_norm": 1.5448712478990605, "learning_rate": 8.464178044338162e-06, - "loss": 0.2492, + "loss": 0.2478, "step": 1874 }, { "epoch": 0.25649794801641584, - "grad_norm": 1.5591575291326776, + "grad_norm": 1.5345998858715804, "learning_rate": 8.462628211104547e-06, - "loss": 0.2538, + "loss": 0.2533, "step": 1875 }, { "epoch": 0.25663474692202465, - "grad_norm": 1.0394496085220424, + "grad_norm": 1.0444131899159141, "learning_rate": 8.461077738326312e-06, - "loss": 0.1449, + "loss": 0.1452, "step": 1876 }, { "epoch": 0.2567715458276334, - "grad_norm": 1.5307650703951408, + "grad_norm": 1.5288894944663984, "learning_rate": 8.459526626289836e-06, - "loss": 0.192, + "loss": 0.189, "step": 1877 }, { "epoch": 0.25690834473324214, - "grad_norm": 1.4223426429871122, + "grad_norm": 1.4242287473987767, "learning_rate": 8.457974875281601e-06, - "loss": 0.2033, + "loss": 0.2018, "step": 1878 }, { "epoch": 0.2570451436388509, - "grad_norm": 1.450057491502794, + "grad_norm": 1.5056845221126756, "learning_rate": 8.456422485588215e-06, - "loss": 0.2029, + "loss": 0.2053, "step": 1879 }, { "epoch": 0.25718194254445964, - "grad_norm": 1.3126960966901107, + "grad_norm": 1.3143506356090635, "learning_rate": 8.454869457496406e-06, - "loss": 0.2143, + "loss": 0.2134, "step": 1880 }, { "epoch": 0.2573187414500684, - "grad_norm": 1.2274480870800613, + "grad_norm": 1.2040243301655318, "learning_rate": 8.453315791293016e-06, - "loss": 0.185, + "loss": 0.1817, "step": 1881 }, { "epoch": 0.25745554035567714, - "grad_norm": 1.3791931467283844, + "grad_norm": 1.3608234192595894, "learning_rate": 8.451761487265003e-06, - "loss": 0.2006, + "loss": 0.2001, "step": 1882 }, { "epoch": 0.2575923392612859, - "grad_norm": 1.5785765268054621, + "grad_norm": 1.5257389872626843, "learning_rate": 8.45020654569945e-06, - "loss": 0.2121, + "loss": 0.2058, "step": 1883 }, { "epoch": 0.2577291381668947, - "grad_norm": 1.424664450402349, + "grad_norm": 1.398763265396118, "learning_rate": 8.448650966883551e-06, - "loss": 0.2174, + "loss": 0.2146, "step": 1884 }, { "epoch": 0.25786593707250344, - "grad_norm": 1.3038292802679963, + "grad_norm": 1.3023106205428272, "learning_rate": 8.44709475110462e-06, - "loss": 0.1927, + "loss": 0.1947, "step": 1885 }, { "epoch": 0.2580027359781122, - "grad_norm": 1.061954560162234, + "grad_norm": 1.0700759154253325, "learning_rate": 8.445537898650092e-06, - "loss": 0.1569, + "loss": 0.1579, "step": 1886 }, { "epoch": 0.25813953488372093, - "grad_norm": 1.3165382133407109, + "grad_norm": 1.3004484985461608, "learning_rate": 8.443980409807512e-06, - "loss": 0.2034, + "loss": 0.2042, "step": 1887 }, { "epoch": 0.2582763337893297, - "grad_norm": 1.088937496039494, + "grad_norm": 1.0760230621652287, "learning_rate": 8.442422284864549e-06, - "loss": 0.1961, + "loss": 0.1932, "step": 1888 }, { "epoch": 0.25841313269493843, - "grad_norm": 1.2581448501139572, + "grad_norm": 1.215078529743944, "learning_rate": 8.440863524108986e-06, - "loss": 0.2273, + "loss": 0.2233, "step": 1889 }, { "epoch": 0.2585499316005472, - "grad_norm": 1.497864257810404, + "grad_norm": 1.4733640683578424, "learning_rate": 8.439304127828729e-06, - "loss": 0.2542, + "loss": 0.2544, "step": 1890 }, { "epoch": 0.2586867305061559, - "grad_norm": 1.3360067312981214, + "grad_norm": 1.30830246930746, "learning_rate": 8.437744096311792e-06, - "loss": 0.1846, + "loss": 0.1857, "step": 1891 }, { "epoch": 0.25882352941176473, - "grad_norm": 1.5585047759723996, + "grad_norm": 1.5339743738796268, "learning_rate": 8.436183429846314e-06, - "loss": 0.2193, + "loss": 0.2168, "step": 1892 }, { "epoch": 0.2589603283173735, - "grad_norm": 1.5543934527011536, + "grad_norm": 1.530127488629341, "learning_rate": 8.434622128720548e-06, - "loss": 0.1929, + "loss": 0.1908, "step": 1893 }, { "epoch": 0.2590971272229822, - "grad_norm": 1.3542768028199679, + "grad_norm": 1.3547180065864721, "learning_rate": 8.433060193222868e-06, - "loss": 0.2219, + "loss": 0.2213, "step": 1894 }, { "epoch": 0.259233926128591, - "grad_norm": 1.2786358949370074, + "grad_norm": 1.2507717301130332, "learning_rate": 8.43149762364176e-06, - "loss": 0.2146, + "loss": 0.2147, "step": 1895 }, { "epoch": 0.2593707250341997, - "grad_norm": 1.7501730443702515, + "grad_norm": 1.7089388076710634, "learning_rate": 8.429934420265827e-06, - "loss": 0.2926, + "loss": 0.2876, "step": 1896 }, { "epoch": 0.25950752393980847, - "grad_norm": 1.3002977894446954, + "grad_norm": 1.302159771535511, "learning_rate": 8.428370583383795e-06, - "loss": 0.2489, + "loss": 0.249, "step": 1897 }, { "epoch": 0.2596443228454172, - "grad_norm": 1.209953025008365, + "grad_norm": 1.2169239064235216, "learning_rate": 8.426806113284502e-06, - "loss": 0.2157, + "loss": 0.2184, "step": 1898 }, { "epoch": 0.25978112175102597, - "grad_norm": 1.4061955649405948, + "grad_norm": 1.408455484976517, "learning_rate": 8.425241010256904e-06, - "loss": 0.2147, + "loss": 0.215, "step": 1899 }, { "epoch": 0.25991792065663477, - "grad_norm": 1.263294191522504, + "grad_norm": 1.2586616472991496, "learning_rate": 8.423675274590074e-06, - "loss": 0.2183, + "loss": 0.2157, "step": 1900 }, { "epoch": 0.25991792065663477, - "eval_loss": 0.20197449624538422, - "eval_runtime": 5.9407, - "eval_samples_per_second": 5.05, - "eval_steps_per_second": 1.347, + "eval_loss": 0.20215022563934326, + "eval_runtime": 5.9199, + "eval_samples_per_second": 5.068, + "eval_steps_per_second": 1.351, "step": 1900 }, { "epoch": 0.2600547195622435, - "grad_norm": 1.359832270596624, + "grad_norm": 1.341194683088519, "learning_rate": 8.422108906573203e-06, - "loss": 0.1656, + "loss": 0.1643, "step": 1901 }, { "epoch": 0.26019151846785227, - "grad_norm": 1.5471620760103113, + "grad_norm": 1.5590256897797385, "learning_rate": 8.4205419064956e-06, - "loss": 0.212, + "loss": 0.2116, "step": 1902 }, { "epoch": 0.260328317373461, - "grad_norm": 1.774127438414793, + "grad_norm": 1.745362175590622, "learning_rate": 8.418974274646683e-06, - "loss": 0.2513, + "loss": 0.2524, "step": 1903 }, { "epoch": 0.26046511627906976, - "grad_norm": 1.2396962342141749, + "grad_norm": 1.2339239347350468, "learning_rate": 8.417406011316e-06, - "loss": 0.2073, + "loss": 0.2059, "step": 1904 }, { "epoch": 0.2606019151846785, - "grad_norm": 1.4863009829527276, + "grad_norm": 1.4782103805444544, "learning_rate": 8.4158371167932e-06, - "loss": 0.2082, + "loss": 0.2083, "step": 1905 }, { "epoch": 0.26073871409028726, - "grad_norm": 1.041179143056358, + "grad_norm": 1.0289408299658807, "learning_rate": 8.41426759136806e-06, - "loss": 0.1711, + "loss": 0.1723, "step": 1906 }, { "epoch": 0.260875512995896, - "grad_norm": 1.175038133282494, + "grad_norm": 1.1857986268249519, "learning_rate": 8.41269743533047e-06, - "loss": 0.1977, + "loss": 0.1972, "step": 1907 }, { "epoch": 0.2610123119015048, - "grad_norm": 1.2435247071147746, + "grad_norm": 1.214914618669923, "learning_rate": 8.411126648970438e-06, - "loss": 0.1714, + "loss": 0.1689, "step": 1908 }, { "epoch": 0.26114911080711356, - "grad_norm": 1.2767489634173281, + "grad_norm": 1.239692974604197, "learning_rate": 8.409555232578087e-06, - "loss": 0.1843, + "loss": 0.1836, "step": 1909 }, { "epoch": 0.2612859097127223, - "grad_norm": 1.3714349338580742, + "grad_norm": 1.3362899891802495, "learning_rate": 8.407983186443653e-06, - "loss": 0.1859, + "loss": 0.1855, "step": 1910 }, { "epoch": 0.26142270861833106, - "grad_norm": 1.3180648457716013, + "grad_norm": 1.3117685249528652, "learning_rate": 8.406410510857494e-06, - "loss": 0.2005, + "loss": 0.2009, "step": 1911 }, { "epoch": 0.2615595075239398, - "grad_norm": 1.55257387004057, + "grad_norm": 1.5716347903972134, "learning_rate": 8.404837206110083e-06, - "loss": 0.2309, + "loss": 0.2333, "step": 1912 }, { "epoch": 0.26169630642954855, - "grad_norm": 1.4861404387839756, + "grad_norm": 1.480553630223553, "learning_rate": 8.403263272492005e-06, - "loss": 0.215, + "loss": 0.2144, "step": 1913 }, { "epoch": 0.2618331053351573, - "grad_norm": 1.4799633733412842, + "grad_norm": 1.4853261439204788, "learning_rate": 8.401688710293967e-06, - "loss": 0.1998, + "loss": 0.198, "step": 1914 }, { "epoch": 0.26196990424076605, - "grad_norm": 1.1881628456629507, + "grad_norm": 1.1631218979472924, "learning_rate": 8.40011351980679e-06, - "loss": 0.1802, + "loss": 0.1801, "step": 1915 }, { "epoch": 0.26210670314637485, - "grad_norm": 1.3508768834135445, + "grad_norm": 1.277300810918165, "learning_rate": 8.398537701321405e-06, - "loss": 0.226, + "loss": 0.2234, "step": 1916 }, { "epoch": 0.2622435020519836, - "grad_norm": 1.6737568490441308, + "grad_norm": 1.6336257912229284, "learning_rate": 8.396961255128871e-06, - "loss": 0.2293, + "loss": 0.2256, "step": 1917 }, { "epoch": 0.26238030095759235, - "grad_norm": 1.3783742828225376, + "grad_norm": 1.3846124212570214, "learning_rate": 8.395384181520352e-06, - "loss": 0.1752, + "loss": 0.1767, "step": 1918 }, { "epoch": 0.2625170998632011, - "grad_norm": 1.511295224353854, + "grad_norm": 1.4904423868330512, "learning_rate": 8.393806480787135e-06, - "loss": 0.2091, + "loss": 0.2068, "step": 1919 }, { "epoch": 0.26265389876880985, - "grad_norm": 1.508865927864752, + "grad_norm": 1.4730446544821971, "learning_rate": 8.392228153220617e-06, - "loss": 0.2414, + "loss": 0.2402, "step": 1920 }, { "epoch": 0.2627906976744186, - "grad_norm": 1.1558909237123667, + "grad_norm": 1.13717925343678, "learning_rate": 8.390649199112316e-06, - "loss": 0.1927, + "loss": 0.1937, "step": 1921 }, { "epoch": 0.26292749658002734, - "grad_norm": 1.2595262485204233, + "grad_norm": 1.203804248224599, "learning_rate": 8.389069618753864e-06, - "loss": 0.1911, + "loss": 0.1897, "step": 1922 }, { "epoch": 0.2630642954856361, - "grad_norm": 1.6681333326180672, + "grad_norm": 1.6372803538202811, "learning_rate": 8.387489412437007e-06, - "loss": 0.2395, + "loss": 0.2365, "step": 1923 }, { "epoch": 0.2632010943912449, - "grad_norm": 1.4101918393537, + "grad_norm": 1.3721248119051088, "learning_rate": 8.385908580453607e-06, "loss": 0.2042, "step": 1924 }, { "epoch": 0.26333789329685364, - "grad_norm": 1.460568769917279, + "grad_norm": 1.4454298072552396, "learning_rate": 8.384327123095645e-06, - "loss": 0.2378, + "loss": 0.2369, "step": 1925 }, { "epoch": 0.2634746922024624, - "grad_norm": 1.302400352465757, + "grad_norm": 1.2839092303229096, "learning_rate": 8.382745040655213e-06, - "loss": 0.1984, + "loss": 0.1993, "step": 1926 }, { "epoch": 0.26361149110807114, - "grad_norm": 1.0948434236167635, + "grad_norm": 1.1102371738668477, "learning_rate": 8.38116233342452e-06, - "loss": 0.1759, + "loss": 0.1805, "step": 1927 }, { "epoch": 0.2637482900136799, - "grad_norm": 1.1571398361701852, + "grad_norm": 1.1335667967416088, "learning_rate": 8.379579001695892e-06, - "loss": 0.1802, + "loss": 0.1789, "step": 1928 }, { "epoch": 0.26388508891928864, - "grad_norm": 1.5187749436613904, + "grad_norm": 1.5299605456992773, "learning_rate": 8.377995045761768e-06, - "loss": 0.213, + "loss": 0.2151, "step": 1929 }, { "epoch": 0.2640218878248974, - "grad_norm": 1.2548988183912846, + "grad_norm": 1.2422392114331648, "learning_rate": 8.376410465914705e-06, - "loss": 0.2046, + "loss": 0.2065, "step": 1930 }, { "epoch": 0.26415868673050613, - "grad_norm": 1.2387009374947144, + "grad_norm": 1.2214852162990315, "learning_rate": 8.374825262447372e-06, - "loss": 0.1806, + "loss": 0.1812, "step": 1931 }, { "epoch": 0.26429548563611494, - "grad_norm": 1.4434460420857056, + "grad_norm": 1.4485088625746378, "learning_rate": 8.373239435652555e-06, - "loss": 0.1896, + "loss": 0.1882, "step": 1932 }, { "epoch": 0.2644322845417237, - "grad_norm": 1.233857885259466, + "grad_norm": 1.2259606945582109, "learning_rate": 8.371652985823155e-06, - "loss": 0.1791, + "loss": 0.1811, "step": 1933 }, { "epoch": 0.26456908344733243, - "grad_norm": 1.6035676988618244, + "grad_norm": 1.5528126214814808, "learning_rate": 8.37006591325219e-06, - "loss": 0.2309, + "loss": 0.2302, "step": 1934 }, { "epoch": 0.2647058823529412, - "grad_norm": 1.6257691213333827, + "grad_norm": 1.5803299823508645, "learning_rate": 8.368478218232787e-06, - "loss": 0.2217, + "loss": 0.2224, "step": 1935 }, { "epoch": 0.26484268125854993, - "grad_norm": 1.4866481033441985, + "grad_norm": 1.4292643345269063, "learning_rate": 8.366889901058195e-06, - "loss": 0.2109, + "loss": 0.208, "step": 1936 }, { "epoch": 0.2649794801641587, - "grad_norm": 1.4249603150571142, + "grad_norm": 1.3887517285535877, "learning_rate": 8.365300962021774e-06, - "loss": 0.2393, + "loss": 0.2388, "step": 1937 }, { "epoch": 0.2651162790697674, - "grad_norm": 1.0158136723627513, + "grad_norm": 1.0143505977155727, "learning_rate": 8.363711401417e-06, - "loss": 0.1619, + "loss": 0.1622, "step": 1938 }, { "epoch": 0.2652530779753762, - "grad_norm": 1.273579241139966, + "grad_norm": 1.2699231717429877, "learning_rate": 8.362121219537465e-06, - "loss": 0.2448, + "loss": 0.2445, "step": 1939 }, { "epoch": 0.265389876880985, - "grad_norm": 1.308476185421833, + "grad_norm": 1.2920883112210804, "learning_rate": 8.36053041667687e-06, - "loss": 0.1916, + "loss": 0.1945, "step": 1940 }, { "epoch": 0.2655266757865937, - "grad_norm": 1.5282739882302112, + "grad_norm": 1.5065134935443945, "learning_rate": 8.35893899312904e-06, - "loss": 0.2474, + "loss": 0.2469, "step": 1941 }, { "epoch": 0.2656634746922025, - "grad_norm": 1.4096719314679371, + "grad_norm": 1.3903822248501208, "learning_rate": 8.357346949187905e-06, - "loss": 0.206, + "loss": 0.2075, "step": 1942 }, { "epoch": 0.2658002735978112, - "grad_norm": 1.3169729344608938, + "grad_norm": 1.3069913966884474, "learning_rate": 8.355754285147519e-06, - "loss": 0.217, + "loss": 0.2184, "step": 1943 }, { "epoch": 0.26593707250341997, - "grad_norm": 1.120812681583395, + "grad_norm": 1.1491341915070385, "learning_rate": 8.354161001302043e-06, - "loss": 0.1739, + "loss": 0.1755, "step": 1944 }, { "epoch": 0.2660738714090287, - "grad_norm": 1.574530359229433, + "grad_norm": 1.5240930563063713, "learning_rate": 8.352567097945754e-06, - "loss": 0.1867, + "loss": 0.1832, "step": 1945 }, { "epoch": 0.26621067031463747, - "grad_norm": 1.1681086846693225, + "grad_norm": 1.1517638177702214, "learning_rate": 8.350972575373047e-06, - "loss": 0.1859, + "loss": 0.1874, "step": 1946 }, { "epoch": 0.2663474692202462, - "grad_norm": 1.4019665611840817, + "grad_norm": 1.353673091514228, "learning_rate": 8.34937743387843e-06, - "loss": 0.2076, + "loss": 0.206, "step": 1947 }, { "epoch": 0.266484268125855, - "grad_norm": 1.3362727920344468, + "grad_norm": 1.3156077577512209, "learning_rate": 8.347781673756518e-06, - "loss": 0.2014, + "loss": 0.2006, "step": 1948 }, { "epoch": 0.26662106703146377, - "grad_norm": 1.3183405402511286, + "grad_norm": 1.298566693556152, "learning_rate": 8.346185295302056e-06, - "loss": 0.2018, + "loss": 0.201, "step": 1949 }, { "epoch": 0.2667578659370725, - "grad_norm": 1.3391261385233224, + "grad_norm": 1.3151756336921068, "learning_rate": 8.344588298809887e-06, - "loss": 0.1975, + "loss": 0.1957, "step": 1950 }, { "epoch": 0.26689466484268126, - "grad_norm": 1.3403666560142515, + "grad_norm": 1.3056525109243742, "learning_rate": 8.342990684574976e-06, - "loss": 0.2193, + "loss": 0.2135, "step": 1951 }, { "epoch": 0.26703146374829, - "grad_norm": 1.534160174615575, + "grad_norm": 1.515300709747883, "learning_rate": 8.341392452892404e-06, - "loss": 0.2484, + "loss": 0.2494, "step": 1952 }, { "epoch": 0.26716826265389876, - "grad_norm": 1.4098959902428798, + "grad_norm": 1.3834963534637674, "learning_rate": 8.33979360405736e-06, - "loss": 0.2134, + "loss": 0.2129, "step": 1953 }, { "epoch": 0.2673050615595075, - "grad_norm": 1.4092779595182185, + "grad_norm": 1.3791190540196894, "learning_rate": 8.338194138365152e-06, - "loss": 0.2516, + "loss": 0.2521, "step": 1954 }, { "epoch": 0.26744186046511625, - "grad_norm": 1.479925817822205, + "grad_norm": 1.4997280799672437, "learning_rate": 8.336594056111197e-06, - "loss": 0.1921, + "loss": 0.1955, "step": 1955 }, { "epoch": 0.26757865937072506, - "grad_norm": 1.4017489018399198, + "grad_norm": 1.4288316060148785, "learning_rate": 8.334993357591032e-06, - "loss": 0.2108, + "loss": 0.2123, "step": 1956 }, { "epoch": 0.2677154582763338, - "grad_norm": 1.0756674967686963, + "grad_norm": 1.057233662168226, "learning_rate": 8.333392043100306e-06, - "loss": 0.1789, + "loss": 0.1786, "step": 1957 }, { "epoch": 0.26785225718194255, - "grad_norm": 1.2866258929831043, + "grad_norm": 1.2643372967958653, "learning_rate": 8.331790112934777e-06, - "loss": 0.2029, + "loss": 0.2014, "step": 1958 }, { "epoch": 0.2679890560875513, - "grad_norm": 1.1019692333235025, + "grad_norm": 1.0896231617231573, "learning_rate": 8.33018756739032e-06, - "loss": 0.1796, + "loss": 0.1793, "step": 1959 }, { "epoch": 0.26812585499316005, - "grad_norm": 1.3405320388735182, + "grad_norm": 1.2918375270347764, "learning_rate": 8.328584406762927e-06, - "loss": 0.2137, + "loss": 0.2128, "step": 1960 }, { "epoch": 0.2682626538987688, - "grad_norm": 1.296147353036529, + "grad_norm": 1.293492345627052, "learning_rate": 8.326980631348698e-06, - "loss": 0.1858, + "loss": 0.1868, "step": 1961 }, { "epoch": 0.26839945280437755, - "grad_norm": 1.1458062518217302, + "grad_norm": 1.130072831499899, "learning_rate": 8.325376241443849e-06, - "loss": 0.165, + "loss": 0.1667, "step": 1962 }, { "epoch": 0.2685362517099863, - "grad_norm": 1.2513933715949044, + "grad_norm": 1.2206876606088508, "learning_rate": 8.323771237344714e-06, - "loss": 0.1989, + "loss": 0.1981, "step": 1963 }, { "epoch": 0.2686730506155951, - "grad_norm": 1.6927210205956487, + "grad_norm": 1.6370030794467492, "learning_rate": 8.322165619347727e-06, - "loss": 0.2795, + "loss": 0.2789, "step": 1964 }, { "epoch": 0.26880984952120385, - "grad_norm": 1.2926395976756677, + "grad_norm": 1.2841278165714796, "learning_rate": 8.320559387749453e-06, - "loss": 0.1714, + "loss": 0.1699, "step": 1965 }, { "epoch": 0.2689466484268126, - "grad_norm": 1.3758861615295812, + "grad_norm": 1.3676932971908267, "learning_rate": 8.318952542846556e-06, - "loss": 0.2177, + "loss": 0.2178, "step": 1966 }, { "epoch": 0.26908344733242134, - "grad_norm": 1.1843260078841236, + "grad_norm": 1.1734820321584454, "learning_rate": 8.317345084935824e-06, - "loss": 0.1531, + "loss": 0.1537, "step": 1967 }, { "epoch": 0.2692202462380301, - "grad_norm": 1.6331333905537635, + "grad_norm": 1.608024832938122, "learning_rate": 8.315737014314149e-06, - "loss": 0.2255, + "loss": 0.2278, "step": 1968 }, { "epoch": 0.26935704514363884, - "grad_norm": 1.2669049253062226, + "grad_norm": 1.2647083887320716, "learning_rate": 8.314128331278542e-06, - "loss": 0.2024, + "loss": 0.2044, "step": 1969 }, { "epoch": 0.2694938440492476, - "grad_norm": 1.3778372416729938, + "grad_norm": 1.3418409186312659, "learning_rate": 8.312519036126125e-06, - "loss": 0.1868, + "loss": 0.1857, "step": 1970 }, { "epoch": 0.26963064295485634, - "grad_norm": 1.3109005538591745, + "grad_norm": 1.2824800123011288, "learning_rate": 8.310909129154134e-06, - "loss": 0.2285, + "loss": 0.2264, "step": 1971 }, { "epoch": 0.26976744186046514, - "grad_norm": 1.5658569399510343, + "grad_norm": 1.5177451090804883, "learning_rate": 8.309298610659917e-06, - "loss": 0.1976, + "loss": 0.1956, "step": 1972 }, { "epoch": 0.2699042407660739, - "grad_norm": 1.609740836447263, + "grad_norm": 1.5894858667175014, "learning_rate": 8.307687480940936e-06, - "loss": 0.2728, + "loss": 0.2709, "step": 1973 }, { "epoch": 0.27004103967168264, - "grad_norm": 1.5111111582075933, + "grad_norm": 1.4909250610646307, "learning_rate": 8.306075740294763e-06, - "loss": 0.2362, + "loss": 0.2349, "step": 1974 }, { "epoch": 0.2701778385772914, - "grad_norm": 1.7747932823672359, + "grad_norm": 1.712996462404919, "learning_rate": 8.30446338901909e-06, - "loss": 0.2542, + "loss": 0.2517, "step": 1975 }, { "epoch": 0.27031463748290013, - "grad_norm": 1.355607742616268, + "grad_norm": 1.3345911954041503, "learning_rate": 8.302850427411714e-06, - "loss": 0.2212, + "loss": 0.2214, "step": 1976 }, { "epoch": 0.2704514363885089, - "grad_norm": 1.233279034164818, + "grad_norm": 1.1781816326231367, "learning_rate": 8.301236855770546e-06, - "loss": 0.1983, + "loss": 0.1952, "step": 1977 }, { "epoch": 0.27058823529411763, - "grad_norm": 1.5584871152434026, + "grad_norm": 1.5094695179015158, "learning_rate": 8.299622674393615e-06, - "loss": 0.2394, + "loss": 0.2373, "step": 1978 }, { "epoch": 0.2707250341997264, - "grad_norm": 1.3979569937485157, + "grad_norm": 1.3765667189080972, "learning_rate": 8.298007883579056e-06, - "loss": 0.2692, + "loss": 0.2675, "step": 1979 }, { "epoch": 0.2708618331053352, - "grad_norm": 1.343894223014492, + "grad_norm": 1.3518008412111358, "learning_rate": 8.29639248362512e-06, - "loss": 0.2092, + "loss": 0.2081, "step": 1980 }, { "epoch": 0.27099863201094393, - "grad_norm": 1.2329844875958234, + "grad_norm": 1.2153101242994493, "learning_rate": 8.294776474830172e-06, - "loss": 0.2062, + "loss": 0.206, "step": 1981 }, { "epoch": 0.2711354309165527, - "grad_norm": 1.552922300938389, + "grad_norm": 1.5149361178590721, "learning_rate": 8.293159857492686e-06, - "loss": 0.2122, + "loss": 0.2113, "step": 1982 }, { "epoch": 0.2712722298221614, - "grad_norm": 1.1291933026916008, + "grad_norm": 1.1160166387420358, "learning_rate": 8.291542631911251e-06, - "loss": 0.1771, + "loss": 0.1763, "step": 1983 }, { "epoch": 0.2714090287277702, - "grad_norm": 1.4280503422666246, + "grad_norm": 1.3977917030729774, "learning_rate": 8.289924798384566e-06, - "loss": 0.2208, + "loss": 0.2201, "step": 1984 }, { "epoch": 0.2715458276333789, - "grad_norm": 1.4838433227602923, + "grad_norm": 1.5007388396884864, "learning_rate": 8.288306357211444e-06, - "loss": 0.2488, + "loss": 0.249, "step": 1985 }, { "epoch": 0.27168262653898767, - "grad_norm": 1.2452318511446856, + "grad_norm": 1.2320457711852524, "learning_rate": 8.286687308690811e-06, - "loss": 0.18, + "loss": 0.1782, "step": 1986 }, { "epoch": 0.2718194254445964, - "grad_norm": 1.4587216088695791, + "grad_norm": 1.4634771341761983, "learning_rate": 8.2850676531217e-06, "loss": 0.1814, "step": 1987 }, { "epoch": 0.2719562243502052, - "grad_norm": 1.4017598600634358, + "grad_norm": 1.356006908949737, "learning_rate": 8.283447390803265e-06, - "loss": 0.2188, + "loss": 0.2183, "step": 1988 }, { "epoch": 0.27209302325581397, - "grad_norm": 1.44071761211971, + "grad_norm": 1.4225453276423867, "learning_rate": 8.281826522034764e-06, - "loss": 0.1895, + "loss": 0.1898, "step": 1989 }, { "epoch": 0.2722298221614227, - "grad_norm": 1.4798106832579145, + "grad_norm": 1.4521337484603172, "learning_rate": 8.280205047115572e-06, - "loss": 0.2345, + "loss": 0.2323, "step": 1990 }, { "epoch": 0.27236662106703147, - "grad_norm": 1.487207887458284, + "grad_norm": 1.442878739433249, "learning_rate": 8.278582966345173e-06, - "loss": 0.2174, + "loss": 0.2141, "step": 1991 }, { "epoch": 0.2725034199726402, - "grad_norm": 1.2680348701806732, + "grad_norm": 1.2312660538991205, "learning_rate": 8.276960280023164e-06, - "loss": 0.1846, + "loss": 0.1844, "step": 1992 }, { "epoch": 0.27264021887824896, - "grad_norm": 1.4682342618189537, + "grad_norm": 1.4008771702876068, "learning_rate": 8.275336988449254e-06, - "loss": 0.1981, + "loss": 0.1973, "step": 1993 }, { "epoch": 0.2727770177838577, - "grad_norm": 1.2726265127832224, + "grad_norm": 1.2459671088799158, "learning_rate": 8.273713091923264e-06, - "loss": 0.2067, + "loss": 0.2077, "step": 1994 }, { "epoch": 0.27291381668946646, - "grad_norm": 1.4232762698108172, + "grad_norm": 1.3885525396413085, "learning_rate": 8.272088590745127e-06, - "loss": 0.2035, + "loss": 0.2024, "step": 1995 }, { "epoch": 0.27305061559507526, - "grad_norm": 1.4359961213625214, + "grad_norm": 1.4061705626987833, "learning_rate": 8.270463485214884e-06, - "loss": 0.2364, + "loss": 0.2354, "step": 1996 }, { "epoch": 0.273187414500684, - "grad_norm": 1.800458460527739, + "grad_norm": 1.7555839445758359, "learning_rate": 8.268837775632694e-06, - "loss": 0.2825, + "loss": 0.2819, "step": 1997 }, { "epoch": 0.27332421340629276, - "grad_norm": 1.2952820673394156, + "grad_norm": 1.2775496045067272, "learning_rate": 8.267211462298824e-06, - "loss": 0.1915, + "loss": 0.1902, "step": 1998 }, { "epoch": 0.2734610123119015, - "grad_norm": 1.5205475902119658, + "grad_norm": 1.4940646468583834, "learning_rate": 8.265584545513649e-06, - "loss": 0.2371, + "loss": 0.2362, "step": 1999 }, { "epoch": 0.27359781121751026, - "grad_norm": 1.335212914220311, + "grad_norm": 1.3168673618365165, "learning_rate": 8.263957025577664e-06, - "loss": 0.2139, + "loss": 0.2117, "step": 2000 }, { "epoch": 0.27359781121751026, - "eval_loss": 0.2043255865573883, - "eval_runtime": 5.9036, - "eval_samples_per_second": 5.082, - "eval_steps_per_second": 1.355, + "eval_loss": 0.204628586769104, + "eval_runtime": 5.9176, + "eval_samples_per_second": 5.07, + "eval_steps_per_second": 1.352, "step": 2000 }, { "epoch": 0.273734610123119, - "grad_norm": 1.4453483979719126, + "grad_norm": 1.437959038705203, "learning_rate": 8.262328902791468e-06, - "loss": 0.2261, + "loss": 0.2281, "step": 2001 }, { "epoch": 0.27387140902872775, - "grad_norm": 1.3207918453854053, + "grad_norm": 1.291702036353176, "learning_rate": 8.260700177455775e-06, - "loss": 0.2326, + "loss": 0.2272, "step": 2002 }, { "epoch": 0.2740082079343365, - "grad_norm": 1.3821141020704921, + "grad_norm": 1.3498386466117243, "learning_rate": 8.259070849871407e-06, - "loss": 0.2342, + "loss": 0.233, "step": 2003 }, { "epoch": 0.2741450068399453, - "grad_norm": 1.517363680935228, + "grad_norm": 1.4788317354603284, "learning_rate": 8.257440920339303e-06, - "loss": 0.2116, + "loss": 0.213, "step": 2004 }, { "epoch": 0.27428180574555405, - "grad_norm": 1.3105528448253647, + "grad_norm": 1.270984606103494, "learning_rate": 8.255810389160508e-06, - "loss": 0.1799, + "loss": 0.1797, "step": 2005 }, { "epoch": 0.2744186046511628, - "grad_norm": 1.019026334376179, + "grad_norm": 1.0086797128599254, "learning_rate": 8.25417925663618e-06, - "loss": 0.1564, + "loss": 0.156, "step": 2006 }, { "epoch": 0.27455540355677155, - "grad_norm": 1.332471733153966, + "grad_norm": 1.3168715069921708, "learning_rate": 8.252547523067586e-06, - "loss": 0.1857, + "loss": 0.1862, "step": 2007 }, { "epoch": 0.2746922024623803, - "grad_norm": 1.4431232541859298, + "grad_norm": 1.4374975462950748, "learning_rate": 8.250915188756107e-06, - "loss": 0.2307, + "loss": 0.23, "step": 2008 }, { "epoch": 0.27482900136798905, - "grad_norm": 1.2426970987274373, + "grad_norm": 1.2343083171778193, "learning_rate": 8.249282254003238e-06, - "loss": 0.2219, + "loss": 0.2235, "step": 2009 }, { "epoch": 0.2749658002735978, - "grad_norm": 1.4179710423304839, + "grad_norm": 1.405365370426594, "learning_rate": 8.247648719110572e-06, - "loss": 0.2064, + "loss": 0.2065, "step": 2010 }, { "epoch": 0.27510259917920654, - "grad_norm": 1.3909113353496556, + "grad_norm": 1.3899718470607771, "learning_rate": 8.246014584379831e-06, - "loss": 0.1754, + "loss": 0.1778, "step": 2011 }, { "epoch": 0.27523939808481535, - "grad_norm": 1.2848963562133584, + "grad_norm": 1.272347048602154, "learning_rate": 8.244379850112831e-06, - "loss": 0.2029, + "loss": 0.2003, "step": 2012 }, { "epoch": 0.2753761969904241, - "grad_norm": 1.4375928411410326, + "grad_norm": 1.4232495841865507, "learning_rate": 8.242744516611509e-06, - "loss": 0.2273, + "loss": 0.2291, "step": 2013 }, { "epoch": 0.27551299589603284, - "grad_norm": 1.5822793484999138, + "grad_norm": 1.543443463511994, "learning_rate": 8.241108584177912e-06, - "loss": 0.2243, + "loss": 0.2231, "step": 2014 }, { "epoch": 0.2756497948016416, - "grad_norm": 1.2530007876786944, + "grad_norm": 1.2487524487094428, "learning_rate": 8.239472053114192e-06, - "loss": 0.2032, + "loss": 0.2034, "step": 2015 }, { "epoch": 0.27578659370725034, - "grad_norm": 1.3004582113871073, + "grad_norm": 1.3024920521459549, "learning_rate": 8.237834923722615e-06, - "loss": 0.2072, + "loss": 0.2079, "step": 2016 }, { "epoch": 0.2759233926128591, - "grad_norm": 1.1698426226979273, + "grad_norm": 1.154574319394477, "learning_rate": 8.23619719630556e-06, - "loss": 0.1913, + "loss": 0.1914, "step": 2017 }, { "epoch": 0.27606019151846783, - "grad_norm": 1.6068001089683868, + "grad_norm": 1.575433935363453, "learning_rate": 8.234558871165511e-06, - "loss": 0.2287, + "loss": 0.2273, "step": 2018 }, { "epoch": 0.2761969904240766, - "grad_norm": 1.4453361168627152, + "grad_norm": 1.407309340634788, "learning_rate": 8.23291994860507e-06, - "loss": 0.2074, + "loss": 0.2041, "step": 2019 }, { "epoch": 0.2763337893296854, - "grad_norm": 1.1012605975838625, + "grad_norm": 1.0816229239705735, "learning_rate": 8.231280428926938e-06, - "loss": 0.1716, + "loss": 0.1739, "step": 2020 }, { "epoch": 0.27647058823529413, - "grad_norm": 1.4131428242782202, + "grad_norm": 1.3956642248109818, "learning_rate": 8.229640312433938e-06, - "loss": 0.2046, + "loss": 0.2053, "step": 2021 }, { "epoch": 0.2766073871409029, - "grad_norm": 1.3115249853943065, + "grad_norm": 1.3065289088871503, "learning_rate": 8.227999599428995e-06, - "loss": 0.2031, + "loss": 0.2039, "step": 2022 }, { "epoch": 0.27674418604651163, - "grad_norm": 1.2974579405982056, + "grad_norm": 1.2834881552490105, "learning_rate": 8.226358290215151e-06, - "loss": 0.2195, + "loss": 0.2224, "step": 2023 }, { "epoch": 0.2768809849521204, - "grad_norm": 1.6002392710220767, + "grad_norm": 1.5686633780496875, "learning_rate": 8.224716385095552e-06, - "loss": 0.2272, + "loss": 0.2291, "step": 2024 }, { "epoch": 0.2770177838577291, - "grad_norm": 1.2620881300339617, + "grad_norm": 1.2371724250180116, "learning_rate": 8.223073884373457e-06, - "loss": 0.1859, + "loss": 0.1853, "step": 2025 }, { "epoch": 0.2771545827633379, - "grad_norm": 1.486524391549536, + "grad_norm": 1.4677440709547571, "learning_rate": 8.221430788352234e-06, - "loss": 0.2508, + "loss": 0.2523, "step": 2026 }, { "epoch": 0.2772913816689466, - "grad_norm": 1.4323935830257526, + "grad_norm": 1.3860748010665191, "learning_rate": 8.219787097335363e-06, - "loss": 0.2115, + "loss": 0.2083, "step": 2027 }, { "epoch": 0.2774281805745554, - "grad_norm": 1.2337287290383394, + "grad_norm": 1.2319443161058323, "learning_rate": 8.218142811626431e-06, - "loss": 0.1968, + "loss": 0.1925, "step": 2028 }, { "epoch": 0.2775649794801642, - "grad_norm": 1.1933856175724933, + "grad_norm": 1.1693887385620496, "learning_rate": 8.216497931529134e-06, - "loss": 0.1689, + "loss": 0.1691, "step": 2029 }, { "epoch": 0.2777017783857729, - "grad_norm": 1.2164032262552347, + "grad_norm": 1.2110521630124313, "learning_rate": 8.214852457347287e-06, - "loss": 0.2061, + "loss": 0.2069, "step": 2030 }, { "epoch": 0.27783857729138167, - "grad_norm": 1.331153619625782, + "grad_norm": 1.324094798115972, "learning_rate": 8.213206389384801e-06, - "loss": 0.194, + "loss": 0.1936, "step": 2031 }, { "epoch": 0.2779753761969904, - "grad_norm": 1.2155292856192201, + "grad_norm": 1.2022062546154062, "learning_rate": 8.211559727945706e-06, - "loss": 0.1861, + "loss": 0.1871, "step": 2032 }, { "epoch": 0.27811217510259917, - "grad_norm": 1.4113916824641874, + "grad_norm": 1.4108783263281341, "learning_rate": 8.20991247333414e-06, - "loss": 0.2561, + "loss": 0.256, "step": 2033 }, { "epoch": 0.2782489740082079, - "grad_norm": 1.5635689890570157, + "grad_norm": 1.547333144575377, "learning_rate": 8.208264625854348e-06, - "loss": 0.2058, + "loss": 0.206, "step": 2034 }, { "epoch": 0.27838577291381666, - "grad_norm": 1.3745746549867812, + "grad_norm": 1.360183150133515, "learning_rate": 8.206616185810686e-06, - "loss": 0.2124, + "loss": 0.2125, "step": 2035 }, { "epoch": 0.27852257181942547, - "grad_norm": 1.1346944495239442, + "grad_norm": 1.0292810599420474, "learning_rate": 8.204967153507618e-06, - "loss": 0.1624, + "loss": 0.1643, "step": 2036 }, { "epoch": 0.2786593707250342, - "grad_norm": 1.4619566216030135, + "grad_norm": 1.4395133359137717, "learning_rate": 8.203317529249726e-06, - "loss": 0.2038, + "loss": 0.2043, "step": 2037 }, { "epoch": 0.27879616963064296, - "grad_norm": 1.6177022639394363, + "grad_norm": 1.589122100180029, "learning_rate": 8.201667313341686e-06, - "loss": 0.2243, + "loss": 0.2235, "step": 2038 }, { "epoch": 0.2789329685362517, - "grad_norm": 0.9181579967866982, + "grad_norm": 0.9046012114493328, "learning_rate": 8.200016506088295e-06, - "loss": 0.1674, + "loss": 0.1691, "step": 2039 }, { "epoch": 0.27906976744186046, - "grad_norm": 1.3314348800895983, + "grad_norm": 1.3339472582377727, "learning_rate": 8.198365107794457e-06, - "loss": 0.2021, + "loss": 0.2033, "step": 2040 }, { "epoch": 0.2792065663474692, - "grad_norm": 1.338646893918052, + "grad_norm": 1.3447132318622288, "learning_rate": 8.196713118765183e-06, - "loss": 0.2012, + "loss": 0.2027, "step": 2041 }, { "epoch": 0.27934336525307796, - "grad_norm": 1.2835659373250414, + "grad_norm": 1.2812589061594453, "learning_rate": 8.19506053930559e-06, - "loss": 0.2308, + "loss": 0.2314, "step": 2042 }, { "epoch": 0.2794801641586867, - "grad_norm": 1.3250181101922012, + "grad_norm": 1.290874993848971, "learning_rate": 8.193407369720914e-06, - "loss": 0.2062, + "loss": 0.205, "step": 2043 }, { "epoch": 0.2796169630642955, - "grad_norm": 1.6684119291790338, + "grad_norm": 1.6482345930207951, "learning_rate": 8.191753610316491e-06, - "loss": 0.237, + "loss": 0.2354, "step": 2044 }, { "epoch": 0.27975376196990426, - "grad_norm": 1.624528734618821, + "grad_norm": 1.3429032634502276, "learning_rate": 8.190099261397771e-06, - "loss": 0.2483, + "loss": 0.2466, "step": 2045 }, { "epoch": 0.279890560875513, - "grad_norm": 1.6155187752268656, + "grad_norm": 1.6151401770989007, "learning_rate": 8.188444323270309e-06, - "loss": 0.2257, + "loss": 0.229, "step": 2046 }, { "epoch": 0.28002735978112175, - "grad_norm": 1.3619534570747684, + "grad_norm": 1.3525751624686018, "learning_rate": 8.186788796239771e-06, - "loss": 0.1839, + "loss": 0.1846, "step": 2047 }, { "epoch": 0.2801641586867305, - "grad_norm": 1.523528695266824, + "grad_norm": 1.468172816780241, "learning_rate": 8.185132680611932e-06, - "loss": 0.2537, + "loss": 0.2524, "step": 2048 }, { "epoch": 0.28030095759233925, - "grad_norm": 1.2715920620401167, + "grad_norm": 1.2603823137346735, "learning_rate": 8.183475976692677e-06, - "loss": 0.2177, + "loss": 0.2182, "step": 2049 }, { "epoch": 0.280437756497948, - "grad_norm": 1.3476982264711508, + "grad_norm": 1.3320617849671637, "learning_rate": 8.181818684787992e-06, - "loss": 0.2169, + "loss": 0.2157, "step": 2050 }, { "epoch": 0.28057455540355675, - "grad_norm": 1.4732511569394335, + "grad_norm": 1.4549049748917353, "learning_rate": 8.180160805203984e-06, - "loss": 0.2046, + "loss": 0.2006, "step": 2051 }, { "epoch": 0.28071135430916555, - "grad_norm": 1.3051932038127743, + "grad_norm": 1.309864166269675, "learning_rate": 8.178502338246858e-06, - "loss": 0.2226, + "loss": 0.2213, "step": 2052 }, { "epoch": 0.2808481532147743, - "grad_norm": 1.4039963257239252, + "grad_norm": 1.3659287959743511, "learning_rate": 8.176843284222934e-06, "loss": 0.1922, "step": 2053 }, { "epoch": 0.28098495212038305, - "grad_norm": 1.4212665272593081, + "grad_norm": 1.4234109534566668, "learning_rate": 8.175183643438635e-06, - "loss": 0.2093, + "loss": 0.2106, "step": 2054 }, { "epoch": 0.2811217510259918, - "grad_norm": 1.2015241761155429, + "grad_norm": 1.1781362266304498, "learning_rate": 8.1735234162005e-06, - "loss": 0.1677, + "loss": 0.1671, "step": 2055 }, { "epoch": 0.28125854993160054, - "grad_norm": 1.2589153022845212, + "grad_norm": 1.2587157111625635, "learning_rate": 8.171862602815168e-06, - "loss": 0.213, + "loss": 0.211, "step": 2056 }, { "epoch": 0.2813953488372093, - "grad_norm": 1.31200331706141, + "grad_norm": 1.2848256695063918, "learning_rate": 8.17020120358939e-06, - "loss": 0.2031, + "loss": 0.2014, "step": 2057 }, { "epoch": 0.28153214774281804, - "grad_norm": 1.3236502542586763, + "grad_norm": 1.3198920757251442, "learning_rate": 8.168539218830025e-06, - "loss": 0.2178, + "loss": 0.2216, "step": 2058 }, { "epoch": 0.2816689466484268, - "grad_norm": 1.36365797909258, + "grad_norm": 1.3437508632252846, "learning_rate": 8.166876648844042e-06, - "loss": 0.2099, + "loss": 0.2093, "step": 2059 }, { "epoch": 0.2818057455540356, - "grad_norm": 1.369916650544161, + "grad_norm": 1.3518201391544478, "learning_rate": 8.165213493938515e-06, - "loss": 0.2012, + "loss": 0.2029, "step": 2060 }, { "epoch": 0.28194254445964434, - "grad_norm": 1.312183463982876, + "grad_norm": 1.3031956535000844, "learning_rate": 8.163549754420628e-06, - "loss": 0.228, + "loss": 0.2266, "step": 2061 }, { "epoch": 0.2820793433652531, - "grad_norm": 1.335585263707098, + "grad_norm": 1.3348994544957606, "learning_rate": 8.16188543059767e-06, - "loss": 0.1889, + "loss": 0.1907, "step": 2062 }, { "epoch": 0.28221614227086184, - "grad_norm": 1.4752838088749674, + "grad_norm": 1.4440073382405396, "learning_rate": 8.160220522777044e-06, - "loss": 0.1776, + "loss": 0.1769, "step": 2063 }, { "epoch": 0.2823529411764706, - "grad_norm": 1.1449670050923109, + "grad_norm": 1.1262275823959167, "learning_rate": 8.158555031266255e-06, - "loss": 0.1816, + "loss": 0.1811, "step": 2064 }, { "epoch": 0.28248974008207933, - "grad_norm": 1.2035599150127447, + "grad_norm": 1.1840994768219288, "learning_rate": 8.156888956372919e-06, - "loss": 0.1421, + "loss": 0.1406, "step": 2065 }, { "epoch": 0.2826265389876881, - "grad_norm": 1.0570779015431713, + "grad_norm": 1.129666764859428, "learning_rate": 8.155222298404757e-06, - "loss": 0.2042, + "loss": 0.2064, "step": 2066 }, { "epoch": 0.28276333789329683, - "grad_norm": 1.3885235332224966, + "grad_norm": 1.3260936852470977, "learning_rate": 8.153555057669602e-06, - "loss": 0.198, + "loss": 0.1953, "step": 2067 }, { "epoch": 0.28290013679890563, - "grad_norm": 1.446437437637764, + "grad_norm": 1.4145885458849914, "learning_rate": 8.151887234475388e-06, - "loss": 0.2303, + "loss": 0.2296, "step": 2068 }, { "epoch": 0.2830369357045144, - "grad_norm": 1.391489103384705, + "grad_norm": 1.3759543606645723, "learning_rate": 8.150218829130163e-06, - "loss": 0.2243, + "loss": 0.2231, "step": 2069 }, { "epoch": 0.28317373461012313, - "grad_norm": 1.382724236064367, + "grad_norm": 1.3444282577141726, "learning_rate": 8.148549841942082e-06, - "loss": 0.2234, + "loss": 0.2207, "step": 2070 }, { "epoch": 0.2833105335157319, - "grad_norm": 1.4912078890387424, + "grad_norm": 1.4277668580306797, "learning_rate": 8.146880273219403e-06, - "loss": 0.231, + "loss": 0.2268, "step": 2071 }, { "epoch": 0.2834473324213406, - "grad_norm": 1.40470859800406, + "grad_norm": 1.382215537624884, "learning_rate": 8.145210123270496e-06, - "loss": 0.2449, + "loss": 0.2442, "step": 2072 }, { "epoch": 0.2835841313269494, - "grad_norm": 1.2916368356834564, + "grad_norm": 1.3585175606747444, "learning_rate": 8.143539392403831e-06, - "loss": 0.2376, + "loss": 0.2425, "step": 2073 }, { "epoch": 0.2837209302325581, - "grad_norm": 1.100769401907748, + "grad_norm": 1.102019708924919, "learning_rate": 8.141868080927998e-06, - "loss": 0.1883, + "loss": 0.1876, "step": 2074 }, { "epoch": 0.28385772913816687, - "grad_norm": 1.1446789485618922, + "grad_norm": 1.1314199252675907, "learning_rate": 8.14019618915168e-06, - "loss": 0.1942, + "loss": 0.1926, "step": 2075 }, { "epoch": 0.2839945280437757, - "grad_norm": 1.5341231025655655, + "grad_norm": 1.4562495629179177, "learning_rate": 8.138523717383682e-06, - "loss": 0.2198, + "loss": 0.2154, "step": 2076 }, { "epoch": 0.2841313269493844, - "grad_norm": 1.1960995788629245, + "grad_norm": 1.1912161993793942, "learning_rate": 8.1368506659329e-06, - "loss": 0.1798, + "loss": 0.1797, "step": 2077 }, { "epoch": 0.28426812585499317, - "grad_norm": 1.474829391728714, + "grad_norm": 1.4452209050097908, "learning_rate": 8.135177035108352e-06, - "loss": 0.2196, + "loss": 0.2149, "step": 2078 }, { "epoch": 0.2844049247606019, - "grad_norm": 1.2710285414219624, + "grad_norm": 1.250267245538619, "learning_rate": 8.13350282521915e-06, - "loss": 0.1969, + "loss": 0.1983, "step": 2079 }, { "epoch": 0.28454172366621067, - "grad_norm": 1.3814158073555962, + "grad_norm": 1.3671889056045166, "learning_rate": 8.131828036574526e-06, - "loss": 0.2333, + "loss": 0.2311, "step": 2080 }, { "epoch": 0.2846785225718194, - "grad_norm": 1.0676160213589985, + "grad_norm": 1.0510467161141928, "learning_rate": 8.130152669483806e-06, - "loss": 0.1671, + "loss": 0.1684, "step": 2081 }, { "epoch": 0.28481532147742816, - "grad_norm": 1.2602651162179872, + "grad_norm": 1.232085263196857, "learning_rate": 8.12847672425643e-06, - "loss": 0.1819, + "loss": 0.1834, "step": 2082 }, { "epoch": 0.2849521203830369, - "grad_norm": 1.350964419001931, + "grad_norm": 1.3309883776535238, "learning_rate": 8.126800201201948e-06, - "loss": 0.2048, + "loss": 0.2026, "step": 2083 }, { "epoch": 0.2850889192886457, - "grad_norm": 1.3546124307214107, + "grad_norm": 1.3401057693265401, "learning_rate": 8.125123100630009e-06, - "loss": 0.2, + "loss": 0.1993, "step": 2084 }, { "epoch": 0.28522571819425446, - "grad_norm": 1.2963301254435764, + "grad_norm": 1.2661095370604856, "learning_rate": 8.123445422850373e-06, - "loss": 0.2043, + "loss": 0.2037, "step": 2085 }, { "epoch": 0.2853625170998632, - "grad_norm": 1.175120241851998, + "grad_norm": 1.162512545200044, "learning_rate": 8.121767168172905e-06, - "loss": 0.1668, + "loss": 0.168, "step": 2086 }, { "epoch": 0.28549931600547196, - "grad_norm": 1.6397397558984304, + "grad_norm": 1.568462037096116, "learning_rate": 8.120088336907576e-06, - "loss": 0.2553, + "loss": 0.255, "step": 2087 }, { "epoch": 0.2856361149110807, - "grad_norm": 1.719122030264936, + "grad_norm": 1.450929822210299, "learning_rate": 8.118408929364469e-06, - "loss": 0.2221, + "loss": 0.2149, "step": 2088 }, { "epoch": 0.28577291381668946, - "grad_norm": 1.218282565468708, + "grad_norm": 1.171165436962859, "learning_rate": 8.116728945853765e-06, - "loss": 0.196, + "loss": 0.1936, "step": 2089 }, { "epoch": 0.2859097127222982, - "grad_norm": 1.4389623836719465, + "grad_norm": 1.4005441722254341, "learning_rate": 8.115048386685757e-06, - "loss": 0.2379, + "loss": 0.2332, "step": 2090 }, { "epoch": 0.28604651162790695, - "grad_norm": 1.5723645655258618, + "grad_norm": 1.537872020956767, "learning_rate": 8.113367252170845e-06, - "loss": 0.1849, + "loss": 0.1855, "step": 2091 }, { "epoch": 0.28618331053351576, - "grad_norm": 1.2932393242344276, + "grad_norm": 1.285092659799342, "learning_rate": 8.11168554261953e-06, - "loss": 0.2133, + "loss": 0.2136, "step": 2092 }, { "epoch": 0.2863201094391245, - "grad_norm": 1.4255143555608223, + "grad_norm": 1.4240019514309628, "learning_rate": 8.110003258342425e-06, - "loss": 0.2181, + "loss": 0.2193, "step": 2093 }, { "epoch": 0.28645690834473325, - "grad_norm": 1.472416800259548, + "grad_norm": 1.4746376344697067, "learning_rate": 8.108320399650244e-06, - "loss": 0.1956, + "loss": 0.191, "step": 2094 }, { "epoch": 0.286593707250342, - "grad_norm": 1.2504841736074885, + "grad_norm": 1.2330447955560477, "learning_rate": 8.106636966853811e-06, - "loss": 0.2037, + "loss": 0.2015, "step": 2095 }, { "epoch": 0.28673050615595075, - "grad_norm": 1.3971197698583042, + "grad_norm": 1.3613617546345282, "learning_rate": 8.104952960264056e-06, - "loss": 0.1809, + "loss": 0.1776, "step": 2096 }, { "epoch": 0.2868673050615595, - "grad_norm": 1.2360625403411003, + "grad_norm": 1.218138638011204, "learning_rate": 8.10326838019201e-06, - "loss": 0.1662, + "loss": 0.1661, "step": 2097 }, { "epoch": 0.28700410396716824, - "grad_norm": 1.3331187640281572, + "grad_norm": 1.335812030026102, "learning_rate": 8.10158322694882e-06, - "loss": 0.1824, + "loss": 0.1832, "step": 2098 }, { "epoch": 0.287140902872777, - "grad_norm": 1.3858765061936105, + "grad_norm": 1.3765197289903486, "learning_rate": 8.099897500845723e-06, - "loss": 0.23, + "loss": 0.2285, "step": 2099 }, { "epoch": 0.2872777017783858, - "grad_norm": 1.161848176589781, + "grad_norm": 1.14484890113658, "learning_rate": 8.09821120219408e-06, - "loss": 0.2003, + "loss": 0.2004, "step": 2100 }, { "epoch": 0.2872777017783858, - "eval_loss": 0.20087656378746033, - "eval_runtime": 5.8863, - "eval_samples_per_second": 5.097, - "eval_steps_per_second": 1.359, + "eval_loss": 0.20177191495895386, + "eval_runtime": 5.9399, + "eval_samples_per_second": 5.051, + "eval_steps_per_second": 1.347, "step": 2100 }, { "epoch": 0.28741450068399454, - "grad_norm": 1.3408954113869576, + "grad_norm": 1.3526078145243419, "learning_rate": 8.096524331305345e-06, - "loss": 0.2009, + "loss": 0.2037, "step": 2101 }, { "epoch": 0.2875512995896033, - "grad_norm": 1.5239378450769843, + "grad_norm": 1.5019321596175401, "learning_rate": 8.09483688849108e-06, - "loss": 0.2337, + "loss": 0.2291, "step": 2102 }, { "epoch": 0.28768809849521204, - "grad_norm": 1.4215423587590372, + "grad_norm": 1.3905769995062798, "learning_rate": 8.093148874062958e-06, - "loss": 0.2292, + "loss": 0.2309, "step": 2103 }, { "epoch": 0.2878248974008208, - "grad_norm": 1.3053808726182803, + "grad_norm": 1.2117088070730435, "learning_rate": 8.091460288332754e-06, - "loss": 0.1989, + "loss": 0.1964, "step": 2104 }, { "epoch": 0.28796169630642954, - "grad_norm": 1.1988954493815045, + "grad_norm": 1.196859279741213, "learning_rate": 8.089771131612344e-06, - "loss": 0.1812, + "loss": 0.1807, "step": 2105 }, { "epoch": 0.2880984952120383, - "grad_norm": 1.3142521188318168, + "grad_norm": 1.2999333402860145, "learning_rate": 8.088081404213719e-06, - "loss": 0.2141, + "loss": 0.215, "step": 2106 }, { "epoch": 0.28823529411764703, - "grad_norm": 1.316314590616722, + "grad_norm": 1.2967732608759968, "learning_rate": 8.086391106448965e-06, - "loss": 0.2035, + "loss": 0.2036, "step": 2107 }, { "epoch": 0.28837209302325584, - "grad_norm": 1.3178395101132623, + "grad_norm": 1.3061457081310561, "learning_rate": 8.084700238630283e-06, - "loss": 0.1811, + "loss": 0.1833, "step": 2108 }, { "epoch": 0.2885088919288646, - "grad_norm": 1.36620482414451, + "grad_norm": 1.332253288904461, "learning_rate": 8.083008801069974e-06, - "loss": 0.2154, + "loss": 0.2133, "step": 2109 }, { "epoch": 0.28864569083447333, - "grad_norm": 1.2217044066979588, + "grad_norm": 1.2153569334031584, "learning_rate": 8.081316794080444e-06, - "loss": 0.1488, + "loss": 0.1501, "step": 2110 }, { "epoch": 0.2887824897400821, - "grad_norm": 1.362650464249732, + "grad_norm": 1.3852193176901764, "learning_rate": 8.079624217974207e-06, - "loss": 0.2122, + "loss": 0.2107, "step": 2111 }, { "epoch": 0.28891928864569083, - "grad_norm": 0.932077566254447, + "grad_norm": 0.9113767916543175, "learning_rate": 8.077931073063879e-06, - "loss": 0.1488, + "loss": 0.1476, "step": 2112 }, { "epoch": 0.2890560875512996, - "grad_norm": 1.206244558907752, + "grad_norm": 1.250987825183306, "learning_rate": 8.076237359662182e-06, - "loss": 0.2038, + "loss": 0.208, "step": 2113 }, { "epoch": 0.2891928864569083, - "grad_norm": 1.1891611984272805, + "grad_norm": 1.1855646817227314, "learning_rate": 8.074543078081946e-06, - "loss": 0.1691, + "loss": 0.1675, "step": 2114 }, { "epoch": 0.2893296853625171, - "grad_norm": 1.623587691262347, + "grad_norm": 1.4544676653527544, "learning_rate": 8.072848228636101e-06, - "loss": 0.2292, + "loss": 0.2119, "step": 2115 }, { "epoch": 0.2894664842681259, - "grad_norm": 1.537270763740422, + "grad_norm": 1.490554496692603, "learning_rate": 8.071152811637684e-06, - "loss": 0.2315, + "loss": 0.2287, "step": 2116 }, { "epoch": 0.2896032831737346, - "grad_norm": 1.5646810290725612, + "grad_norm": 1.5313633025787807, "learning_rate": 8.069456827399839e-06, - "loss": 0.1837, + "loss": 0.1791, "step": 2117 }, { "epoch": 0.2897400820793434, - "grad_norm": 1.8459386665437676, + "grad_norm": 1.8303441398890545, "learning_rate": 8.067760276235813e-06, - "loss": 0.2168, + "loss": 0.2181, "step": 2118 }, { "epoch": 0.2898768809849521, - "grad_norm": 1.2220341087111901, + "grad_norm": 1.3633239220859583, "learning_rate": 8.066063158458954e-06, - "loss": 0.2249, + "loss": 0.2274, "step": 2119 }, { "epoch": 0.29001367989056087, - "grad_norm": 1.5057270612642202, + "grad_norm": 1.4755609841708668, "learning_rate": 8.064365474382724e-06, - "loss": 0.206, + "loss": 0.204, "step": 2120 }, { "epoch": 0.2901504787961696, - "grad_norm": 1.3989762137188153, + "grad_norm": 1.3704399566659977, "learning_rate": 8.062667224320679e-06, - "loss": 0.1959, + "loss": 0.1942, "step": 2121 }, { "epoch": 0.29028727770177837, - "grad_norm": 1.2209726460710815, + "grad_norm": 1.2023384805111321, "learning_rate": 8.06096840858649e-06, - "loss": 0.1876, + "loss": 0.1852, "step": 2122 }, { "epoch": 0.2904240766073871, - "grad_norm": 1.399812204455381, + "grad_norm": 1.395389733605756, "learning_rate": 8.059269027493918e-06, - "loss": 0.2075, + "loss": 0.2091, "step": 2123 }, { "epoch": 0.2905608755129959, - "grad_norm": 1.4049864219800698, + "grad_norm": 1.3545689979246018, "learning_rate": 8.057569081356845e-06, - "loss": 0.2227, + "loss": 0.2222, "step": 2124 }, { "epoch": 0.29069767441860467, - "grad_norm": 1.4622401875036897, + "grad_norm": 1.4196817631541392, "learning_rate": 8.055868570489247e-06, - "loss": 0.2104, + "loss": 0.2099, "step": 2125 }, { "epoch": 0.2908344733242134, - "grad_norm": 1.4607637910757516, + "grad_norm": 1.45285905551715, "learning_rate": 8.054167495205207e-06, - "loss": 0.2117, + "loss": 0.2098, "step": 2126 }, { "epoch": 0.29097127222982216, - "grad_norm": 1.4037310218552035, + "grad_norm": 1.4006250639866538, "learning_rate": 8.052465855818912e-06, - "loss": 0.1886, + "loss": 0.1891, "step": 2127 }, { "epoch": 0.2911080711354309, - "grad_norm": 1.3771716271971866, + "grad_norm": 1.4006911963054143, "learning_rate": 8.050763652644656e-06, - "loss": 0.2013, + "loss": 0.2067, "step": 2128 }, { "epoch": 0.29124487004103966, - "grad_norm": 1.1927356576811556, + "grad_norm": 1.2159413373548837, "learning_rate": 8.049060885996832e-06, - "loss": 0.1679, + "loss": 0.172, "step": 2129 }, { "epoch": 0.2913816689466484, - "grad_norm": 1.1765700971501594, + "grad_norm": 1.1471716262111924, "learning_rate": 8.047357556189937e-06, - "loss": 0.1871, + "loss": 0.1858, "step": 2130 }, { "epoch": 0.29151846785225716, - "grad_norm": 1.345554896627667, + "grad_norm": 1.3508275173027628, "learning_rate": 8.04565366353858e-06, - "loss": 0.2166, + "loss": 0.2218, "step": 2131 }, { "epoch": 0.29165526675786596, - "grad_norm": 1.2932245622884844, + "grad_norm": 1.2410191916692142, "learning_rate": 8.043949208357464e-06, - "loss": 0.1788, + "loss": 0.1752, "step": 2132 }, { "epoch": 0.2917920656634747, - "grad_norm": 1.5213171614917163, + "grad_norm": 1.5320644745409542, "learning_rate": 8.042244190961404e-06, - "loss": 0.2717, + "loss": 0.2726, "step": 2133 }, { "epoch": 0.29192886456908346, - "grad_norm": 1.2862693200902304, + "grad_norm": 1.2504894924512404, "learning_rate": 8.040538611665315e-06, - "loss": 0.1958, + "loss": 0.1924, "step": 2134 }, { "epoch": 0.2920656634746922, - "grad_norm": 1.2434776287092653, + "grad_norm": 1.2360406079341315, "learning_rate": 8.038832470784213e-06, - "loss": 0.2094, + "loss": 0.2071, "step": 2135 }, { "epoch": 0.29220246238030095, - "grad_norm": 1.6161777537968054, + "grad_norm": 1.5807832742543293, "learning_rate": 8.037125768633226e-06, - "loss": 0.2549, + "loss": 0.2546, "step": 2136 }, { "epoch": 0.2923392612859097, - "grad_norm": 1.3709167148369785, + "grad_norm": 1.355518263053431, "learning_rate": 8.035418505527574e-06, - "loss": 0.1817, + "loss": 0.1799, "step": 2137 }, { "epoch": 0.29247606019151845, - "grad_norm": 1.54655475971559, + "grad_norm": 1.5183384813988652, "learning_rate": 8.033710681782592e-06, - "loss": 0.211, + "loss": 0.2125, "step": 2138 }, { "epoch": 0.2926128590971272, - "grad_norm": 1.1833392237828677, + "grad_norm": 1.1696164288935142, "learning_rate": 8.032002297713713e-06, - "loss": 0.1674, + "loss": 0.1669, "step": 2139 }, { "epoch": 0.292749658002736, - "grad_norm": 1.368077709376696, + "grad_norm": 1.3683184287545058, "learning_rate": 8.030293353636471e-06, - "loss": 0.2204, + "loss": 0.2198, "step": 2140 }, { "epoch": 0.29288645690834475, - "grad_norm": 1.2210228319898566, + "grad_norm": 1.2194555671124658, "learning_rate": 8.028583849866511e-06, - "loss": 0.1843, + "loss": 0.1835, "step": 2141 }, { "epoch": 0.2930232558139535, - "grad_norm": 1.6145156703752852, + "grad_norm": 1.5534979173469454, "learning_rate": 8.026873786719574e-06, - "loss": 0.2482, + "loss": 0.2499, "step": 2142 }, { "epoch": 0.29316005471956225, - "grad_norm": 1.2078835406993753, + "grad_norm": 1.1617654504728028, "learning_rate": 8.02516316451151e-06, - "loss": 0.2048, + "loss": 0.2019, "step": 2143 }, { "epoch": 0.293296853625171, - "grad_norm": 1.360761350066377, + "grad_norm": 1.3575814020895607, "learning_rate": 8.023451983558266e-06, - "loss": 0.2003, + "loss": 0.1984, "step": 2144 }, { "epoch": 0.29343365253077974, - "grad_norm": 1.4966383904761391, + "grad_norm": 1.4963307734080624, "learning_rate": 8.021740244175896e-06, - "loss": 0.2385, + "loss": 0.2373, "step": 2145 }, { "epoch": 0.2935704514363885, - "grad_norm": 1.3616268010750363, + "grad_norm": 1.3382915245067006, "learning_rate": 8.02002794668056e-06, - "loss": 0.2153, + "loss": 0.2164, "step": 2146 }, { "epoch": 0.29370725034199724, - "grad_norm": 1.4011243089880547, + "grad_norm": 1.391716306079938, "learning_rate": 8.018315091388516e-06, - "loss": 0.2406, + "loss": 0.24, "step": 2147 }, { "epoch": 0.29384404924760604, - "grad_norm": 1.2210222457818274, + "grad_norm": 1.2334822490504036, "learning_rate": 8.016601678616128e-06, - "loss": 0.2047, + "loss": 0.2038, "step": 2148 }, { "epoch": 0.2939808481532148, - "grad_norm": 1.333573429566517, + "grad_norm": 1.3067269488721205, "learning_rate": 8.01488770867986e-06, - "loss": 0.2045, + "loss": 0.2036, "step": 2149 }, { "epoch": 0.29411764705882354, - "grad_norm": 1.1668789915013504, + "grad_norm": 1.1538168660605608, "learning_rate": 8.013173181896283e-06, - "loss": 0.163, + "loss": 0.1627, "step": 2150 }, { "epoch": 0.2942544459644323, - "grad_norm": 1.3810175219878171, + "grad_norm": 1.3894201664316954, "learning_rate": 8.011458098582068e-06, - "loss": 0.2101, + "loss": 0.2133, "step": 2151 }, { "epoch": 0.29439124487004104, - "grad_norm": 1.7469727458818334, + "grad_norm": 1.644093166016027, "learning_rate": 8.009742459053989e-06, - "loss": 0.2429, + "loss": 0.2345, "step": 2152 }, { "epoch": 0.2945280437756498, - "grad_norm": 1.516036379772808, + "grad_norm": 1.4893927018767479, "learning_rate": 8.008026263628921e-06, - "loss": 0.2083, + "loss": 0.207, "step": 2153 }, { "epoch": 0.29466484268125853, - "grad_norm": 1.4708272362342272, + "grad_norm": 1.4613767353411158, "learning_rate": 8.00630951262385e-06, "loss": 0.2479, "step": 2154 }, { "epoch": 0.2948016415868673, - "grad_norm": 1.3104883142964134, + "grad_norm": 1.2799181869959608, "learning_rate": 8.004592206355852e-06, - "loss": 0.2098, + "loss": 0.2081, "step": 2155 }, { "epoch": 0.2949384404924761, - "grad_norm": 1.7143109063289013, + "grad_norm": 1.6933426573483659, "learning_rate": 8.002874345142116e-06, - "loss": 0.2515, + "loss": 0.2516, "step": 2156 }, { "epoch": 0.29507523939808483, - "grad_norm": 1.3182066085378206, + "grad_norm": 1.2903137092539256, "learning_rate": 8.001155929299929e-06, - "loss": 0.1959, + "loss": 0.1957, "step": 2157 }, { "epoch": 0.2952120383036936, - "grad_norm": 1.504530032559904, + "grad_norm": 1.5084077986294904, "learning_rate": 7.99943695914668e-06, - "loss": 0.2324, + "loss": 0.2342, "step": 2158 }, { "epoch": 0.29534883720930233, - "grad_norm": 1.212332908988135, + "grad_norm": 1.1695575821718307, "learning_rate": 7.997717434999861e-06, - "loss": 0.1998, + "loss": 0.1979, "step": 2159 }, { "epoch": 0.2954856361149111, - "grad_norm": 1.4509753494954745, + "grad_norm": 1.4336030042803574, "learning_rate": 7.99599735717707e-06, - "loss": 0.2275, + "loss": 0.2273, "step": 2160 }, { "epoch": 0.2956224350205198, - "grad_norm": 1.3612982175030541, + "grad_norm": 1.3774536521905876, "learning_rate": 7.994276725996e-06, - "loss": 0.2023, + "loss": 0.2029, "step": 2161 }, { "epoch": 0.2957592339261286, - "grad_norm": 1.5482286084769308, + "grad_norm": 1.5234626585315034, "learning_rate": 7.992555541774452e-06, - "loss": 0.2911, + "loss": 0.2929, "step": 2162 }, { "epoch": 0.2958960328317373, - "grad_norm": 1.1999503222680958, + "grad_norm": 1.222939123736163, "learning_rate": 7.990833804830327e-06, - "loss": 0.1759, + "loss": 0.179, "step": 2163 }, { "epoch": 0.2960328317373461, - "grad_norm": 1.249395316339885, + "grad_norm": 1.2450420932190263, "learning_rate": 7.989111515481629e-06, - "loss": 0.1981, + "loss": 0.1982, "step": 2164 }, { "epoch": 0.2961696306429549, - "grad_norm": 1.3353542176307616, + "grad_norm": 1.3044935830744329, "learning_rate": 7.987388674046462e-06, - "loss": 0.2375, + "loss": 0.2354, "step": 2165 }, { "epoch": 0.2963064295485636, - "grad_norm": 1.2393715888805723, + "grad_norm": 1.2149833424304513, "learning_rate": 7.985665280843037e-06, - "loss": 0.1698, + "loss": 0.1709, "step": 2166 }, { "epoch": 0.29644322845417237, - "grad_norm": 1.3693952375635667, + "grad_norm": 1.3488098031174365, "learning_rate": 7.983941336189657e-06, - "loss": 0.2133, + "loss": 0.2108, "step": 2167 }, { "epoch": 0.2965800273597811, - "grad_norm": 1.2423636197017223, + "grad_norm": 1.2031056337116814, "learning_rate": 7.98221684040474e-06, "loss": 0.1889, "step": 2168 }, { "epoch": 0.29671682626538987, - "grad_norm": 1.3221326363103423, + "grad_norm": 1.2945356147583633, "learning_rate": 7.980491793806792e-06, - "loss": 0.2042, + "loss": 0.2043, "step": 2169 }, { "epoch": 0.2968536251709986, - "grad_norm": 1.8532175008667853, + "grad_norm": 1.7960065344631235, "learning_rate": 7.978766196714436e-06, - "loss": 0.227, + "loss": 0.2254, "step": 2170 }, { "epoch": 0.29699042407660736, - "grad_norm": 1.4347170822207975, + "grad_norm": 1.3828241198286841, "learning_rate": 7.977040049446381e-06, - "loss": 0.2403, + "loss": 0.237, "step": 2171 }, { "epoch": 0.29712722298221617, - "grad_norm": 1.0347357928202647, + "grad_norm": 1.0184603884984937, "learning_rate": 7.975313352321449e-06, - "loss": 0.1471, + "loss": 0.1457, "step": 2172 }, { "epoch": 0.2972640218878249, - "grad_norm": 1.3181924657070307, + "grad_norm": 1.3871123746624454, "learning_rate": 7.973586105658559e-06, - "loss": 0.2319, + "loss": 0.2359, "step": 2173 }, { "epoch": 0.29740082079343366, - "grad_norm": 1.5271975915376665, + "grad_norm": 1.5338611596084182, "learning_rate": 7.97185830977673e-06, - "loss": 0.2311, + "loss": 0.2297, "step": 2174 }, { "epoch": 0.2975376196990424, - "grad_norm": 1.0356076855313028, + "grad_norm": 1.0323026647075673, "learning_rate": 7.970129964995088e-06, - "loss": 0.1866, + "loss": 0.187, "step": 2175 }, { "epoch": 0.29767441860465116, - "grad_norm": 1.2652326316116789, + "grad_norm": 1.2760730004190637, "learning_rate": 7.968401071632854e-06, - "loss": 0.1756, + "loss": 0.1767, "step": 2176 }, { "epoch": 0.2978112175102599, - "grad_norm": 1.3677422059119604, + "grad_norm": 1.3316812965661113, "learning_rate": 7.966671630009354e-06, - "loss": 0.2207, + "loss": 0.2166, "step": 2177 }, { "epoch": 0.29794801641586866, - "grad_norm": 1.4593041863770255, + "grad_norm": 1.4121169841401504, "learning_rate": 7.964941640444015e-06, - "loss": 0.241, + "loss": 0.2404, "step": 2178 }, { "epoch": 0.2980848153214774, - "grad_norm": 1.2936479500855145, + "grad_norm": 1.3428486849885954, "learning_rate": 7.963211103256363e-06, - "loss": 0.1691, + "loss": 0.1664, "step": 2179 }, { "epoch": 0.2982216142270862, - "grad_norm": 1.3497256230745542, + "grad_norm": 1.3423476592551007, "learning_rate": 7.961480018766029e-06, - "loss": 0.1993, + "loss": 0.2015, "step": 2180 }, { "epoch": 0.29835841313269496, - "grad_norm": 1.482778055166626, + "grad_norm": 1.5004262221496711, "learning_rate": 7.959748387292741e-06, - "loss": 0.2533, + "loss": 0.2544, "step": 2181 }, { "epoch": 0.2984952120383037, - "grad_norm": 1.3957629913157816, + "grad_norm": 1.3619040325634804, "learning_rate": 7.958016209156332e-06, - "loss": 0.2283, + "loss": 0.2285, "step": 2182 }, { "epoch": 0.29863201094391245, - "grad_norm": 1.482597083233761, + "grad_norm": 1.423653137437064, "learning_rate": 7.956283484676732e-06, - "loss": 0.2064, + "loss": 0.2038, "step": 2183 }, { "epoch": 0.2987688098495212, - "grad_norm": 1.395236678177789, + "grad_norm": 1.1562404831002062, "learning_rate": 7.954550214173976e-06, - "loss": 0.2171, + "loss": 0.2177, "step": 2184 }, { "epoch": 0.29890560875512995, - "grad_norm": 1.528845336071817, + "grad_norm": 1.5501928028608105, "learning_rate": 7.952816397968195e-06, - "loss": 0.2417, + "loss": 0.246, "step": 2185 }, { "epoch": 0.2990424076607387, - "grad_norm": 1.6292342851233894, + "grad_norm": 1.6229937744933993, "learning_rate": 7.951082036379625e-06, - "loss": 0.2339, + "loss": 0.2355, "step": 2186 }, { "epoch": 0.29917920656634744, - "grad_norm": 1.311805582388438, + "grad_norm": 1.2835226813716591, "learning_rate": 7.949347129728602e-06, - "loss": 0.2145, + "loss": 0.215, "step": 2187 }, { "epoch": 0.29931600547195625, - "grad_norm": 1.2730216956924805, + "grad_norm": 1.2299131727975117, "learning_rate": 7.947611678335558e-06, - "loss": 0.1856, + "loss": 0.187, "step": 2188 }, { "epoch": 0.299452804377565, - "grad_norm": 1.3375676663612541, + "grad_norm": 1.3177652311749013, "learning_rate": 7.945875682521034e-06, - "loss": 0.232, + "loss": 0.2266, "step": 2189 }, { "epoch": 0.29958960328317374, - "grad_norm": 1.3003261040031069, + "grad_norm": 1.2696179099397549, "learning_rate": 7.944139142605665e-06, - "loss": 0.1904, + "loss": 0.1874, "step": 2190 }, { "epoch": 0.2997264021887825, - "grad_norm": 1.4211504249460614, + "grad_norm": 1.4225991637680846, "learning_rate": 7.942402058910189e-06, - "loss": 0.1954, + "loss": 0.1982, "step": 2191 }, { "epoch": 0.29986320109439124, - "grad_norm": 1.472806353451872, + "grad_norm": 2.824470822613693, "learning_rate": 7.940664431755443e-06, - "loss": 0.2092, + "loss": 0.2096, "step": 2192 }, { "epoch": 0.3, - "grad_norm": 1.6411848393068487, + "grad_norm": 1.6451318325184285, "learning_rate": 7.938926261462366e-06, - "loss": 0.2465, + "loss": 0.2479, "step": 2193 }, { "epoch": 0.30013679890560874, - "grad_norm": 1.5218529947816277, + "grad_norm": 1.5010624219803463, "learning_rate": 7.937187548351997e-06, - "loss": 0.2094, + "loss": 0.2105, "step": 2194 }, { "epoch": 0.3002735978112175, - "grad_norm": 0.9618206578886191, + "grad_norm": 0.9582774077406268, "learning_rate": 7.935448292745476e-06, - "loss": 0.1673, + "loss": 0.1678, "step": 2195 }, { "epoch": 0.3004103967168263, - "grad_norm": 1.5317335333202582, + "grad_norm": 1.5143472014331678, "learning_rate": 7.933708494964038e-06, - "loss": 0.2228, + "loss": 0.2265, "step": 2196 }, { "epoch": 0.30054719562243504, - "grad_norm": 1.3395841676845013, + "grad_norm": 1.3465549520955615, "learning_rate": 7.931968155329024e-06, - "loss": 0.2186, + "loss": 0.2192, "step": 2197 }, { "epoch": 0.3006839945280438, - "grad_norm": 1.1981894037251295, + "grad_norm": 1.1780598377740312, "learning_rate": 7.930227274161878e-06, - "loss": 0.2032, + "loss": 0.2001, "step": 2198 }, { "epoch": 0.30082079343365253, - "grad_norm": 1.2456002413174372, + "grad_norm": 1.1840800229073518, "learning_rate": 7.928485851784134e-06, - "loss": 0.1974, + "loss": 0.1968, "step": 2199 }, { "epoch": 0.3009575923392613, - "grad_norm": 1.6219524300142754, + "grad_norm": 1.5896035051342277, "learning_rate": 7.92674388851743e-06, - "loss": 0.2205, + "loss": 0.2174, "step": 2200 }, { "epoch": 0.3009575923392613, - "eval_loss": 0.2029787003993988, - "eval_runtime": 5.9317, - "eval_samples_per_second": 5.058, - "eval_steps_per_second": 1.349, + "eval_loss": 0.2038840353488922, + "eval_runtime": 5.9433, + "eval_samples_per_second": 5.048, + "eval_steps_per_second": 1.346, "step": 2200 }, { "epoch": 0.30109439124487003, - "grad_norm": 1.2470060161639307, + "grad_norm": 1.189791200291389, "learning_rate": 7.92500138468351e-06, - "loss": 0.1969, + "loss": 0.1965, "step": 2201 }, { "epoch": 0.3012311901504788, - "grad_norm": 1.3644573022696402, + "grad_norm": 1.3359921859331603, "learning_rate": 7.923258340604212e-06, - "loss": 0.1937, + "loss": 0.1914, "step": 2202 }, { "epoch": 0.3013679890560875, - "grad_norm": 1.5227983159881457, + "grad_norm": 1.4926254398668282, "learning_rate": 7.921514756601474e-06, - "loss": 0.2266, + "loss": 0.2202, "step": 2203 }, { "epoch": 0.30150478796169633, - "grad_norm": 1.4288967459992845, + "grad_norm": 1.4200300976065583, "learning_rate": 7.919770632997331e-06, - "loss": 0.1952, + "loss": 0.1944, "step": 2204 }, { "epoch": 0.3016415868673051, - "grad_norm": 1.3092078721184024, + "grad_norm": 1.2782761612184885, "learning_rate": 7.918025970113929e-06, - "loss": 0.1804, + "loss": 0.1791, "step": 2205 }, { "epoch": 0.3017783857729138, - "grad_norm": 1.182229850642715, + "grad_norm": 1.1944853596933636, "learning_rate": 7.916280768273499e-06, - "loss": 0.1627, + "loss": 0.1647, "step": 2206 }, { "epoch": 0.3019151846785226, - "grad_norm": 1.3187326970313407, + "grad_norm": 1.2989053641923023, "learning_rate": 7.914535027798382e-06, - "loss": 0.1971, + "loss": 0.1973, "step": 2207 }, { "epoch": 0.3020519835841313, - "grad_norm": 1.6220549754164968, + "grad_norm": 1.6131999488501059, "learning_rate": 7.912788749011014e-06, - "loss": 0.213, + "loss": 0.2106, "step": 2208 }, { "epoch": 0.30218878248974007, - "grad_norm": 1.3337269957915758, + "grad_norm": 1.3322178710397872, "learning_rate": 7.91104193223393e-06, - "loss": 0.2184, + "loss": 0.2185, "step": 2209 }, { "epoch": 0.3023255813953488, - "grad_norm": 1.1900098138398747, + "grad_norm": 1.2109142763496972, "learning_rate": 7.909294577789765e-06, - "loss": 0.1834, + "loss": 0.1922, "step": 2210 }, { "epoch": 0.30246238030095757, - "grad_norm": 1.4397207229640065, + "grad_norm": 1.441214440275024, "learning_rate": 7.907546686001258e-06, - "loss": 0.2402, + "loss": 0.2435, "step": 2211 }, { "epoch": 0.30259917920656637, - "grad_norm": 1.320007364910871, + "grad_norm": 1.2969563336509669, "learning_rate": 7.905798257191237e-06, - "loss": 0.217, + "loss": 0.2147, "step": 2212 }, { "epoch": 0.3027359781121751, - "grad_norm": 1.1891780573847022, + "grad_norm": 1.1763880589625135, "learning_rate": 7.904049291682643e-06, - "loss": 0.1699, + "loss": 0.1681, "step": 2213 }, { "epoch": 0.30287277701778387, - "grad_norm": 1.3273898384639924, + "grad_norm": 1.3012665794543754, "learning_rate": 7.902299789798504e-06, - "loss": 0.1875, + "loss": 0.1865, "step": 2214 }, { "epoch": 0.3030095759233926, - "grad_norm": 1.2245002246118857, + "grad_norm": 1.2167013517156138, "learning_rate": 7.90054975186195e-06, - "loss": 0.1967, + "loss": 0.1959, "step": 2215 }, { "epoch": 0.30314637482900136, - "grad_norm": 1.3368112335564395, + "grad_norm": 1.317207279319323, "learning_rate": 7.898799178196217e-06, - "loss": 0.1893, + "loss": 0.1883, "step": 2216 }, { "epoch": 0.3032831737346101, - "grad_norm": 1.5634687599888273, + "grad_norm": 1.5372565363605728, "learning_rate": 7.897048069124628e-06, - "loss": 0.2335, + "loss": 0.2329, "step": 2217 }, { "epoch": 0.30341997264021886, - "grad_norm": 1.0922698679890501, + "grad_norm": 1.1031935610413448, "learning_rate": 7.895296424970617e-06, - "loss": 0.1687, + "loss": 0.1678, "step": 2218 }, { "epoch": 0.3035567715458276, - "grad_norm": 1.3963531848598136, + "grad_norm": 1.3851652141405983, "learning_rate": 7.893544246057708e-06, "loss": 0.2224, "step": 2219 }, { "epoch": 0.3036935704514364, - "grad_norm": 1.3459871447722738, + "grad_norm": 1.3362235127747053, "learning_rate": 7.891791532709527e-06, - "loss": 0.218, + "loss": 0.217, "step": 2220 }, { "epoch": 0.30383036935704516, - "grad_norm": 1.2729925044587203, + "grad_norm": 1.2592385573479619, "learning_rate": 7.890038285249802e-06, - "loss": 0.161, + "loss": 0.1595, "step": 2221 }, { "epoch": 0.3039671682626539, - "grad_norm": 1.0786426248992564, + "grad_norm": 1.0704198772068267, "learning_rate": 7.888284504002352e-06, - "loss": 0.1792, + "loss": 0.1774, "step": 2222 }, { "epoch": 0.30410396716826266, - "grad_norm": 1.2047837425093881, + "grad_norm": 1.1801529884854471, "learning_rate": 7.886530189291103e-06, - "loss": 0.2241, + "loss": 0.2239, "step": 2223 }, { "epoch": 0.3042407660738714, - "grad_norm": 1.2853194520656641, + "grad_norm": 1.2487098611217762, "learning_rate": 7.884775341440071e-06, - "loss": 0.1986, + "loss": 0.198, "step": 2224 }, { "epoch": 0.30437756497948015, - "grad_norm": 1.142983371259395, + "grad_norm": 1.1479621110749054, "learning_rate": 7.883019960773381e-06, - "loss": 0.1642, + "loss": 0.1651, "step": 2225 }, { "epoch": 0.3045143638850889, - "grad_norm": 1.3972675205607066, + "grad_norm": 1.3728244604877897, "learning_rate": 7.881264047615244e-06, - "loss": 0.2246, + "loss": 0.2237, "step": 2226 }, { "epoch": 0.30465116279069765, - "grad_norm": 1.2444423333003667, + "grad_norm": 1.2256298302765825, "learning_rate": 7.879507602289979e-06, - "loss": 0.1982, + "loss": 0.1981, "step": 2227 }, { "epoch": 0.30478796169630645, - "grad_norm": 1.241059835047247, + "grad_norm": 1.2083536054211175, "learning_rate": 7.877750625122e-06, - "loss": 0.2029, + "loss": 0.2017, "step": 2228 }, { "epoch": 0.3049247606019152, - "grad_norm": 1.2646650859212827, + "grad_norm": 1.2641506885786145, "learning_rate": 7.87599311643582e-06, - "loss": 0.1576, + "loss": 0.1619, "step": 2229 }, { "epoch": 0.30506155950752395, - "grad_norm": 1.1040426261822651, + "grad_norm": 1.0912990998923886, "learning_rate": 7.874235076556046e-06, - "loss": 0.18, + "loss": 0.1778, "step": 2230 }, { "epoch": 0.3051983584131327, - "grad_norm": 1.3623268275613882, + "grad_norm": 1.340633919491587, "learning_rate": 7.872476505807392e-06, - "loss": 0.1967, + "loss": 0.1954, "step": 2231 }, { "epoch": 0.30533515731874145, - "grad_norm": 1.4841215401975836, + "grad_norm": 1.4573302152364351, "learning_rate": 7.87071740451466e-06, - "loss": 0.2053, + "loss": 0.203, "step": 2232 }, { "epoch": 0.3054719562243502, - "grad_norm": 1.266173098241158, + "grad_norm": 1.238795196522889, "learning_rate": 7.868957773002757e-06, - "loss": 0.202, + "loss": 0.2025, "step": 2233 }, { "epoch": 0.30560875512995894, - "grad_norm": 1.477862842138608, + "grad_norm": 1.4370376301360672, "learning_rate": 7.867197611596684e-06, - "loss": 0.2069, + "loss": 0.205, "step": 2234 }, { "epoch": 0.3057455540355677, - "grad_norm": 1.5280298143777273, + "grad_norm": 1.4792503960837038, "learning_rate": 7.865436920621541e-06, - "loss": 0.2474, + "loss": 0.2432, "step": 2235 }, { "epoch": 0.3058823529411765, - "grad_norm": 1.370898405413453, + "grad_norm": 1.42611475206145, "learning_rate": 7.863675700402527e-06, - "loss": 0.2256, + "loss": 0.2279, "step": 2236 }, { "epoch": 0.30601915184678524, - "grad_norm": 1.3382147474495676, + "grad_norm": 1.3533038247820381, "learning_rate": 7.86191395126494e-06, - "loss": 0.2376, + "loss": 0.2377, "step": 2237 }, { "epoch": 0.306155950752394, - "grad_norm": 1.417306214269472, + "grad_norm": 1.406422073784709, "learning_rate": 7.86015167353417e-06, - "loss": 0.2201, + "loss": 0.219, "step": 2238 }, { "epoch": 0.30629274965800274, - "grad_norm": 1.3661711465179278, + "grad_norm": 1.352035237384152, "learning_rate": 7.85838886753571e-06, - "loss": 0.224, + "loss": 0.2231, "step": 2239 }, { "epoch": 0.3064295485636115, - "grad_norm": 1.3068071230021747, + "grad_norm": 1.2601509525636605, "learning_rate": 7.85662553359515e-06, - "loss": 0.2107, + "loss": 0.2077, "step": 2240 }, { "epoch": 0.30656634746922024, - "grad_norm": 1.481385439501141, + "grad_norm": 1.4503853960397155, "learning_rate": 7.854861672038173e-06, - "loss": 0.2542, + "loss": 0.2506, "step": 2241 }, { "epoch": 0.306703146374829, - "grad_norm": 1.4423749529445897, + "grad_norm": 1.4232631625068988, "learning_rate": 7.853097283190568e-06, - "loss": 0.2553, + "loss": 0.2535, "step": 2242 }, { "epoch": 0.30683994528043773, - "grad_norm": 1.2276381785997208, + "grad_norm": 1.2206044951855572, "learning_rate": 7.851332367378211e-06, - "loss": 0.1685, + "loss": 0.1695, "step": 2243 }, { "epoch": 0.30697674418604654, - "grad_norm": 1.361859944503387, + "grad_norm": 1.346241458544996, "learning_rate": 7.849566924927082e-06, - "loss": 0.2201, + "loss": 0.2193, "step": 2244 }, { "epoch": 0.3071135430916553, - "grad_norm": 1.416737844945469, + "grad_norm": 1.4748052454499418, "learning_rate": 7.847800956163259e-06, - "loss": 0.2248, + "loss": 0.2298, "step": 2245 }, { "epoch": 0.30725034199726403, - "grad_norm": 1.0962666751843142, + "grad_norm": 1.0947285200403605, "learning_rate": 7.846034461412913e-06, - "loss": 0.1665, + "loss": 0.1688, "step": 2246 }, { "epoch": 0.3073871409028728, - "grad_norm": 1.4380054679560674, + "grad_norm": 1.4561551505381605, "learning_rate": 7.844267441002315e-06, - "loss": 0.2216, + "loss": 0.2243, "step": 2247 }, { "epoch": 0.30752393980848153, - "grad_norm": 1.4695168038085147, + "grad_norm": 1.3079780392599707, "learning_rate": 7.842499895257832e-06, - "loss": 0.19, + "loss": 0.1907, "step": 2248 }, { "epoch": 0.3076607387140903, - "grad_norm": 1.1864284097564315, + "grad_norm": 1.1986461837292772, "learning_rate": 7.840731824505927e-06, - "loss": 0.1545, + "loss": 0.155, "step": 2249 }, { "epoch": 0.307797537619699, - "grad_norm": 1.53999677926837, + "grad_norm": 1.5566180488986197, "learning_rate": 7.838963229073163e-06, - "loss": 0.219, + "loss": 0.2202, "step": 2250 }, { "epoch": 0.3079343365253078, - "grad_norm": 1.3651251095699752, + "grad_norm": 1.3552106823822754, "learning_rate": 7.837194109286197e-06, - "loss": 0.1985, + "loss": 0.1971, "step": 2251 }, { "epoch": 0.3080711354309166, - "grad_norm": 1.298446967001383, + "grad_norm": 1.2834250462666708, "learning_rate": 7.835424465471784e-06, - "loss": 0.2014, + "loss": 0.1986, "step": 2252 }, { "epoch": 0.3082079343365253, - "grad_norm": 1.0958759796203899, + "grad_norm": 1.101076311214104, "learning_rate": 7.833654297956777e-06, - "loss": 0.1922, + "loss": 0.1927, "step": 2253 }, { "epoch": 0.3083447332421341, - "grad_norm": 1.3512291248286552, + "grad_norm": 1.323485267870425, "learning_rate": 7.831883607068124e-06, - "loss": 0.2042, + "loss": 0.2033, "step": 2254 }, { "epoch": 0.3084815321477428, - "grad_norm": 1.2990272873887416, + "grad_norm": 1.275123398321994, "learning_rate": 7.83011239313287e-06, - "loss": 0.1985, + "loss": 0.1984, "step": 2255 }, { "epoch": 0.30861833105335157, - "grad_norm": 1.213135303685564, + "grad_norm": 1.1836540332092864, "learning_rate": 7.828340656478158e-06, - "loss": 0.2193, + "loss": 0.2153, "step": 2256 }, { "epoch": 0.3087551299589603, - "grad_norm": 1.225237184105807, + "grad_norm": 1.2350605651316755, "learning_rate": 7.826568397431223e-06, - "loss": 0.2078, + "loss": 0.208, "step": 2257 }, { "epoch": 0.30889192886456907, - "grad_norm": 1.4878393091946691, + "grad_norm": 1.4673205565868526, "learning_rate": 7.824795616319402e-06, - "loss": 0.2305, + "loss": 0.2282, "step": 2258 }, { "epoch": 0.3090287277701778, - "grad_norm": 1.6983848325646562, + "grad_norm": 1.715495177133437, "learning_rate": 7.823022313470126e-06, - "loss": 0.2264, + "loss": 0.2273, "step": 2259 }, { "epoch": 0.3091655266757866, - "grad_norm": 1.1686121758535937, + "grad_norm": 1.1633977200256287, "learning_rate": 7.821248489210921e-06, - "loss": 0.1935, + "loss": 0.1909, "step": 2260 }, { "epoch": 0.30930232558139537, - "grad_norm": 1.0769414065013765, + "grad_norm": 1.1002582843986095, "learning_rate": 7.819474143869414e-06, - "loss": 0.1779, + "loss": 0.1804, "step": 2261 }, { "epoch": 0.3094391244870041, - "grad_norm": 1.4088174121864412, + "grad_norm": 1.4126518530269774, "learning_rate": 7.817699277773325e-06, - "loss": 0.2565, + "loss": 0.2573, "step": 2262 }, { "epoch": 0.30957592339261286, - "grad_norm": 1.7357885281555347, + "grad_norm": 1.7675158656618863, "learning_rate": 7.815923891250468e-06, - "loss": 0.2581, + "loss": 0.2582, "step": 2263 }, { "epoch": 0.3097127222982216, - "grad_norm": 1.5236559856961323, + "grad_norm": 1.4907753552603313, "learning_rate": 7.814147984628757e-06, - "loss": 0.2327, + "loss": 0.2315, "step": 2264 }, { "epoch": 0.30984952120383036, - "grad_norm": 1.4046320707453115, + "grad_norm": 1.3915023759695861, "learning_rate": 7.812371558236199e-06, - "loss": 0.229, + "loss": 0.2301, "step": 2265 }, { "epoch": 0.3099863201094391, - "grad_norm": 1.2282758513849559, + "grad_norm": 1.239274410502657, "learning_rate": 7.810594612400899e-06, - "loss": 0.1884, + "loss": 0.1911, "step": 2266 }, { "epoch": 0.31012311901504785, - "grad_norm": 1.2964936460730931, + "grad_norm": 1.2841243417109325, "learning_rate": 7.80881714745106e-06, - "loss": 0.2053, + "loss": 0.2045, "step": 2267 }, { "epoch": 0.31025991792065666, - "grad_norm": 1.287200864847025, + "grad_norm": 1.2739012866718709, "learning_rate": 7.807039163714974e-06, - "loss": 0.1926, + "loss": 0.1916, "step": 2268 }, { "epoch": 0.3103967168262654, - "grad_norm": 1.2656899414125264, + "grad_norm": 1.2720300432797587, "learning_rate": 7.805260661521036e-06, - "loss": 0.204, + "loss": 0.2049, "step": 2269 }, { "epoch": 0.31053351573187415, - "grad_norm": 1.459288825888846, + "grad_norm": 1.4640099961339685, "learning_rate": 7.803481641197734e-06, - "loss": 0.2058, + "loss": 0.2075, "step": 2270 }, { "epoch": 0.3106703146374829, - "grad_norm": 1.327995300882914, + "grad_norm": 1.3265814078688904, "learning_rate": 7.801702103073651e-06, - "loss": 0.2212, + "loss": 0.2199, "step": 2271 }, { "epoch": 0.31080711354309165, - "grad_norm": 1.2657848022279194, + "grad_norm": 1.2697142294650616, "learning_rate": 7.799922047477465e-06, "loss": 0.1666, "step": 2272 }, { "epoch": 0.3109439124487004, - "grad_norm": 1.3267135196265727, + "grad_norm": 1.283033365796261, "learning_rate": 7.798141474737953e-06, - "loss": 0.177, + "loss": 0.1759, "step": 2273 }, { "epoch": 0.31108071135430915, - "grad_norm": 1.1965123415762604, + "grad_norm": 1.1653607954834861, "learning_rate": 7.796360385183983e-06, - "loss": 0.2188, + "loss": 0.2152, "step": 2274 }, { "epoch": 0.3112175102599179, - "grad_norm": 1.2531685127538459, + "grad_norm": 1.2342897326452207, "learning_rate": 7.794578779144523e-06, - "loss": 0.2336, + "loss": 0.2317, "step": 2275 }, { "epoch": 0.3113543091655267, - "grad_norm": 1.3001371976675609, + "grad_norm": 1.2921374030801658, "learning_rate": 7.792796656948634e-06, - "loss": 0.1829, + "loss": 0.1822, "step": 2276 }, { "epoch": 0.31149110807113545, - "grad_norm": 1.3046102612560397, + "grad_norm": 1.3289578422963022, "learning_rate": 7.791014018925471e-06, - "loss": 0.204, + "loss": 0.2055, "step": 2277 }, { "epoch": 0.3116279069767442, - "grad_norm": 0.8863231716886043, + "grad_norm": 0.8656391080554537, "learning_rate": 7.789230865404287e-06, - "loss": 0.1413, + "loss": 0.1403, "step": 2278 }, { "epoch": 0.31176470588235294, - "grad_norm": 1.4240443249635664, + "grad_norm": 1.4267235811042551, "learning_rate": 7.787447196714428e-06, - "loss": 0.2251, + "loss": 0.2235, "step": 2279 }, { "epoch": 0.3119015047879617, - "grad_norm": 1.0574888027795801, + "grad_norm": 1.054954879964646, "learning_rate": 7.785663013185338e-06, "loss": 0.1683, "step": 2280 }, { "epoch": 0.31203830369357044, - "grad_norm": 1.1441526024894342, + "grad_norm": 1.1380787913819255, "learning_rate": 7.783878315146551e-06, - "loss": 0.1935, + "loss": 0.1931, "step": 2281 }, { "epoch": 0.3121751025991792, - "grad_norm": 1.2502542709427875, + "grad_norm": 1.2140244081291391, "learning_rate": 7.782093102927704e-06, - "loss": 0.1753, + "loss": 0.1731, "step": 2282 }, { "epoch": 0.31231190150478794, - "grad_norm": 1.5457025180191362, + "grad_norm": 1.4936825345597842, "learning_rate": 7.78030737685852e-06, - "loss": 0.2267, + "loss": 0.2257, "step": 2283 }, { "epoch": 0.31244870041039674, - "grad_norm": 1.0919732342900323, + "grad_norm": 1.0954067379015242, "learning_rate": 7.778521137268822e-06, - "loss": 0.1502, + "loss": 0.1498, "step": 2284 }, { "epoch": 0.3125854993160055, - "grad_norm": 1.1748401666328732, + "grad_norm": 1.146184568343839, "learning_rate": 7.776734384488529e-06, - "loss": 0.1878, + "loss": 0.1874, "step": 2285 }, { "epoch": 0.31272229822161424, - "grad_norm": 1.2549394905351068, + "grad_norm": 1.246366372861948, "learning_rate": 7.774947118847651e-06, - "loss": 0.2215, + "loss": 0.2233, "step": 2286 }, { "epoch": 0.312859097127223, - "grad_norm": 0.9844325757701266, + "grad_norm": 0.9886286458057192, "learning_rate": 7.773159340676296e-06, - "loss": 0.142, + "loss": 0.1421, "step": 2287 }, { "epoch": 0.31299589603283173, - "grad_norm": 1.332784500799167, + "grad_norm": 1.3542658971681856, "learning_rate": 7.771371050304662e-06, - "loss": 0.1958, + "loss": 0.1972, "step": 2288 }, { "epoch": 0.3131326949384405, - "grad_norm": 0.9930429358541218, + "grad_norm": 0.9699669392205592, "learning_rate": 7.769582248063048e-06, - "loss": 0.1704, + "loss": 0.1695, "step": 2289 }, { "epoch": 0.31326949384404923, - "grad_norm": 1.4741937678392827, + "grad_norm": 1.4490587242728168, "learning_rate": 7.767792934281844e-06, - "loss": 0.2007, + "loss": 0.2021, "step": 2290 }, { "epoch": 0.313406292749658, - "grad_norm": 1.288873577661839, + "grad_norm": 1.2672978423443846, "learning_rate": 7.766003109291533e-06, - "loss": 0.1938, + "loss": 0.1904, "step": 2291 }, { "epoch": 0.3135430916552668, - "grad_norm": 1.425833773271014, + "grad_norm": 1.418936705780312, "learning_rate": 7.764212773422695e-06, - "loss": 0.1971, + "loss": 0.1998, "step": 2292 }, { "epoch": 0.31367989056087553, - "grad_norm": 1.366410932000871, + "grad_norm": 1.333219590431544, "learning_rate": 7.762421927006005e-06, - "loss": 0.1994, + "loss": 0.196, "step": 2293 }, { "epoch": 0.3138166894664843, - "grad_norm": 1.468784866333206, + "grad_norm": 1.471356098502254, "learning_rate": 7.760630570372229e-06, - "loss": 0.2106, + "loss": 0.2093, "step": 2294 }, { "epoch": 0.313953488372093, - "grad_norm": 1.3772689463303147, + "grad_norm": 1.3801332710027796, "learning_rate": 7.75883870385223e-06, - "loss": 0.1903, + "loss": 0.1899, "step": 2295 }, { "epoch": 0.3140902872777018, - "grad_norm": 1.2111993143673463, + "grad_norm": 1.2062145303018752, "learning_rate": 7.757046327776964e-06, - "loss": 0.1991, + "loss": 0.1992, "step": 2296 }, { "epoch": 0.3142270861833105, - "grad_norm": 1.2853653149974307, + "grad_norm": 1.2571609291573866, "learning_rate": 7.755253442477482e-06, - "loss": 0.2033, + "loss": 0.1991, "step": 2297 }, { "epoch": 0.31436388508891927, - "grad_norm": 1.1275553854703892, + "grad_norm": 1.1223259716625962, "learning_rate": 7.753460048284929e-06, - "loss": 0.1881, + "loss": 0.1886, "step": 2298 }, { "epoch": 0.314500683994528, - "grad_norm": 1.3748585472758998, + "grad_norm": 1.3461236360159776, "learning_rate": 7.751666145530541e-06, - "loss": 0.22, + "loss": 0.2181, "step": 2299 }, { "epoch": 0.3146374829001368, - "grad_norm": 1.349965756978964, + "grad_norm": 1.2900102531202788, "learning_rate": 7.749871734545651e-06, - "loss": 0.2279, + "loss": 0.2266, "step": 2300 }, { "epoch": 0.3146374829001368, - "eval_loss": 0.20239242911338806, - "eval_runtime": 5.9239, - "eval_samples_per_second": 5.064, + "eval_loss": 0.20296329259872437, + "eval_runtime": 5.9256, + "eval_samples_per_second": 5.063, "eval_steps_per_second": 1.35, "step": 2300 }, { "epoch": 0.31477428180574557, - "grad_norm": 1.221107968221609, + "grad_norm": 1.2271490517955168, "learning_rate": 7.748076815661688e-06, - "loss": 0.1708, + "loss": 0.1688, "step": 2301 }, { "epoch": 0.3149110807113543, - "grad_norm": 1.4798141325284095, + "grad_norm": 1.456124810933472, "learning_rate": 7.746281389210171e-06, - "loss": 0.25, + "loss": 0.2495, "step": 2302 }, { "epoch": 0.31504787961696307, - "grad_norm": 1.4522620176537508, + "grad_norm": 1.4544222738012875, "learning_rate": 7.74448545552271e-06, - "loss": 0.2393, + "loss": 0.2409, "step": 2303 }, { "epoch": 0.3151846785225718, - "grad_norm": 1.2151778061812137, + "grad_norm": 1.2238638859466455, "learning_rate": 7.742689014931017e-06, - "loss": 0.1737, + "loss": 0.1746, "step": 2304 }, { "epoch": 0.31532147742818056, - "grad_norm": 1.5050313311352208, + "grad_norm": 1.5085649843310913, "learning_rate": 7.740892067766892e-06, - "loss": 0.2376, + "loss": 0.2382, "step": 2305 }, { "epoch": 0.3154582763337893, - "grad_norm": 1.2082110378125046, + "grad_norm": 1.1920730485234983, "learning_rate": 7.73909461436223e-06, - "loss": 0.199, + "loss": 0.2016, "step": 2306 }, { "epoch": 0.31559507523939806, - "grad_norm": 1.438600397487521, + "grad_norm": 1.430940808714945, "learning_rate": 7.737296655049017e-06, - "loss": 0.2417, + "loss": 0.2434, "step": 2307 }, { "epoch": 0.31573187414500686, - "grad_norm": 1.2874420845374044, + "grad_norm": 1.2724305963779738, "learning_rate": 7.735498190159336e-06, - "loss": 0.2218, + "loss": 0.2216, "step": 2308 }, { "epoch": 0.3158686730506156, - "grad_norm": 1.2759876495139635, + "grad_norm": 1.2520837759197465, "learning_rate": 7.733699220025362e-06, - "loss": 0.1987, + "loss": 0.1958, "step": 2309 }, { "epoch": 0.31600547195622436, - "grad_norm": 1.4790658263390473, + "grad_norm": 1.461319868104535, "learning_rate": 7.731899744979364e-06, - "loss": 0.2508, + "loss": 0.2526, "step": 2310 }, { "epoch": 0.3161422708618331, - "grad_norm": 1.235994819992297, + "grad_norm": 1.244180112237757, "learning_rate": 7.730099765353702e-06, - "loss": 0.1965, + "loss": 0.1997, "step": 2311 }, { "epoch": 0.31627906976744186, - "grad_norm": 1.2957737059459304, + "grad_norm": 1.262968882076043, "learning_rate": 7.728299281480833e-06, - "loss": 0.1817, + "loss": 0.1823, "step": 2312 }, { "epoch": 0.3164158686730506, - "grad_norm": 1.555231452448495, + "grad_norm": 1.5369463498297764, "learning_rate": 7.726498293693303e-06, - "loss": 0.2346, + "loss": 0.2353, "step": 2313 }, { "epoch": 0.31655266757865935, - "grad_norm": 1.3250753551258045, + "grad_norm": 1.3007496959034248, "learning_rate": 7.724696802323755e-06, - "loss": 0.19, + "loss": 0.1908, "step": 2314 }, { "epoch": 0.3166894664842681, - "grad_norm": 1.5715275758698433, + "grad_norm": 1.561215899851659, "learning_rate": 7.72289480770492e-06, - "loss": 0.2193, + "loss": 0.2163, "step": 2315 }, { "epoch": 0.3168262653898769, - "grad_norm": 1.395229629297669, + "grad_norm": 1.3904544758705593, "learning_rate": 7.721092310169626e-06, - "loss": 0.2179, + "loss": 0.2186, "step": 2316 }, { "epoch": 0.31696306429548565, - "grad_norm": 1.247903786757419, + "grad_norm": 1.226948541868683, "learning_rate": 7.719289310050795e-06, - "loss": 0.1693, + "loss": 0.169, "step": 2317 }, { "epoch": 0.3170998632010944, - "grad_norm": 1.0912105134804417, + "grad_norm": 1.1033300894543867, "learning_rate": 7.717485807681437e-06, - "loss": 0.1775, + "loss": 0.1785, "step": 2318 }, { "epoch": 0.31723666210670315, - "grad_norm": 1.4411578448531024, + "grad_norm": 1.4399559243606321, "learning_rate": 7.71568180339466e-06, - "loss": 0.2456, + "loss": 0.2451, "step": 2319 }, { "epoch": 0.3173734610123119, - "grad_norm": 1.2392500007535006, + "grad_norm": 1.2525468932340877, "learning_rate": 7.71387729752366e-06, - "loss": 0.1891, + "loss": 0.1895, "step": 2320 }, { "epoch": 0.31751025991792065, - "grad_norm": 1.7513969811255294, + "grad_norm": 1.7198388738192831, "learning_rate": 7.712072290401728e-06, - "loss": 0.2715, + "loss": 0.269, "step": 2321 }, { "epoch": 0.3176470588235294, - "grad_norm": 1.3903703318781948, + "grad_norm": 1.3928214928077136, "learning_rate": 7.710266782362248e-06, - "loss": 0.2226, + "loss": 0.2219, "step": 2322 }, { "epoch": 0.31778385772913814, - "grad_norm": 1.7683183324101317, + "grad_norm": 1.6528402764352919, "learning_rate": 7.708460773738695e-06, - "loss": 0.2934, + "loss": 0.2863, "step": 2323 }, { "epoch": 0.31792065663474695, - "grad_norm": 1.5936156818566118, + "grad_norm": 1.5772813209359622, "learning_rate": 7.706654264864637e-06, - "loss": 0.2224, + "loss": 0.2217, "step": 2324 }, { "epoch": 0.3180574555403557, - "grad_norm": 1.2154551378556482, + "grad_norm": 1.186935642742246, "learning_rate": 7.704847256073738e-06, - "loss": 0.2115, + "loss": 0.2108, "step": 2325 }, { "epoch": 0.31819425444596444, - "grad_norm": 1.2233152543370855, + "grad_norm": 1.168809786639704, "learning_rate": 7.703039747699748e-06, - "loss": 0.1957, + "loss": 0.1933, "step": 2326 }, { "epoch": 0.3183310533515732, - "grad_norm": 1.424601814902084, + "grad_norm": 1.4312475219002143, "learning_rate": 7.701231740076511e-06, - "loss": 0.2038, + "loss": 0.2021, "step": 2327 }, { "epoch": 0.31846785225718194, - "grad_norm": 1.4469098486230891, + "grad_norm": 1.4524263207281418, "learning_rate": 7.699423233537969e-06, - "loss": 0.2062, + "loss": 0.2065, "step": 2328 }, { "epoch": 0.3186046511627907, - "grad_norm": 1.3572213558173558, + "grad_norm": 1.3572375463071016, "learning_rate": 7.697614228418149e-06, - "loss": 0.1975, + "loss": 0.1981, "step": 2329 }, { "epoch": 0.31874145006839943, - "grad_norm": 1.390562225548715, + "grad_norm": 1.3841748528069922, "learning_rate": 7.695804725051173e-06, - "loss": 0.23, + "loss": 0.2318, "step": 2330 }, { "epoch": 0.3188782489740082, - "grad_norm": 1.2369024120032077, + "grad_norm": 1.2139282660532662, "learning_rate": 7.693994723771254e-06, - "loss": 0.1977, + "loss": 0.1965, "step": 2331 }, { "epoch": 0.319015047879617, - "grad_norm": 1.1931102167520742, + "grad_norm": 1.1809020240675956, "learning_rate": 7.692184224912698e-06, - "loss": 0.1663, + "loss": 0.1658, "step": 2332 }, { "epoch": 0.31915184678522573, - "grad_norm": 1.3264854041269976, + "grad_norm": 1.3241299213701156, "learning_rate": 7.690373228809904e-06, - "loss": 0.2262, + "loss": 0.2253, "step": 2333 }, { "epoch": 0.3192886456908345, - "grad_norm": 1.3150448053809345, + "grad_norm": 1.3108830321187857, "learning_rate": 7.688561735797361e-06, - "loss": 0.2182, + "loss": 0.2189, "step": 2334 }, { "epoch": 0.31942544459644323, - "grad_norm": 1.6213595159952712, + "grad_norm": 1.6121851241030694, "learning_rate": 7.68674974620965e-06, - "loss": 0.2605, + "loss": 0.2619, "step": 2335 }, { "epoch": 0.319562243502052, - "grad_norm": 1.5742532578964412, + "grad_norm": 1.58470768141119, "learning_rate": 7.684937260381444e-06, - "loss": 0.252, + "loss": 0.2533, "step": 2336 }, { "epoch": 0.3196990424076607, - "grad_norm": 1.1571830307688409, + "grad_norm": 1.1654518134422018, "learning_rate": 7.683124278647509e-06, - "loss": 0.1587, + "loss": 0.1603, "step": 2337 }, { "epoch": 0.3198358413132695, - "grad_norm": 1.242204145198864, + "grad_norm": 1.2301851938147292, "learning_rate": 7.681310801342697e-06, - "loss": 0.1715, + "loss": 0.1711, "step": 2338 }, { "epoch": 0.3199726402188782, - "grad_norm": 1.3259205331779667, + "grad_norm": 1.3174974398384092, "learning_rate": 7.67949682880196e-06, - "loss": 0.197, + "loss": 0.1968, "step": 2339 }, { "epoch": 0.320109439124487, - "grad_norm": 1.6207994919567617, + "grad_norm": 1.6235197181543706, "learning_rate": 7.677682361360337e-06, - "loss": 0.2409, + "loss": 0.2412, "step": 2340 }, { "epoch": 0.3202462380300958, - "grad_norm": 1.4971741944967922, + "grad_norm": 1.5008627680022857, "learning_rate": 7.675867399352957e-06, - "loss": 0.2531, + "loss": 0.2549, "step": 2341 }, { "epoch": 0.3203830369357045, - "grad_norm": 1.8456926027412006, + "grad_norm": 1.7398621616489887, "learning_rate": 7.674051943115043e-06, - "loss": 0.2292, + "loss": 0.2247, "step": 2342 }, { "epoch": 0.32051983584131327, - "grad_norm": 1.5563535490591718, + "grad_norm": 1.4999556860024148, "learning_rate": 7.672235992981908e-06, - "loss": 0.2178, + "loss": 0.214, "step": 2343 }, { "epoch": 0.320656634746922, - "grad_norm": 1.4350053006212833, + "grad_norm": 1.4335827824856295, "learning_rate": 7.670419549288956e-06, - "loss": 0.2131, + "loss": 0.2155, "step": 2344 }, { "epoch": 0.32079343365253077, - "grad_norm": 1.5946453401232406, + "grad_norm": 1.5999549568768816, "learning_rate": 7.668602612371683e-06, - "loss": 0.2721, + "loss": 0.2739, "step": 2345 }, { "epoch": 0.3209302325581395, - "grad_norm": 1.3159141997298076, + "grad_norm": 1.2992694883803229, "learning_rate": 7.666785182565676e-06, - "loss": 0.1931, + "loss": 0.1912, "step": 2346 }, { "epoch": 0.32106703146374826, - "grad_norm": 1.1244340652450362, + "grad_norm": 1.124354291528411, "learning_rate": 7.664967260206614e-06, - "loss": 0.1765, + "loss": 0.176, "step": 2347 }, { "epoch": 0.32120383036935707, - "grad_norm": 1.2738671442712426, + "grad_norm": 1.2311908143000891, "learning_rate": 7.663148845630263e-06, - "loss": 0.1959, + "loss": 0.1921, "step": 2348 }, { "epoch": 0.3213406292749658, - "grad_norm": 1.4142633676115708, + "grad_norm": 1.4324539138059433, "learning_rate": 7.661329939172485e-06, - "loss": 0.2382, + "loss": 0.2389, "step": 2349 }, { "epoch": 0.32147742818057456, - "grad_norm": 1.1954144465493144, + "grad_norm": 1.185326372789419, "learning_rate": 7.659510541169229e-06, "loss": 0.1656, "step": 2350 }, { "epoch": 0.3216142270861833, - "grad_norm": 1.2672150349933609, + "grad_norm": 1.2517714790451928, "learning_rate": 7.657690651956539e-06, - "loss": 0.175, + "loss": 0.1742, "step": 2351 }, { "epoch": 0.32175102599179206, - "grad_norm": 1.4905072721206538, + "grad_norm": 1.3356646784131367, "learning_rate": 7.655870271870544e-06, - "loss": 0.2222, + "loss": 0.2177, "step": 2352 }, { "epoch": 0.3218878248974008, - "grad_norm": 1.36896718757399, + "grad_norm": 1.3562967777372006, "learning_rate": 7.654049401247466e-06, - "loss": 0.1873, + "loss": 0.1849, "step": 2353 }, { "epoch": 0.32202462380300956, - "grad_norm": 1.3192831150241866, + "grad_norm": 1.3069613933910422, "learning_rate": 7.652228040423623e-06, - "loss": 0.2351, + "loss": 0.2319, "step": 2354 }, { "epoch": 0.3221614227086183, - "grad_norm": 1.3839711713268168, + "grad_norm": 1.3870572708708973, "learning_rate": 7.650406189735416e-06, - "loss": 0.2292, + "loss": 0.2312, "step": 2355 }, { "epoch": 0.3222982216142271, - "grad_norm": 1.4124617491464742, + "grad_norm": 1.4084173517789613, "learning_rate": 7.648583849519336e-06, - "loss": 0.2322, + "loss": 0.2292, "step": 2356 }, { "epoch": 0.32243502051983586, - "grad_norm": 1.1665009791879575, + "grad_norm": 1.159919542059088, "learning_rate": 7.646761020111975e-06, - "loss": 0.1855, + "loss": 0.1836, "step": 2357 }, { "epoch": 0.3225718194254446, - "grad_norm": 1.2021788051843059, + "grad_norm": 1.181118480925038, "learning_rate": 7.644937701850002e-06, - "loss": 0.2116, + "loss": 0.2104, "step": 2358 }, { "epoch": 0.32270861833105335, - "grad_norm": 1.4875773647148136, + "grad_norm": 1.4911568956018597, "learning_rate": 7.643113895070187e-06, - "loss": 0.2229, + "loss": 0.2247, "step": 2359 }, { "epoch": 0.3228454172366621, - "grad_norm": 1.2330848283176628, + "grad_norm": 1.231274302631295, "learning_rate": 7.641289600109381e-06, - "loss": 0.1904, + "loss": 0.1893, "step": 2360 }, { "epoch": 0.32298221614227085, - "grad_norm": 1.3226147523456873, + "grad_norm": 1.3259564567837594, "learning_rate": 7.639464817304532e-06, - "loss": 0.183, + "loss": 0.1843, "step": 2361 }, { "epoch": 0.3231190150478796, - "grad_norm": 1.5716119572528566, + "grad_norm": 1.5623392860551224, "learning_rate": 7.637639546992677e-06, - "loss": 0.2623, + "loss": 0.2635, "step": 2362 }, { "epoch": 0.32325581395348835, - "grad_norm": 1.6163325973792098, + "grad_norm": 1.5848485780522872, "learning_rate": 7.635813789510943e-06, - "loss": 0.2178, + "loss": 0.2187, "step": 2363 }, { "epoch": 0.32339261285909715, - "grad_norm": 1.416622072075599, + "grad_norm": 1.4408749561982825, "learning_rate": 7.633987545196539e-06, - "loss": 0.2484, + "loss": 0.2473, "step": 2364 }, { "epoch": 0.3235294117647059, - "grad_norm": 1.243020191569483, + "grad_norm": 1.2113806096551583, "learning_rate": 7.63216081438678e-06, - "loss": 0.1897, + "loss": 0.1877, "step": 2365 }, { "epoch": 0.32366621067031465, - "grad_norm": 1.2405547554934377, + "grad_norm": 1.2355190593696435, "learning_rate": 7.630333597419055e-06, - "loss": 0.2066, + "loss": 0.205, "step": 2366 }, { "epoch": 0.3238030095759234, - "grad_norm": 1.300233374004401, + "grad_norm": 1.2660656104708279, "learning_rate": 7.6285058946308535e-06, - "loss": 0.1925, + "loss": 0.1908, "step": 2367 }, { "epoch": 0.32393980848153214, - "grad_norm": 1.13242576181959, + "grad_norm": 1.1337126199786176, "learning_rate": 7.626677706359749e-06, - "loss": 0.1746, + "loss": 0.1758, "step": 2368 }, { "epoch": 0.3240766073871409, - "grad_norm": 1.469446053484066, + "grad_norm": 1.4246303597476448, "learning_rate": 7.624849032943405e-06, - "loss": 0.1954, + "loss": 0.1945, "step": 2369 }, { "epoch": 0.32421340629274964, - "grad_norm": 1.3116313184966921, + "grad_norm": 1.2675175387484794, "learning_rate": 7.6230198747195795e-06, - "loss": 0.1657, + "loss": 0.166, "step": 2370 }, { "epoch": 0.3243502051983584, - "grad_norm": 1.4557389553352542, + "grad_norm": 1.4311285028418348, "learning_rate": 7.621190232026113e-06, - "loss": 0.2454, + "loss": 0.2448, "step": 2371 }, { "epoch": 0.3244870041039672, - "grad_norm": 1.356461290695527, + "grad_norm": 1.3281992871999468, "learning_rate": 7.619360105200941e-06, - "loss": 0.1957, + "loss": 0.193, "step": 2372 }, { "epoch": 0.32462380300957594, - "grad_norm": 1.2917355108449156, + "grad_norm": 1.2710774729541583, "learning_rate": 7.617529494582085e-06, - "loss": 0.2088, + "loss": 0.2096, "step": 2373 }, { "epoch": 0.3247606019151847, - "grad_norm": 1.3017965209951146, + "grad_norm": 1.2828072167247953, "learning_rate": 7.615698400507661e-06, - "loss": 0.1679, + "loss": 0.1677, "step": 2374 }, { "epoch": 0.32489740082079344, - "grad_norm": 1.6021261721088729, + "grad_norm": 1.6134490297926116, "learning_rate": 7.613866823315865e-06, - "loss": 0.2794, + "loss": 0.2803, "step": 2375 }, { "epoch": 0.3250341997264022, - "grad_norm": 1.560468658640075, + "grad_norm": 1.597203905286188, "learning_rate": 7.612034763344991e-06, - "loss": 0.2514, + "loss": 0.2492, "step": 2376 }, { "epoch": 0.32517099863201093, - "grad_norm": 1.3528226669184535, + "grad_norm": 1.324880410422617, "learning_rate": 7.6102022209334206e-06, - "loss": 0.2224, + "loss": 0.2213, "step": 2377 }, { "epoch": 0.3253077975376197, - "grad_norm": 1.3844036248359317, + "grad_norm": 1.3621852314957603, "learning_rate": 7.6083691964196205e-06, - "loss": 0.2025, + "loss": 0.2048, "step": 2378 }, { "epoch": 0.32544459644322843, - "grad_norm": 1.304140767377986, + "grad_norm": 1.2612248174977987, "learning_rate": 7.606535690142149e-06, - "loss": 0.1925, + "loss": 0.1953, "step": 2379 }, { "epoch": 0.32558139534883723, - "grad_norm": 1.4680840850614079, + "grad_norm": 1.4833434226657458, "learning_rate": 7.604701702439652e-06, - "loss": 0.2013, + "loss": 0.2024, "step": 2380 }, { "epoch": 0.325718194254446, - "grad_norm": 1.6076964281535246, + "grad_norm": 1.579178687455429, "learning_rate": 7.602867233650868e-06, - "loss": 0.2331, + "loss": 0.2348, "step": 2381 }, { "epoch": 0.32585499316005473, - "grad_norm": 1.1136516622950883, + "grad_norm": 1.111217608215006, "learning_rate": 7.601032284114621e-06, - "loss": 0.1742, + "loss": 0.1751, "step": 2382 }, { "epoch": 0.3259917920656635, - "grad_norm": 1.36934390907228, + "grad_norm": 1.3701125790286166, "learning_rate": 7.599196854169826e-06, - "loss": 0.2255, + "loss": 0.2267, "step": 2383 }, { "epoch": 0.3261285909712722, - "grad_norm": 1.204546470308812, + "grad_norm": 1.2009311421635276, "learning_rate": 7.597360944155483e-06, - "loss": 0.2015, + "loss": 0.1991, "step": 2384 }, { "epoch": 0.326265389876881, - "grad_norm": 1.2451980122280222, + "grad_norm": 1.2467942927136333, "learning_rate": 7.595524554410685e-06, - "loss": 0.2094, + "loss": 0.2166, "step": 2385 }, { "epoch": 0.3264021887824897, - "grad_norm": 1.191881510071836, + "grad_norm": 1.2012362385379094, "learning_rate": 7.593687685274609e-06, - "loss": 0.1675, + "loss": 0.1697, "step": 2386 }, { "epoch": 0.32653898768809847, - "grad_norm": 1.2582564133030816, + "grad_norm": 1.2302120666269059, "learning_rate": 7.591850337086527e-06, - "loss": 0.2237, + "loss": 0.2249, "step": 2387 }, { "epoch": 0.3266757865937073, - "grad_norm": 1.2368444644998264, + "grad_norm": 1.240934356816266, "learning_rate": 7.5900125101857925e-06, - "loss": 0.2117, + "loss": 0.2099, "step": 2388 }, { "epoch": 0.326812585499316, - "grad_norm": 1.3037207188161393, + "grad_norm": 1.281216068745884, "learning_rate": 7.588174204911853e-06, - "loss": 0.1999, + "loss": 0.2001, "step": 2389 }, { "epoch": 0.32694938440492477, - "grad_norm": 1.2836435065972909, + "grad_norm": 1.257421763173023, "learning_rate": 7.586335421604239e-06, - "loss": 0.1992, + "loss": 0.201, "step": 2390 }, { "epoch": 0.3270861833105335, - "grad_norm": 1.4514578672215181, + "grad_norm": 1.364337950214525, "learning_rate": 7.5844961606025754e-06, - "loss": 0.2414, + "loss": 0.2378, "step": 2391 }, { "epoch": 0.32722298221614227, - "grad_norm": 1.1253013492797679, + "grad_norm": 1.0817617265649484, "learning_rate": 7.582656422246573e-06, - "loss": 0.1642, + "loss": 0.1636, "step": 2392 }, { "epoch": 0.327359781121751, - "grad_norm": 1.4587211071000619, + "grad_norm": 1.4944246471942448, "learning_rate": 7.580816206876025e-06, - "loss": 0.2308, + "loss": 0.2307, "step": 2393 }, { "epoch": 0.32749658002735976, - "grad_norm": 1.5348424454647474, + "grad_norm": 1.5124023311140977, "learning_rate": 7.578975514830822e-06, - "loss": 0.2139, + "loss": 0.2156, "step": 2394 }, { "epoch": 0.3276333789329685, - "grad_norm": 1.178867949018058, + "grad_norm": 1.1314436047849457, "learning_rate": 7.577134346450938e-06, - "loss": 0.1828, + "loss": 0.1802, "step": 2395 }, { "epoch": 0.3277701778385773, - "grad_norm": 1.3618403435855726, + "grad_norm": 1.4555928913725422, "learning_rate": 7.575292702076432e-06, - "loss": 0.194, + "loss": 0.1953, "step": 2396 }, { "epoch": 0.32790697674418606, - "grad_norm": 1.3776774437202373, + "grad_norm": 1.3543277725006764, "learning_rate": 7.573450582047457e-06, - "loss": 0.2025, + "loss": 0.2006, "step": 2397 }, { "epoch": 0.3280437756497948, - "grad_norm": 1.0821052637612214, + "grad_norm": 1.064686747434404, "learning_rate": 7.5716079867042524e-06, - "loss": 0.174, + "loss": 0.1742, "step": 2398 }, { "epoch": 0.32818057455540356, - "grad_norm": 1.390672626270092, + "grad_norm": 1.368557706014093, "learning_rate": 7.569764916387141e-06, - "loss": 0.2334, + "loss": 0.231, "step": 2399 }, { "epoch": 0.3283173734610123, - "grad_norm": 1.365466915781266, + "grad_norm": 1.3485345850094779, "learning_rate": 7.5679213714365385e-06, - "loss": 0.2143, + "loss": 0.2164, "step": 2400 }, { "epoch": 0.3283173734610123, - "eval_loss": 0.20002222061157227, - "eval_runtime": 5.9359, - "eval_samples_per_second": 5.054, - "eval_steps_per_second": 1.348, + "eval_loss": 0.20060497522354126, + "eval_runtime": 5.948, + "eval_samples_per_second": 5.044, + "eval_steps_per_second": 1.345, "step": 2400 }, { "epoch": 0.32845417236662106, - "grad_norm": 1.0452936373947843, + "grad_norm": 1.0401600572489051, "learning_rate": 7.566077352192944e-06, - "loss": 0.1519, + "loss": 0.1528, "step": 2401 }, { "epoch": 0.3285909712722298, - "grad_norm": 1.2145451213257226, + "grad_norm": 1.212347697379458, "learning_rate": 7.564232858996949e-06, - "loss": 0.2173, + "loss": 0.2158, "step": 2402 }, { "epoch": 0.32872777017783855, - "grad_norm": 1.2111277633963544, + "grad_norm": 1.1914499592197012, "learning_rate": 7.562387892189228e-06, - "loss": 0.2171, + "loss": 0.2174, "step": 2403 }, { "epoch": 0.32886456908344736, - "grad_norm": 1.2804953288527026, + "grad_norm": 1.2397597196189396, "learning_rate": 7.560542452110546e-06, - "loss": 0.228, + "loss": 0.2263, "step": 2404 }, { "epoch": 0.3290013679890561, - "grad_norm": 1.5186405339176154, + "grad_norm": 1.5013797643655937, "learning_rate": 7.558696539101753e-06, - "loss": 0.2123, + "loss": 0.2109, "step": 2405 }, { "epoch": 0.32913816689466485, - "grad_norm": 1.2914538467486905, + "grad_norm": 1.2723234885738504, "learning_rate": 7.556850153503788e-06, - "loss": 0.1878, + "loss": 0.1861, "step": 2406 }, { "epoch": 0.3292749658002736, - "grad_norm": 1.3998958121693337, + "grad_norm": 1.4176090499190284, "learning_rate": 7.555003295657678e-06, - "loss": 0.2584, + "loss": 0.2611, "step": 2407 }, { "epoch": 0.32941176470588235, - "grad_norm": 1.3074725177586333, + "grad_norm": 1.2853785947078358, "learning_rate": 7.553155965904535e-06, - "loss": 0.2029, + "loss": 0.2033, "step": 2408 }, { "epoch": 0.3295485636114911, - "grad_norm": 1.10320171104974, + "grad_norm": 1.1109166226911493, "learning_rate": 7.551308164585561e-06, - "loss": 0.1491, + "loss": 0.1493, "step": 2409 }, { "epoch": 0.32968536251709984, - "grad_norm": 1.263382423497924, + "grad_norm": 1.2336634105377637, "learning_rate": 7.549459892042042e-06, - "loss": 0.2256, + "loss": 0.2261, "step": 2410 }, { "epoch": 0.3298221614227086, - "grad_norm": 1.4328981214292413, + "grad_norm": 1.2776219922448773, "learning_rate": 7.5476111486153505e-06, - "loss": 0.2057, + "loss": 0.1988, "step": 2411 }, { "epoch": 0.3299589603283174, - "grad_norm": 1.4214675461179336, + "grad_norm": 1.4155265349352948, "learning_rate": 7.54576193464695e-06, - "loss": 0.2613, + "loss": 0.2621, "step": 2412 }, { "epoch": 0.33009575923392614, - "grad_norm": 1.2065475087109623, + "grad_norm": 1.1961698242479348, "learning_rate": 7.5439122504783916e-06, - "loss": 0.1884, + "loss": 0.1885, "step": 2413 }, { "epoch": 0.3302325581395349, - "grad_norm": 1.1637769081664195, + "grad_norm": 1.1662990005374754, "learning_rate": 7.542062096451306e-06, - "loss": 0.2105, + "loss": 0.2117, "step": 2414 }, { "epoch": 0.33036935704514364, - "grad_norm": 1.4007849160591055, + "grad_norm": 1.3866266576580593, "learning_rate": 7.5402114729074174e-06, - "loss": 0.1917, + "loss": 0.1897, "step": 2415 }, { "epoch": 0.3305061559507524, - "grad_norm": 1.2469489221754175, + "grad_norm": 1.235281035294348, "learning_rate": 7.538360380188532e-06, - "loss": 0.2179, + "loss": 0.2174, "step": 2416 }, { "epoch": 0.33064295485636114, - "grad_norm": 1.2163681753960975, + "grad_norm": 1.2156524988167816, "learning_rate": 7.53650881863655e-06, - "loss": 0.1794, + "loss": 0.1782, "step": 2417 }, { "epoch": 0.3307797537619699, - "grad_norm": 1.108307387352073, + "grad_norm": 1.1004742910837928, "learning_rate": 7.534656788593446e-06, - "loss": 0.1621, + "loss": 0.1614, "step": 2418 }, { "epoch": 0.33091655266757863, - "grad_norm": 1.4540789911327698, + "grad_norm": 1.4180340466593069, "learning_rate": 7.5328042904012965e-06, - "loss": 0.2251, + "loss": 0.2216, "step": 2419 }, { "epoch": 0.33105335157318744, - "grad_norm": 1.545555827039381, + "grad_norm": 1.5271782625404442, "learning_rate": 7.530951324402251e-06, - "loss": 0.2269, + "loss": 0.2273, "step": 2420 }, { "epoch": 0.3311901504787962, - "grad_norm": 1.2316319150372361, + "grad_norm": 1.249899446532276, "learning_rate": 7.5290978909385525e-06, - "loss": 0.1966, + "loss": 0.1981, "step": 2421 }, { "epoch": 0.33132694938440493, - "grad_norm": 0.9287901014551593, + "grad_norm": 0.9347151513871643, "learning_rate": 7.527243990352529e-06, - "loss": 0.1587, + "loss": 0.16, "step": 2422 }, { "epoch": 0.3314637482900137, - "grad_norm": 1.281218470310965, + "grad_norm": 1.2753282091754872, "learning_rate": 7.525389622986595e-06, - "loss": 0.1766, + "loss": 0.1746, "step": 2423 }, { "epoch": 0.33160054719562243, - "grad_norm": 1.3327126591502627, + "grad_norm": 1.3055086303085277, "learning_rate": 7.523534789183249e-06, - "loss": 0.2263, + "loss": 0.2247, "step": 2424 }, { "epoch": 0.3317373461012312, - "grad_norm": 1.1602223120201198, + "grad_norm": 1.1362612316255676, "learning_rate": 7.521679489285079e-06, - "loss": 0.1807, + "loss": 0.1806, "step": 2425 }, { "epoch": 0.3318741450068399, - "grad_norm": 1.4171463557993402, + "grad_norm": 1.40934731408682, "learning_rate": 7.519823723634754e-06, - "loss": 0.2049, + "loss": 0.2061, "step": 2426 }, { "epoch": 0.3320109439124487, - "grad_norm": 1.1464782169756182, + "grad_norm": 1.1352932520793513, "learning_rate": 7.5179674925750355e-06, - "loss": 0.1917, + "loss": 0.1916, "step": 2427 }, { "epoch": 0.3321477428180575, - "grad_norm": 1.0687644682304684, + "grad_norm": 1.0590460938139772, "learning_rate": 7.516110796448768e-06, - "loss": 0.1763, + "loss": 0.1775, "step": 2428 }, { "epoch": 0.3322845417236662, - "grad_norm": 1.2055381747217722, + "grad_norm": 1.2128366327747775, "learning_rate": 7.51425363559888e-06, - "loss": 0.2032, + "loss": 0.2031, "step": 2429 }, { "epoch": 0.332421340629275, - "grad_norm": 1.3338379178648434, + "grad_norm": 1.3062988127005954, "learning_rate": 7.51239601036839e-06, - "loss": 0.2207, + "loss": 0.2195, "step": 2430 }, { "epoch": 0.3325581395348837, - "grad_norm": 1.199331611907281, + "grad_norm": 1.2126675913976963, "learning_rate": 7.510537921100398e-06, - "loss": 0.1927, + "loss": 0.196, "step": 2431 }, { "epoch": 0.33269493844049247, - "grad_norm": 1.1952026386634997, + "grad_norm": 1.1699726299955515, "learning_rate": 7.508679368138091e-06, - "loss": 0.2003, + "loss": 0.1997, "step": 2432 }, { "epoch": 0.3328317373461012, - "grad_norm": 1.2110669885681737, + "grad_norm": 1.1883563512119848, "learning_rate": 7.506820351824743e-06, - "loss": 0.1989, + "loss": 0.1999, "step": 2433 }, { "epoch": 0.33296853625170997, - "grad_norm": 1.369561196973815, + "grad_norm": 1.35082059508123, "learning_rate": 7.504960872503715e-06, - "loss": 0.2089, + "loss": 0.2117, "step": 2434 }, { "epoch": 0.3331053351573187, - "grad_norm": 1.5057983054862047, + "grad_norm": 1.469725181125946, "learning_rate": 7.503100930518448e-06, - "loss": 0.2408, + "loss": 0.2403, "step": 2435 }, { "epoch": 0.3332421340629275, - "grad_norm": 1.2387407374696504, + "grad_norm": 1.2643273089525284, "learning_rate": 7.501240526212472e-06, - "loss": 0.2033, + "loss": 0.2068, "step": 2436 }, { "epoch": 0.33337893296853627, - "grad_norm": 1.1630526031478838, + "grad_norm": 1.1339808813813517, "learning_rate": 7.499379659929404e-06, - "loss": 0.1823, + "loss": 0.18, "step": 2437 }, { "epoch": 0.333515731874145, - "grad_norm": 1.1960375662083138, + "grad_norm": 1.1616951363788721, "learning_rate": 7.497518332012945e-06, - "loss": 0.2041, + "loss": 0.2071, "step": 2438 }, { "epoch": 0.33365253077975376, - "grad_norm": 1.2973584016195119, + "grad_norm": 1.299175654897163, "learning_rate": 7.495656542806879e-06, - "loss": 0.2151, + "loss": 0.2189, "step": 2439 }, { "epoch": 0.3337893296853625, - "grad_norm": 1.2908251864445648, + "grad_norm": 1.247626953980117, "learning_rate": 7.493794292655077e-06, - "loss": 0.1885, + "loss": 0.1886, "step": 2440 }, { "epoch": 0.33392612859097126, - "grad_norm": 1.1168002165814628, + "grad_norm": 1.0895053502584062, "learning_rate": 7.491931581901495e-06, - "loss": 0.2075, + "loss": 0.2088, "step": 2441 }, { "epoch": 0.33406292749658, - "grad_norm": 1.3838735444891395, + "grad_norm": 1.3544121909081073, "learning_rate": 7.490068410890174e-06, - "loss": 0.2087, + "loss": 0.2053, "step": 2442 }, { "epoch": 0.33419972640218876, - "grad_norm": 1.3599987589127762, + "grad_norm": 1.360711955622392, "learning_rate": 7.488204779965243e-06, - "loss": 0.2319, + "loss": 0.2337, "step": 2443 }, { "epoch": 0.33433652530779756, - "grad_norm": 1.4968645286509596, + "grad_norm": 1.5074042772675633, "learning_rate": 7.48634068947091e-06, - "loss": 0.1973, + "loss": 0.1982, "step": 2444 }, { "epoch": 0.3344733242134063, - "grad_norm": 1.383521123326907, + "grad_norm": 1.3666716395285587, "learning_rate": 7.484476139751472e-06, - "loss": 0.2013, + "loss": 0.2018, "step": 2445 }, { "epoch": 0.33461012311901506, - "grad_norm": 1.3768267508265635, + "grad_norm": 1.361267709216795, "learning_rate": 7.4826111311513106e-06, - "loss": 0.1817, + "loss": 0.1851, "step": 2446 }, { "epoch": 0.3347469220246238, - "grad_norm": 1.2496674591845205, + "grad_norm": 1.2063212624648632, "learning_rate": 7.48074566401489e-06, - "loss": 0.2104, + "loss": 0.204, "step": 2447 }, { "epoch": 0.33488372093023255, - "grad_norm": 1.5360636003547334, + "grad_norm": 1.5175310349922908, "learning_rate": 7.4788797386867596e-06, - "loss": 0.2139, + "loss": 0.2133, "step": 2448 }, { "epoch": 0.3350205198358413, - "grad_norm": 1.68646230043897, + "grad_norm": 1.6152470329315705, "learning_rate": 7.477013355511558e-06, - "loss": 0.2697, + "loss": 0.2753, "step": 2449 }, { "epoch": 0.33515731874145005, - "grad_norm": 1.3702871241643988, + "grad_norm": 1.3590675536517418, "learning_rate": 7.475146514834002e-06, - "loss": 0.1967, + "loss": 0.1951, "step": 2450 }, { "epoch": 0.3352941176470588, - "grad_norm": 1.5045286991991038, + "grad_norm": 1.455530366174726, "learning_rate": 7.473279216998896e-06, - "loss": 0.197, + "loss": 0.1941, "step": 2451 }, { "epoch": 0.3354309165526676, - "grad_norm": 1.3181273719515267, + "grad_norm": 1.2844530660102629, "learning_rate": 7.471411462351126e-06, - "loss": 0.1941, + "loss": 0.1969, "step": 2452 }, { "epoch": 0.33556771545827635, - "grad_norm": 1.1184629704513867, + "grad_norm": 1.0882134102141152, "learning_rate": 7.469543251235669e-06, - "loss": 0.1668, + "loss": 0.1637, "step": 2453 }, { "epoch": 0.3357045143638851, - "grad_norm": 1.4277262409463831, + "grad_norm": 1.3927439574886604, "learning_rate": 7.467674583997581e-06, - "loss": 0.2163, + "loss": 0.2141, "step": 2454 }, { "epoch": 0.33584131326949385, - "grad_norm": 1.228976697269819, + "grad_norm": 1.2259745919680265, "learning_rate": 7.465805460982002e-06, - "loss": 0.1924, + "loss": 0.1902, "step": 2455 }, { "epoch": 0.3359781121751026, - "grad_norm": 1.275186060253838, + "grad_norm": 1.2962400611111675, "learning_rate": 7.463935882534157e-06, - "loss": 0.2127, + "loss": 0.2143, "step": 2456 }, { "epoch": 0.33611491108071134, - "grad_norm": 1.2812119918464264, + "grad_norm": 1.262576668348408, "learning_rate": 7.462065848999357e-06, - "loss": 0.1755, + "loss": 0.1762, "step": 2457 }, { "epoch": 0.3362517099863201, - "grad_norm": 1.137554415411932, + "grad_norm": 1.1385119955694938, "learning_rate": 7.460195360722996e-06, - "loss": 0.1958, + "loss": 0.1942, "step": 2458 }, { "epoch": 0.33638850889192884, - "grad_norm": 1.2755570469252258, + "grad_norm": 1.2631959713508036, "learning_rate": 7.458324418050551e-06, - "loss": 0.1854, + "loss": 0.1865, "step": 2459 }, { "epoch": 0.33652530779753764, - "grad_norm": 1.5410150307009784, + "grad_norm": 1.5210087891862683, "learning_rate": 7.456453021327582e-06, - "loss": 0.2447, + "loss": 0.2455, "step": 2460 }, { "epoch": 0.3366621067031464, - "grad_norm": 1.1988204867320265, + "grad_norm": 1.1875688130540234, "learning_rate": 7.454581170899738e-06, - "loss": 0.199, + "loss": 0.1979, "step": 2461 }, { "epoch": 0.33679890560875514, - "grad_norm": 1.271029517613098, + "grad_norm": 1.2468148127267904, "learning_rate": 7.452708867112744e-06, - "loss": 0.2124, + "loss": 0.2118, "step": 2462 }, { "epoch": 0.3369357045143639, - "grad_norm": 1.4504682695007354, + "grad_norm": 1.367723088530158, "learning_rate": 7.450836110312416e-06, - "loss": 0.2344, + "loss": 0.2307, "step": 2463 }, { "epoch": 0.33707250341997264, - "grad_norm": 1.4233639528236217, + "grad_norm": 1.4246388538127732, "learning_rate": 7.448962900844652e-06, - "loss": 0.2535, + "loss": 0.2552, "step": 2464 }, { "epoch": 0.3372093023255814, - "grad_norm": 1.4064854710970651, + "grad_norm": 1.3937256311761106, "learning_rate": 7.447089239055428e-06, - "loss": 0.1924, + "loss": 0.1948, "step": 2465 }, { "epoch": 0.33734610123119013, - "grad_norm": 1.2225040313041609, + "grad_norm": 1.2062610005638341, "learning_rate": 7.445215125290811e-06, - "loss": 0.1882, + "loss": 0.1868, "step": 2466 }, { "epoch": 0.3374829001367989, - "grad_norm": 1.2929107327637863, + "grad_norm": 1.2873579777548982, "learning_rate": 7.4433405598969476e-06, - "loss": 0.2066, + "loss": 0.2077, "step": 2467 }, { "epoch": 0.3376196990424077, - "grad_norm": 1.0919752669009872, + "grad_norm": 1.1008742908936202, "learning_rate": 7.441465543220067e-06, - "loss": 0.165, + "loss": 0.1693, "step": 2468 }, { "epoch": 0.33775649794801643, - "grad_norm": 1.3051121875268177, + "grad_norm": 1.269299585789646, "learning_rate": 7.4395900756064845e-06, - "loss": 0.1895, + "loss": 0.1932, "step": 2469 }, { "epoch": 0.3378932968536252, - "grad_norm": 0.9834183563875619, + "grad_norm": 0.9728175340894378, "learning_rate": 7.437714157402598e-06, - "loss": 0.1626, + "loss": 0.1601, "step": 2470 }, { "epoch": 0.33803009575923393, - "grad_norm": 1.4020813997970918, + "grad_norm": 1.4296295094096667, "learning_rate": 7.435837788954887e-06, - "loss": 0.2243, + "loss": 0.2256, "step": 2471 }, { "epoch": 0.3381668946648427, - "grad_norm": 1.1312368815182332, + "grad_norm": 1.1220726326992632, "learning_rate": 7.433960970609917e-06, - "loss": 0.1793, + "loss": 0.1775, "step": 2472 }, { "epoch": 0.3383036935704514, - "grad_norm": 1.2910750925825385, + "grad_norm": 1.2989592726211479, "learning_rate": 7.4320837027143324e-06, - "loss": 0.218, + "loss": 0.2184, "step": 2473 }, { "epoch": 0.3384404924760602, - "grad_norm": 1.7475463779649063, + "grad_norm": 1.717375114572402, "learning_rate": 7.430205985614864e-06, - "loss": 0.1916, + "loss": 0.1924, "step": 2474 }, { "epoch": 0.3385772913816689, - "grad_norm": 1.7196345031292362, + "grad_norm": 1.7157065334077712, "learning_rate": 7.428327819658325e-06, - "loss": 0.2152, + "loss": 0.2136, "step": 2475 }, { "epoch": 0.3387140902872777, - "grad_norm": 1.2916463876377042, + "grad_norm": 1.265372617566274, "learning_rate": 7.42644920519161e-06, - "loss": 0.1852, + "loss": 0.1856, "step": 2476 }, { "epoch": 0.3388508891928865, - "grad_norm": 1.4692679928787142, + "grad_norm": 1.4761201702122535, "learning_rate": 7.4245701425617e-06, - "loss": 0.2269, + "loss": 0.2268, "step": 2477 }, { "epoch": 0.3389876880984952, - "grad_norm": 1.5684718300365637, + "grad_norm": 1.5539157124855647, "learning_rate": 7.422690632115655e-06, - "loss": 0.2026, + "loss": 0.2043, "step": 2478 }, { "epoch": 0.33912448700410397, - "grad_norm": 1.4473915199720266, + "grad_norm": 1.4259143922856712, "learning_rate": 7.420810674200617e-06, - "loss": 0.2169, + "loss": 0.2174, "step": 2479 }, { "epoch": 0.3392612859097127, - "grad_norm": 1.3563467996900562, + "grad_norm": 1.3439845476071413, "learning_rate": 7.418930269163815e-06, - "loss": 0.2325, + "loss": 0.2303, "step": 2480 }, { "epoch": 0.33939808481532147, - "grad_norm": 1.0217102048535447, + "grad_norm": 1.0111382439963086, "learning_rate": 7.41704941735256e-06, - "loss": 0.1682, + "loss": 0.1679, "step": 2481 }, { "epoch": 0.3395348837209302, - "grad_norm": 1.1701107737737337, + "grad_norm": 1.147407453315033, "learning_rate": 7.41516811911424e-06, "loss": 0.1995, "step": 2482 }, { "epoch": 0.33967168262653896, - "grad_norm": 1.2860282633922544, + "grad_norm": 1.2592149004795656, "learning_rate": 7.413286374796331e-06, - "loss": 0.1785, + "loss": 0.1782, "step": 2483 }, { "epoch": 0.33980848153214777, - "grad_norm": 1.1381215539789622, + "grad_norm": 1.1060203804690296, "learning_rate": 7.411404184746389e-06, - "loss": 0.1785, + "loss": 0.1742, "step": 2484 }, { "epoch": 0.3399452804377565, - "grad_norm": 1.1989211393262518, + "grad_norm": 1.188910494793565, "learning_rate": 7.409521549312053e-06, - "loss": 0.214, + "loss": 0.2148, "step": 2485 }, { "epoch": 0.34008207934336526, - "grad_norm": 1.590684189653709, + "grad_norm": 1.5670687253767859, "learning_rate": 7.407638468841047e-06, - "loss": 0.2042, + "loss": 0.2035, "step": 2486 }, { "epoch": 0.340218878248974, - "grad_norm": 1.522465473676377, + "grad_norm": 1.4488778692312623, "learning_rate": 7.405754943681171e-06, - "loss": 0.2416, + "loss": 0.2389, "step": 2487 }, { "epoch": 0.34035567715458276, - "grad_norm": 1.366591512829187, + "grad_norm": 1.3259159774827314, "learning_rate": 7.4038709741803125e-06, - "loss": 0.2074, + "loss": 0.2078, "step": 2488 }, { "epoch": 0.3404924760601915, - "grad_norm": 1.258641730282592, + "grad_norm": 1.2544120183126624, "learning_rate": 7.401986560686438e-06, - "loss": 0.2217, + "loss": 0.2214, "step": 2489 }, { "epoch": 0.34062927496580025, - "grad_norm": 1.482519675747175, + "grad_norm": 1.4721637983591662, "learning_rate": 7.400101703547597e-06, - "loss": 0.2349, + "loss": 0.2356, "step": 2490 }, { "epoch": 0.340766073871409, - "grad_norm": 1.4107889438706656, + "grad_norm": 1.388935786372199, "learning_rate": 7.398216403111922e-06, - "loss": 0.2038, + "loss": 0.2067, "step": 2491 }, { "epoch": 0.3409028727770178, - "grad_norm": 1.4015410050343866, + "grad_norm": 1.4268388806378836, "learning_rate": 7.396330659727626e-06, - "loss": 0.2141, + "loss": 0.2172, "step": 2492 }, { "epoch": 0.34103967168262656, - "grad_norm": 1.2877849707730729, + "grad_norm": 1.2827679747301464, "learning_rate": 7.3944444737430035e-06, - "loss": 0.216, + "loss": 0.2174, "step": 2493 }, { "epoch": 0.3411764705882353, - "grad_norm": 1.305895541061173, + "grad_norm": 1.3334274082306843, "learning_rate": 7.392557845506433e-06, - "loss": 0.2024, + "loss": 0.2049, "step": 2494 }, { "epoch": 0.34131326949384405, - "grad_norm": 1.487660421218672, + "grad_norm": 1.467925014391985, "learning_rate": 7.390670775366373e-06, - "loss": 0.2462, + "loss": 0.246, "step": 2495 }, { "epoch": 0.3414500683994528, - "grad_norm": 1.4480303503640297, + "grad_norm": 1.4276980665860366, "learning_rate": 7.3887832636713645e-06, - "loss": 0.2809, + "loss": 0.2752, "step": 2496 }, { "epoch": 0.34158686730506155, - "grad_norm": 1.3501180223946254, + "grad_norm": 1.3097750122163576, "learning_rate": 7.3868953107700255e-06, - "loss": 0.2212, + "loss": 0.2152, "step": 2497 }, { "epoch": 0.3417236662106703, - "grad_norm": 1.4273439080301322, + "grad_norm": 1.4192297824910118, "learning_rate": 7.385006917011064e-06, - "loss": 0.2238, + "loss": 0.2262, "step": 2498 }, { "epoch": 0.34186046511627904, - "grad_norm": 1.5265471274581248, + "grad_norm": 1.5022104295844556, "learning_rate": 7.383118082743263e-06, - "loss": 0.228, + "loss": 0.2289, "step": 2499 }, { "epoch": 0.34199726402188785, - "grad_norm": 1.5096996107353025, + "grad_norm": 1.531154799856418, "learning_rate": 7.381228808315486e-06, - "loss": 0.2095, + "loss": 0.211, "step": 2500 }, { "epoch": 0.34199726402188785, - "eval_loss": 0.1963818371295929, - "eval_runtime": 5.9206, - "eval_samples_per_second": 5.067, - "eval_steps_per_second": 1.351, + "eval_loss": 0.1958075314760208, + "eval_runtime": 5.9113, + "eval_samples_per_second": 5.075, + "eval_steps_per_second": 1.353, "step": 2500 }, { "epoch": 0.3421340629274966, - "grad_norm": 1.5378780025816297, + "grad_norm": 1.536560084453379, "learning_rate": 7.379339094076685e-06, - "loss": 0.2295, + "loss": 0.228, "step": 2501 }, { "epoch": 0.34227086183310534, - "grad_norm": 1.2676680264106892, + "grad_norm": 1.2640741835681832, "learning_rate": 7.377448940375888e-06, - "loss": 0.2142, + "loss": 0.214, "step": 2502 }, { "epoch": 0.3424076607387141, - "grad_norm": 1.2064592732433417, + "grad_norm": 1.196181743505938, "learning_rate": 7.375558347562202e-06, - "loss": 0.1958, + "loss": 0.1942, "step": 2503 }, { "epoch": 0.34254445964432284, - "grad_norm": 1.476044155719269, + "grad_norm": 1.432874985026217, "learning_rate": 7.37366731598482e-06, - "loss": 0.2404, + "loss": 0.2401, "step": 2504 }, { "epoch": 0.3426812585499316, - "grad_norm": 1.1185247311224469, + "grad_norm": 1.1018463389815272, "learning_rate": 7.371775845993015e-06, - "loss": 0.1932, + "loss": 0.1916, "step": 2505 }, { "epoch": 0.34281805745554034, - "grad_norm": 1.2712126437575504, + "grad_norm": 1.2580831117411735, "learning_rate": 7.369883937936136e-06, - "loss": 0.1821, + "loss": 0.1844, "step": 2506 }, { "epoch": 0.3429548563611491, - "grad_norm": 1.5781657272012475, + "grad_norm": 1.5793789297215393, "learning_rate": 7.367991592163619e-06, - "loss": 0.2011, + "loss": 0.2041, "step": 2507 }, { "epoch": 0.3430916552667579, - "grad_norm": 1.0503998936068721, + "grad_norm": 1.0240818167313654, "learning_rate": 7.36609880902498e-06, - "loss": 0.1524, + "loss": 0.1522, "step": 2508 }, { "epoch": 0.34322845417236664, - "grad_norm": 1.533658777141672, + "grad_norm": 1.5130666394803371, "learning_rate": 7.364205588869815e-06, - "loss": 0.2079, + "loss": 0.2094, "step": 2509 }, { "epoch": 0.3433652530779754, - "grad_norm": 1.316068695361997, + "grad_norm": 1.312481048381423, "learning_rate": 7.3623119320477975e-06, - "loss": 0.1712, + "loss": 0.1706, "step": 2510 }, { "epoch": 0.34350205198358413, - "grad_norm": 1.4766692645673527, + "grad_norm": 1.4784791757240228, "learning_rate": 7.360417838908686e-06, - "loss": 0.253, + "loss": 0.2553, "step": 2511 }, { "epoch": 0.3436388508891929, - "grad_norm": 1.306285952395999, + "grad_norm": 1.295074355154343, "learning_rate": 7.358523309802318e-06, - "loss": 0.1887, + "loss": 0.1872, "step": 2512 }, { "epoch": 0.34377564979480163, - "grad_norm": 1.3146703540048386, + "grad_norm": 1.3092137538205089, "learning_rate": 7.356628345078611e-06, "loss": 0.2158, "step": 2513 }, { "epoch": 0.3439124487004104, - "grad_norm": 1.2266205166769266, + "grad_norm": 1.2487970664242958, "learning_rate": 7.3547329450875625e-06, - "loss": 0.188, + "loss": 0.1888, "step": 2514 }, { "epoch": 0.3440492476060191, - "grad_norm": 1.4595189055320945, + "grad_norm": 1.4064434046964969, "learning_rate": 7.352837110179254e-06, - "loss": 0.2541, + "loss": 0.2503, "step": 2515 }, { "epoch": 0.34418604651162793, - "grad_norm": 1.4772141197638466, + "grad_norm": 1.499579858594314, "learning_rate": 7.350940840703842e-06, - "loss": 0.2081, + "loss": 0.2125, "step": 2516 }, { "epoch": 0.3443228454172367, - "grad_norm": 1.5025250426525472, + "grad_norm": 1.455913287970751, "learning_rate": 7.349044137011567e-06, - "loss": 0.2713, + "loss": 0.2693, "step": 2517 }, { "epoch": 0.3444596443228454, - "grad_norm": 1.647767948376109, + "grad_norm": 1.6198121764998326, "learning_rate": 7.34714699945275e-06, - "loss": 0.2624, + "loss": 0.2594, "step": 2518 }, { "epoch": 0.3445964432284542, - "grad_norm": 1.325342292609325, + "grad_norm": 1.2954280973306673, "learning_rate": 7.345249428377788e-06, - "loss": 0.214, + "loss": 0.2125, "step": 2519 }, { "epoch": 0.3447332421340629, - "grad_norm": 1.1470028898683702, + "grad_norm": 1.1143008124606157, "learning_rate": 7.343351424137164e-06, - "loss": 0.2039, + "loss": 0.2018, "step": 2520 }, { "epoch": 0.34487004103967167, - "grad_norm": 1.4546990723888953, + "grad_norm": 1.4513694907795651, "learning_rate": 7.341452987081434e-06, - "loss": 0.2214, + "loss": 0.2207, "step": 2521 }, { "epoch": 0.3450068399452804, - "grad_norm": 1.670177721468268, + "grad_norm": 1.6554191597864896, "learning_rate": 7.339554117561241e-06, - "loss": 0.2208, + "loss": 0.2203, "step": 2522 }, { "epoch": 0.34514363885088917, - "grad_norm": 1.3133782846249837, + "grad_norm": 1.3015925858092845, "learning_rate": 7.337654815927303e-06, - "loss": 0.1857, + "loss": 0.1882, "step": 2523 }, { "epoch": 0.34528043775649797, - "grad_norm": 1.3488242068294412, + "grad_norm": 1.3416983175133954, "learning_rate": 7.33575508253042e-06, - "loss": 0.1923, + "loss": 0.1924, "step": 2524 }, { "epoch": 0.3454172366621067, - "grad_norm": 1.230382314363307, + "grad_norm": 1.2295591936129748, "learning_rate": 7.333854917721472e-06, - "loss": 0.2125, + "loss": 0.2132, "step": 2525 }, { "epoch": 0.34555403556771547, - "grad_norm": 1.2114027760547406, + "grad_norm": 1.1765159851903333, "learning_rate": 7.331954321851418e-06, - "loss": 0.2188, + "loss": 0.2147, "step": 2526 }, { "epoch": 0.3456908344733242, - "grad_norm": 1.3029343617757683, + "grad_norm": 1.2799699615993627, "learning_rate": 7.330053295271294e-06, - "loss": 0.2281, + "loss": 0.2266, "step": 2527 }, { "epoch": 0.34582763337893296, - "grad_norm": 1.5452333086801981, + "grad_norm": 1.4258630110806951, "learning_rate": 7.32815183833222e-06, - "loss": 0.1979, + "loss": 0.1918, "step": 2528 }, { "epoch": 0.3459644322845417, - "grad_norm": 1.4311080390411717, + "grad_norm": 1.4141414873284157, "learning_rate": 7.326249951385392e-06, - "loss": 0.1927, + "loss": 0.1925, "step": 2529 }, { "epoch": 0.34610123119015046, - "grad_norm": 1.188427040354449, + "grad_norm": 1.1961263109894682, "learning_rate": 7.324347634782091e-06, - "loss": 0.2174, + "loss": 0.2181, "step": 2530 }, { "epoch": 0.3462380300957592, - "grad_norm": 1.0610705008034167, + "grad_norm": 1.041749360772809, "learning_rate": 7.3224448888736685e-06, - "loss": 0.1499, + "loss": 0.1483, "step": 2531 }, { "epoch": 0.346374829001368, - "grad_norm": 1.2789783585001484, + "grad_norm": 1.2722392250651542, "learning_rate": 7.320541714011562e-06, - "loss": 0.2011, + "loss": 0.201, "step": 2532 }, { "epoch": 0.34651162790697676, - "grad_norm": 1.2577971239153063, + "grad_norm": 1.2471379923545383, "learning_rate": 7.318638110547288e-06, - "loss": 0.1899, + "loss": 0.1889, "step": 2533 }, { "epoch": 0.3466484268125855, - "grad_norm": 1.438699626898349, + "grad_norm": 1.4347575570650641, "learning_rate": 7.316734078832439e-06, - "loss": 0.2275, + "loss": 0.2296, "step": 2534 }, { "epoch": 0.34678522571819426, - "grad_norm": 1.251227312854788, + "grad_norm": 1.2209567601184275, "learning_rate": 7.3148296192186885e-06, - "loss": 0.1544, + "loss": 0.152, "step": 2535 }, { "epoch": 0.346922024623803, - "grad_norm": 1.2188454724720152, + "grad_norm": 1.1858795218718283, "learning_rate": 7.312924732057786e-06, - "loss": 0.1918, + "loss": 0.192, "step": 2536 }, { "epoch": 0.34705882352941175, - "grad_norm": 1.1862742560219683, + "grad_norm": 1.1748119679361082, "learning_rate": 7.311019417701567e-06, - "loss": 0.199, + "loss": 0.1984, "step": 2537 }, { "epoch": 0.3471956224350205, - "grad_norm": 1.2744818227486467, + "grad_norm": 1.2591079424158256, "learning_rate": 7.309113676501939e-06, - "loss": 0.2077, + "loss": 0.2094, "step": 2538 }, { "epoch": 0.34733242134062925, - "grad_norm": 1.2169522755961022, + "grad_norm": 1.2368561253960964, "learning_rate": 7.307207508810891e-06, - "loss": 0.2033, + "loss": 0.2069, "step": 2539 }, { "epoch": 0.34746922024623805, - "grad_norm": 1.3183425463837644, + "grad_norm": 1.3114270257008411, "learning_rate": 7.30530091498049e-06, - "loss": 0.1833, + "loss": 0.1832, "step": 2540 }, { "epoch": 0.3476060191518468, - "grad_norm": 1.3973445120320769, + "grad_norm": 1.3682194733960331, "learning_rate": 7.303393895362885e-06, - "loss": 0.2305, + "loss": 0.2298, "step": 2541 }, { "epoch": 0.34774281805745555, - "grad_norm": 1.1671012392537141, + "grad_norm": 1.1624131462687843, "learning_rate": 7.301486450310298e-06, - "loss": 0.1957, + "loss": 0.1962, "step": 2542 }, { "epoch": 0.3478796169630643, - "grad_norm": 1.399018672119812, + "grad_norm": 1.3726251317542912, "learning_rate": 7.2995785801750345e-06, - "loss": 0.1954, + "loss": 0.1945, "step": 2543 }, { "epoch": 0.34801641586867305, - "grad_norm": 1.4187055262486592, + "grad_norm": 1.3903009702125193, "learning_rate": 7.297670285309475e-06, - "loss": 0.213, + "loss": 0.211, "step": 2544 }, { "epoch": 0.3481532147742818, - "grad_norm": 1.3885823017193517, + "grad_norm": 1.3613138384925498, "learning_rate": 7.295761566066081e-06, - "loss": 0.2357, + "loss": 0.2348, "step": 2545 }, { "epoch": 0.34829001367989054, - "grad_norm": 1.4639250327677509, + "grad_norm": 1.4305954990741012, "learning_rate": 7.293852422797392e-06, - "loss": 0.2527, + "loss": 0.2516, "step": 2546 }, { "epoch": 0.3484268125854993, - "grad_norm": 1.5674538997708372, + "grad_norm": 1.5251771604341589, "learning_rate": 7.291942855856025e-06, - "loss": 0.2431, + "loss": 0.2386, "step": 2547 }, { "epoch": 0.3485636114911081, - "grad_norm": 1.1108307655906018, + "grad_norm": 1.0926750803538259, "learning_rate": 7.290032865594672e-06, - "loss": 0.1688, + "loss": 0.1682, "step": 2548 }, { "epoch": 0.34870041039671684, - "grad_norm": 1.7267232195693096, + "grad_norm": 2.119014660858647, "learning_rate": 7.288122452366112e-06, - "loss": 0.271, + "loss": 0.298, "step": 2549 }, { "epoch": 0.3488372093023256, - "grad_norm": 1.2329635337184486, + "grad_norm": 1.223235427207114, "learning_rate": 7.286211616523193e-06, - "loss": 0.2124, + "loss": 0.2123, "step": 2550 }, { "epoch": 0.34897400820793434, - "grad_norm": 1.3363024324985397, + "grad_norm": 1.2915606362889038, "learning_rate": 7.2843003584188465e-06, - "loss": 0.1951, + "loss": 0.1931, "step": 2551 }, { "epoch": 0.3491108071135431, - "grad_norm": 1.5043195167822137, + "grad_norm": 1.4751580148922603, "learning_rate": 7.28238867840608e-06, - "loss": 0.1939, + "loss": 0.192, "step": 2552 }, { "epoch": 0.34924760601915183, - "grad_norm": 1.2642678767498232, + "grad_norm": 1.259617147382147, "learning_rate": 7.2804765768379804e-06, - "loss": 0.1838, + "loss": 0.1812, "step": 2553 }, { "epoch": 0.3493844049247606, - "grad_norm": 1.4731624618619288, + "grad_norm": 1.4603791679444584, "learning_rate": 7.27856405406771e-06, - "loss": 0.2355, + "loss": 0.2361, "step": 2554 }, { "epoch": 0.34952120383036933, - "grad_norm": 1.3138303487638319, + "grad_norm": 1.2798103916392163, "learning_rate": 7.276651110448509e-06, - "loss": 0.2148, + "loss": 0.2138, "step": 2555 }, { "epoch": 0.34965800273597814, - "grad_norm": 1.1255066240746936, + "grad_norm": 1.1039577041863449, "learning_rate": 7.2747377463336974e-06, - "loss": 0.2128, + "loss": 0.2089, "step": 2556 }, { "epoch": 0.3497948016415869, - "grad_norm": 1.336202078896593, + "grad_norm": 1.2980590126222504, "learning_rate": 7.272823962076674e-06, - "loss": 0.221, + "loss": 0.2219, "step": 2557 }, { "epoch": 0.34993160054719563, - "grad_norm": 1.2062218169436216, + "grad_norm": 1.2008987232404706, "learning_rate": 7.270909758030912e-06, - "loss": 0.1956, + "loss": 0.1962, "step": 2558 }, { "epoch": 0.3500683994528044, - "grad_norm": 1.4149104443635658, + "grad_norm": 1.316684745872349, "learning_rate": 7.268995134549961e-06, - "loss": 0.1967, + "loss": 0.1872, "step": 2559 }, { "epoch": 0.35020519835841313, - "grad_norm": 1.3709298059265416, + "grad_norm": 1.3514214405883636, "learning_rate": 7.267080091987454e-06, - "loss": 0.164, + "loss": 0.1639, "step": 2560 }, { "epoch": 0.3503419972640219, - "grad_norm": 1.2519136633110322, + "grad_norm": 1.2295122564605498, "learning_rate": 7.2651646306970956e-06, - "loss": 0.1958, + "loss": 0.1941, "step": 2561 }, { "epoch": 0.3504787961696306, - "grad_norm": 1.5593588034444306, + "grad_norm": 1.5199711513975722, "learning_rate": 7.2632487510326714e-06, - "loss": 0.2202, + "loss": 0.2207, "step": 2562 }, { "epoch": 0.3506155950752394, - "grad_norm": 1.3096632661694798, + "grad_norm": 1.3147958843839387, "learning_rate": 7.26133245334804e-06, - "loss": 0.1999, + "loss": 0.2007, "step": 2563 }, { "epoch": 0.3507523939808482, - "grad_norm": 1.1103870597421548, + "grad_norm": 1.099618402511408, "learning_rate": 7.259415737997143e-06, - "loss": 0.1738, + "loss": 0.1746, "step": 2564 }, { "epoch": 0.3508891928864569, - "grad_norm": 1.1131872346817473, + "grad_norm": 1.0716707051860639, "learning_rate": 7.2574986053339956e-06, - "loss": 0.1767, + "loss": 0.1747, "step": 2565 }, { "epoch": 0.3510259917920657, - "grad_norm": 1.0710788503503796, + "grad_norm": 1.0560404230299223, "learning_rate": 7.2555810557126885e-06, - "loss": 0.1926, + "loss": 0.1913, "step": 2566 }, { "epoch": 0.3511627906976744, - "grad_norm": 1.239283770723868, + "grad_norm": 1.641559437022591, "learning_rate": 7.2536630894873946e-06, - "loss": 0.1864, + "loss": 0.1853, "step": 2567 }, { "epoch": 0.35129958960328317, - "grad_norm": 1.2534327601723514, + "grad_norm": 1.2453757546587076, "learning_rate": 7.25174470701236e-06, - "loss": 0.1935, + "loss": 0.1932, "step": 2568 }, { "epoch": 0.3514363885088919, - "grad_norm": 1.503428153991028, + "grad_norm": 1.502454618713811, "learning_rate": 7.249825908641908e-06, - "loss": 0.221, + "loss": 0.2222, "step": 2569 }, { "epoch": 0.35157318741450067, - "grad_norm": 1.3274856428562531, + "grad_norm": 1.3224250174553556, "learning_rate": 7.247906694730438e-06, - "loss": 0.183, + "loss": 0.1839, "step": 2570 }, { "epoch": 0.3517099863201094, - "grad_norm": 1.3009370948517411, + "grad_norm": 1.2866058013101331, "learning_rate": 7.2459870656324276e-06, - "loss": 0.1977, + "loss": 0.1974, "step": 2571 }, { "epoch": 0.3518467852257182, - "grad_norm": 1.5505830542290093, + "grad_norm": 1.5288468111540896, "learning_rate": 7.24406702170243e-06, - "loss": 0.2091, + "loss": 0.2103, "step": 2572 }, { "epoch": 0.35198358413132697, - "grad_norm": 1.3719571906835561, + "grad_norm": 1.3600620964303973, "learning_rate": 7.2421465632950784e-06, - "loss": 0.1999, + "loss": 0.2008, "step": 2573 }, { "epoch": 0.3521203830369357, - "grad_norm": 1.5678981141855381, + "grad_norm": 1.5308755177943116, "learning_rate": 7.2402256907650795e-06, - "loss": 0.2126, + "loss": 0.2118, "step": 2574 }, { "epoch": 0.35225718194254446, - "grad_norm": 1.5846848082197167, + "grad_norm": 1.3122359157213563, "learning_rate": 7.238304404467215e-06, - "loss": 0.2272, + "loss": 0.2254, "step": 2575 }, { "epoch": 0.3523939808481532, - "grad_norm": 1.3021265697195819, + "grad_norm": 1.28878841195338, "learning_rate": 7.236382704756345e-06, - "loss": 0.1987, + "loss": 0.2011, "step": 2576 }, { "epoch": 0.35253077975376196, - "grad_norm": 1.538389433562678, + "grad_norm": 1.5559504373645212, "learning_rate": 7.234460591987408e-06, - "loss": 0.2321, + "loss": 0.2375, "step": 2577 }, { "epoch": 0.3526675786593707, - "grad_norm": 1.1766432959546245, + "grad_norm": 1.1721728635337498, "learning_rate": 7.232538066515414e-06, - "loss": 0.1933, + "loss": 0.193, "step": 2578 }, { "epoch": 0.35280437756497945, - "grad_norm": 1.303798970151873, + "grad_norm": 1.2867173177405664, "learning_rate": 7.230615128695455e-06, - "loss": 0.2095, + "loss": 0.2082, "step": 2579 }, { "epoch": 0.35294117647058826, - "grad_norm": 1.2908402493120603, + "grad_norm": 1.2649108741633135, "learning_rate": 7.2286917788826926e-06, - "loss": 0.2005, + "loss": 0.1985, "step": 2580 }, { "epoch": 0.353077975376197, - "grad_norm": 1.2473316925168962, + "grad_norm": 1.2327297873818592, "learning_rate": 7.226768017432368e-06, - "loss": 0.2329, + "loss": 0.232, "step": 2581 }, { "epoch": 0.35321477428180575, - "grad_norm": 1.3919666101626431, + "grad_norm": 1.3925048882641078, "learning_rate": 7.2248438446998034e-06, - "loss": 0.2279, + "loss": 0.2286, "step": 2582 }, { "epoch": 0.3533515731874145, - "grad_norm": 1.2904092570911863, + "grad_norm": 1.3060643477357199, "learning_rate": 7.222919261040388e-06, - "loss": 0.2113, + "loss": 0.2148, "step": 2583 }, { "epoch": 0.35348837209302325, - "grad_norm": 1.3677198576862202, + "grad_norm": 1.3494835262064706, "learning_rate": 7.220994266809591e-06, - "loss": 0.2505, + "loss": 0.2479, "step": 2584 }, { "epoch": 0.353625170998632, - "grad_norm": 1.2913532743372438, + "grad_norm": 1.272777324631103, "learning_rate": 7.219068862362957e-06, - "loss": 0.2062, + "loss": 0.1985, "step": 2585 }, { "epoch": 0.35376196990424075, - "grad_norm": 1.0528535336295375, + "grad_norm": 1.0445531702284627, "learning_rate": 7.217143048056108e-06, - "loss": 0.1479, + "loss": 0.1472, "step": 2586 }, { "epoch": 0.3538987688098495, - "grad_norm": 1.2944498904506292, + "grad_norm": 1.2792199577479093, "learning_rate": 7.215216824244738e-06, - "loss": 0.1914, + "loss": 0.192, "step": 2587 }, { "epoch": 0.3540355677154583, - "grad_norm": 1.1382335630657279, + "grad_norm": 1.1275029296486663, "learning_rate": 7.2132901912846206e-06, - "loss": 0.2089, + "loss": 0.2109, "step": 2588 }, { "epoch": 0.35417236662106705, - "grad_norm": 1.3424779315701845, + "grad_norm": 1.3170231250131554, "learning_rate": 7.211363149531605e-06, - "loss": 0.1898, + "loss": 0.1889, "step": 2589 }, { "epoch": 0.3543091655266758, - "grad_norm": 0.9041396639687036, + "grad_norm": 0.8902794891707537, "learning_rate": 7.2094356993416134e-06, - "loss": 0.1399, + "loss": 0.138, "step": 2590 }, { "epoch": 0.35444596443228454, - "grad_norm": 1.3587683153126175, + "grad_norm": 1.3456607759245818, "learning_rate": 7.2075078410706425e-06, - "loss": 0.1637, + "loss": 0.1622, "step": 2591 }, { "epoch": 0.3545827633378933, - "grad_norm": 1.3256164175823428, + "grad_norm": 1.307239747677316, "learning_rate": 7.205579575074766e-06, - "loss": 0.1895, + "loss": 0.1861, "step": 2592 }, { "epoch": 0.35471956224350204, - "grad_norm": 1.2794649033385725, + "grad_norm": 1.2445856163344755, "learning_rate": 7.203650901710135e-06, - "loss": 0.2006, + "loss": 0.1975, "step": 2593 }, { "epoch": 0.3548563611491108, - "grad_norm": 1.3319987858533315, + "grad_norm": 1.3140025062394385, "learning_rate": 7.201721821332973e-06, - "loss": 0.1919, + "loss": 0.1914, "step": 2594 }, { "epoch": 0.35499316005471954, - "grad_norm": 1.3812152912652171, + "grad_norm": 1.3608015730294005, "learning_rate": 7.19979233429958e-06, - "loss": 0.1916, + "loss": 0.1893, "step": 2595 }, { "epoch": 0.35512995896032834, - "grad_norm": 1.2963627646348952, + "grad_norm": 1.314695802671204, "learning_rate": 7.19786244096633e-06, - "loss": 0.1857, + "loss": 0.1868, "step": 2596 }, { "epoch": 0.3552667578659371, - "grad_norm": 1.4339036520512962, + "grad_norm": 1.439741659180654, "learning_rate": 7.195932141689673e-06, - "loss": 0.2415, + "loss": 0.2425, "step": 2597 }, { "epoch": 0.35540355677154584, - "grad_norm": 1.3090394983090827, + "grad_norm": 1.2987715005385911, "learning_rate": 7.194001436826135e-06, - "loss": 0.2087, + "loss": 0.2098, "step": 2598 }, { "epoch": 0.3555403556771546, - "grad_norm": 1.3591286718229338, + "grad_norm": 1.3671117221422386, "learning_rate": 7.192070326732312e-06, - "loss": 0.2301, + "loss": 0.2318, "step": 2599 }, { "epoch": 0.35567715458276333, - "grad_norm": 1.6970894642139716, + "grad_norm": 1.6566806179088611, "learning_rate": 7.190138811764883e-06, - "loss": 0.261, + "loss": 0.2614, "step": 2600 }, { "epoch": 0.35567715458276333, - "eval_loss": 0.19764621555805206, - "eval_runtime": 5.9394, - "eval_samples_per_second": 5.051, - "eval_steps_per_second": 1.347, + "eval_loss": 0.19636346399784088, + "eval_runtime": 5.9324, + "eval_samples_per_second": 5.057, + "eval_steps_per_second": 1.349, "step": 2600 }, { "epoch": 0.3558139534883721, - "grad_norm": 1.3426651894852855, + "grad_norm": 1.3250710112422575, "learning_rate": 7.188206892280595e-06, - "loss": 0.2059, + "loss": 0.2055, "step": 2601 }, { "epoch": 0.35595075239398083, - "grad_norm": 1.094705328877852, + "grad_norm": 1.1157273716888818, "learning_rate": 7.18627456863627e-06, - "loss": 0.2028, + "loss": 0.2041, "step": 2602 }, { "epoch": 0.3560875512995896, - "grad_norm": 1.0539734399238252, + "grad_norm": 1.0427551415006984, "learning_rate": 7.184341841188811e-06, - "loss": 0.1842, + "loss": 0.1862, "step": 2603 }, { "epoch": 0.3562243502051984, - "grad_norm": 1.3729877768451213, + "grad_norm": 1.3635024506500866, "learning_rate": 7.182408710295186e-06, - "loss": 0.1913, + "loss": 0.1916, "step": 2604 }, { "epoch": 0.35636114911080713, - "grad_norm": 1.2231806834080865, + "grad_norm": 1.1719160469794538, "learning_rate": 7.180475176312449e-06, - "loss": 0.1797, + "loss": 0.1779, "step": 2605 }, { "epoch": 0.3564979480164159, - "grad_norm": 1.4609275798466155, + "grad_norm": 1.4493277457154197, "learning_rate": 7.1785412395977174e-06, - "loss": 0.2144, + "loss": 0.2135, "step": 2606 }, { "epoch": 0.3566347469220246, - "grad_norm": 1.411296979176403, + "grad_norm": 1.3366962854407831, "learning_rate": 7.176606900508189e-06, - "loss": 0.2045, + "loss": 0.2035, "step": 2607 }, { "epoch": 0.3567715458276334, - "grad_norm": 1.3945631015851918, + "grad_norm": 1.3887372934462492, "learning_rate": 7.174672159401135e-06, - "loss": 0.2097, + "loss": 0.2101, "step": 2608 }, { "epoch": 0.3569083447332421, - "grad_norm": 1.1987043015900856, + "grad_norm": 1.1880340059069108, "learning_rate": 7.1727370166338995e-06, - "loss": 0.217, + "loss": 0.2177, "step": 2609 }, { "epoch": 0.35704514363885087, - "grad_norm": 1.2964673997243712, + "grad_norm": 1.3429072928031633, "learning_rate": 7.170801472563903e-06, - "loss": 0.1917, + "loss": 0.1941, "step": 2610 }, { "epoch": 0.3571819425444596, - "grad_norm": 1.5922567667668373, + "grad_norm": 1.571247335097738, "learning_rate": 7.1688655275486386e-06, - "loss": 0.2637, + "loss": 0.2659, "step": 2611 }, { "epoch": 0.3573187414500684, - "grad_norm": 1.1767174808626109, + "grad_norm": 1.1650463824084103, "learning_rate": 7.166929181945673e-06, - "loss": 0.1815, + "loss": 0.1838, "step": 2612 }, { "epoch": 0.35745554035567717, - "grad_norm": 1.2936422639563776, + "grad_norm": 1.258568055122888, "learning_rate": 7.164992436112649e-06, - "loss": 0.2191, + "loss": 0.2171, "step": 2613 }, { "epoch": 0.3575923392612859, - "grad_norm": 2.1207777527099263, + "grad_norm": 2.3098031394425695, "learning_rate": 7.163055290407282e-06, - "loss": 0.2007, + "loss": 0.1919, "step": 2614 }, { "epoch": 0.35772913816689467, - "grad_norm": 1.3965524996478782, + "grad_norm": 1.3729615736714986, "learning_rate": 7.161117745187359e-06, - "loss": 0.2446, + "loss": 0.2458, "step": 2615 }, { "epoch": 0.3578659370725034, - "grad_norm": 1.654116222740577, + "grad_norm": 1.6352513535915232, "learning_rate": 7.159179800810745e-06, - "loss": 0.2658, + "loss": 0.2653, "step": 2616 }, { "epoch": 0.35800273597811216, - "grad_norm": 1.3494365175149898, + "grad_norm": 1.342713000061138, "learning_rate": 7.157241457635375e-06, - "loss": 0.1759, + "loss": 0.1778, "step": 2617 }, { "epoch": 0.3581395348837209, - "grad_norm": 1.3479646630374877, + "grad_norm": 1.34634555842858, "learning_rate": 7.155302716019263e-06, - "loss": 0.2243, + "loss": 0.2241, "step": 2618 }, { "epoch": 0.35827633378932966, - "grad_norm": 1.2460625407587425, + "grad_norm": 1.270056944734388, "learning_rate": 7.15336357632049e-06, - "loss": 0.1814, + "loss": 0.1815, "step": 2619 }, { "epoch": 0.35841313269493846, - "grad_norm": 1.2014267388126365, + "grad_norm": 1.1698533062783893, "learning_rate": 7.151424038897214e-06, - "loss": 0.1733, + "loss": 0.1731, "step": 2620 }, { "epoch": 0.3585499316005472, - "grad_norm": 1.2260272534323908, + "grad_norm": 1.2201380955360017, "learning_rate": 7.149484104107665e-06, - "loss": 0.1926, + "loss": 0.1913, "step": 2621 }, { "epoch": 0.35868673050615596, - "grad_norm": 1.4350944901028564, + "grad_norm": 1.4127393320089054, "learning_rate": 7.14754377231015e-06, - "loss": 0.2094, + "loss": 0.2039, "step": 2622 }, { "epoch": 0.3588235294117647, - "grad_norm": 1.4248003706415278, + "grad_norm": 1.4402823334386141, "learning_rate": 7.145603043863045e-06, - "loss": 0.1961, + "loss": 0.1967, "step": 2623 }, { "epoch": 0.35896032831737346, - "grad_norm": 1.1554411557366797, + "grad_norm": 1.1668618422302153, "learning_rate": 7.143661919124801e-06, - "loss": 0.191, + "loss": 0.1913, "step": 2624 }, { "epoch": 0.3590971272229822, - "grad_norm": 1.542325710969349, + "grad_norm": 1.558877401541979, "learning_rate": 7.141720398453944e-06, - "loss": 0.2041, + "loss": 0.207, "step": 2625 }, { "epoch": 0.35923392612859095, - "grad_norm": 1.1466914447430687, + "grad_norm": 1.135318036853384, "learning_rate": 7.1397784822090675e-06, - "loss": 0.1738, + "loss": 0.1727, "step": 2626 }, { "epoch": 0.3593707250341997, - "grad_norm": 1.3294786848715232, + "grad_norm": 1.317324702620019, "learning_rate": 7.137836170748847e-06, - "loss": 0.2265, + "loss": 0.2272, "step": 2627 }, { "epoch": 0.3595075239398085, - "grad_norm": 1.1462643423131398, + "grad_norm": 1.1234482696687051, "learning_rate": 7.135893464432019e-06, - "loss": 0.1941, + "loss": 0.1933, "step": 2628 }, { "epoch": 0.35964432284541725, - "grad_norm": 1.651541531679734, + "grad_norm": 1.6746948957101504, "learning_rate": 7.133950363617406e-06, - "loss": 0.2568, + "loss": 0.2587, "step": 2629 }, { "epoch": 0.359781121751026, - "grad_norm": 1.2642405470324736, + "grad_norm": 1.2750396590635213, "learning_rate": 7.132006868663895e-06, - "loss": 0.1975, + "loss": 0.1991, "step": 2630 }, { "epoch": 0.35991792065663475, - "grad_norm": 1.4792509295421208, + "grad_norm": 1.455759572367808, "learning_rate": 7.130062979930448e-06, - "loss": 0.2351, + "loss": 0.2324, "step": 2631 }, { "epoch": 0.3600547195622435, - "grad_norm": 1.357885796216185, + "grad_norm": 1.3721121466256079, "learning_rate": 7.128118697776097e-06, - "loss": 0.2439, + "loss": 0.2498, "step": 2632 }, { "epoch": 0.36019151846785225, - "grad_norm": 1.120198235547101, + "grad_norm": 1.1135239595165307, "learning_rate": 7.126174022559954e-06, - "loss": 0.177, + "loss": 0.178, "step": 2633 }, { "epoch": 0.360328317373461, - "grad_norm": 1.3013231771858598, + "grad_norm": 1.275515136533389, "learning_rate": 7.124228954641196e-06, - "loss": 0.2166, + "loss": 0.2124, "step": 2634 }, { "epoch": 0.36046511627906974, - "grad_norm": 1.3213417007048953, + "grad_norm": 1.3113492211524893, "learning_rate": 7.122283494379076e-06, - "loss": 0.1893, + "loss": 0.1871, "step": 2635 }, { "epoch": 0.36060191518467855, - "grad_norm": 1.3665330990202684, + "grad_norm": 1.342675241356422, "learning_rate": 7.120337642132919e-06, - "loss": 0.2013, + "loss": 0.1968, "step": 2636 }, { "epoch": 0.3607387140902873, - "grad_norm": 1.3689910112107837, + "grad_norm": 1.3368882298631815, "learning_rate": 7.1183913982621235e-06, - "loss": 0.2089, + "loss": 0.2076, "step": 2637 }, { "epoch": 0.36087551299589604, - "grad_norm": 1.3628418399807571, + "grad_norm": 1.3855582130373494, "learning_rate": 7.116444763126158e-06, - "loss": 0.1949, + "loss": 0.1937, "step": 2638 }, { "epoch": 0.3610123119015048, - "grad_norm": 1.3744164650151667, + "grad_norm": 1.3801713609648676, "learning_rate": 7.114497737084563e-06, - "loss": 0.2328, + "loss": 0.2325, "step": 2639 }, { "epoch": 0.36114911080711354, - "grad_norm": 1.3301355927271603, + "grad_norm": 1.3173405465514647, "learning_rate": 7.112550320496955e-06, - "loss": 0.2129, + "loss": 0.213, "step": 2640 }, { "epoch": 0.3612859097127223, - "grad_norm": 1.4473018031527, + "grad_norm": 1.4251171827232811, "learning_rate": 7.110602513723019e-06, - "loss": 0.2357, + "loss": 0.2359, "step": 2641 }, { "epoch": 0.36142270861833103, - "grad_norm": 1.4664565303363346, + "grad_norm": 1.461857063310874, "learning_rate": 7.108654317122515e-06, - "loss": 0.1998, + "loss": 0.2011, "step": 2642 }, { "epoch": 0.3615595075239398, - "grad_norm": 1.104062167273439, + "grad_norm": 1.1509578911958798, "learning_rate": 7.106705731055272e-06, "loss": 0.1433, "step": 2643 }, { "epoch": 0.3616963064295486, - "grad_norm": 1.2716557038856977, + "grad_norm": 1.2599058761705906, "learning_rate": 7.1047567558811905e-06, - "loss": 0.1578, + "loss": 0.1584, "step": 2644 }, { "epoch": 0.36183310533515733, - "grad_norm": 1.267201209883456, + "grad_norm": 1.3285061335823993, "learning_rate": 7.1028073919602484e-06, - "loss": 0.2016, + "loss": 0.1979, "step": 2645 }, { "epoch": 0.3619699042407661, - "grad_norm": 1.062615502100426, + "grad_norm": 1.0511805574134947, "learning_rate": 7.10085763965249e-06, - "loss": 0.1859, + "loss": 0.1809, "step": 2646 }, { "epoch": 0.36210670314637483, - "grad_norm": 1.4570326512189917, + "grad_norm": 1.437367847572388, "learning_rate": 7.09890749931803e-06, - "loss": 0.2744, + "loss": 0.2734, "step": 2647 }, { "epoch": 0.3622435020519836, - "grad_norm": 1.33512399767249, + "grad_norm": 1.3350968195369581, "learning_rate": 7.096956971317061e-06, - "loss": 0.2062, + "loss": 0.2083, "step": 2648 }, { "epoch": 0.3623803009575923, - "grad_norm": 1.0902230922288108, + "grad_norm": 1.097809239999261, "learning_rate": 7.095006056009846e-06, - "loss": 0.1765, + "loss": 0.177, "step": 2649 }, { "epoch": 0.3625170998632011, - "grad_norm": 1.32553086913143, + "grad_norm": 1.3054906505632966, "learning_rate": 7.0930547537567125e-06, - "loss": 0.2381, + "loss": 0.2368, "step": 2650 }, { "epoch": 0.3626538987688098, - "grad_norm": 1.60022069332575, + "grad_norm": 1.565449593302143, "learning_rate": 7.0911030649180675e-06, - "loss": 0.2367, + "loss": 0.2377, "step": 2651 }, { "epoch": 0.3627906976744186, - "grad_norm": 1.2515652624477664, + "grad_norm": 1.2439251008422025, "learning_rate": 7.089150989854385e-06, - "loss": 0.1982, + "loss": 0.1991, "step": 2652 }, { "epoch": 0.3629274965800274, - "grad_norm": 1.2915799585193075, + "grad_norm": 1.2694667572244864, "learning_rate": 7.087198528926214e-06, - "loss": 0.1939, + "loss": 0.1941, "step": 2653 }, { "epoch": 0.3630642954856361, - "grad_norm": 1.2056688956680748, + "grad_norm": 1.2021074983821964, "learning_rate": 7.085245682494168e-06, - "loss": 0.1961, + "loss": 0.1953, "step": 2654 }, { "epoch": 0.36320109439124487, - "grad_norm": 1.0855054889101536, + "grad_norm": 1.152744783496276, "learning_rate": 7.08329245091894e-06, - "loss": 0.1502, + "loss": 0.1513, "step": 2655 }, { "epoch": 0.3633378932968536, - "grad_norm": 1.5136057018112206, + "grad_norm": 1.4957832065618886, "learning_rate": 7.08133883456129e-06, - "loss": 0.2235, + "loss": 0.2214, "step": 2656 }, { "epoch": 0.36347469220246237, - "grad_norm": 1.4962707527770154, + "grad_norm": 1.4863858977361728, "learning_rate": 7.0793848337820484e-06, - "loss": 0.19, + "loss": 0.1916, "step": 2657 }, { "epoch": 0.3636114911080711, - "grad_norm": 1.5008504149607065, + "grad_norm": 1.4687971067372863, "learning_rate": 7.077430448942117e-06, - "loss": 0.2623, + "loss": 0.2608, "step": 2658 }, { "epoch": 0.36374829001367986, - "grad_norm": 1.3995630891802715, + "grad_norm": 1.3830618930000098, "learning_rate": 7.07547568040247e-06, - "loss": 0.1931, + "loss": 0.1942, "step": 2659 }, { "epoch": 0.36388508891928867, - "grad_norm": 1.3718360099234985, + "grad_norm": 1.3835064461720712, "learning_rate": 7.073520528524152e-06, - "loss": 0.2411, + "loss": 0.2449, "step": 2660 }, { "epoch": 0.3640218878248974, - "grad_norm": 1.2998006030861946, + "grad_norm": 1.2911867914903978, "learning_rate": 7.071564993668274e-06, - "loss": 0.2034, + "loss": 0.2037, "step": 2661 }, { "epoch": 0.36415868673050616, - "grad_norm": 1.31757963501023, + "grad_norm": 1.29665109327402, "learning_rate": 7.069609076196029e-06, - "loss": 0.193, + "loss": 0.1942, "step": 2662 }, { "epoch": 0.3642954856361149, - "grad_norm": 1.324957252313946, + "grad_norm": 1.3016092061879294, "learning_rate": 7.0676527764686675e-06, - "loss": 0.2026, + "loss": 0.2014, "step": 2663 }, { "epoch": 0.36443228454172366, - "grad_norm": 1.1288654851533246, + "grad_norm": 1.119560790887201, "learning_rate": 7.065696094847519e-06, - "loss": 0.183, + "loss": 0.184, "step": 2664 }, { "epoch": 0.3645690834473324, - "grad_norm": 1.244433116303088, + "grad_norm": 1.235633763146319, "learning_rate": 7.0637390316939805e-06, - "loss": 0.1929, + "loss": 0.1934, "step": 2665 }, { "epoch": 0.36470588235294116, - "grad_norm": 1.0516362462780668, + "grad_norm": 1.0729409967905446, "learning_rate": 7.061781587369518e-06, - "loss": 0.1762, + "loss": 0.1807, "step": 2666 }, { "epoch": 0.3648426812585499, - "grad_norm": 1.4563351135919582, + "grad_norm": 1.4361926598948853, "learning_rate": 7.0598237622356735e-06, - "loss": 0.2139, + "loss": 0.2117, "step": 2667 }, { "epoch": 0.3649794801641587, - "grad_norm": 1.3175525365512906, + "grad_norm": 1.2981879253297537, "learning_rate": 7.057865556654053e-06, "loss": 0.2063, "step": 2668 }, { "epoch": 0.36511627906976746, - "grad_norm": 1.1423124978232728, + "grad_norm": 1.1208976521137568, "learning_rate": 7.055906970986336e-06, - "loss": 0.1703, + "loss": 0.1689, "step": 2669 }, { "epoch": 0.3652530779753762, - "grad_norm": 1.256537089414045, + "grad_norm": 1.2226879033885005, "learning_rate": 7.053948005594273e-06, - "loss": 0.2148, + "loss": 0.2132, "step": 2670 }, { "epoch": 0.36538987688098495, - "grad_norm": 1.351325973963135, + "grad_norm": 1.3170773180451971, "learning_rate": 7.051988660839681e-06, - "loss": 0.242, + "loss": 0.2356, "step": 2671 }, { "epoch": 0.3655266757865937, - "grad_norm": 1.391330319494366, + "grad_norm": 1.3698859578110465, "learning_rate": 7.050028937084453e-06, - "loss": 0.2088, + "loss": 0.2068, "step": 2672 }, { "epoch": 0.36566347469220245, - "grad_norm": 1.381629778519383, + "grad_norm": 1.3572236633013148, "learning_rate": 7.048068834690544e-06, - "loss": 0.2244, + "loss": 0.2256, "step": 2673 }, { "epoch": 0.3658002735978112, - "grad_norm": 1.450084308634857, + "grad_norm": 1.3967821818053823, "learning_rate": 7.046108354019987e-06, - "loss": 0.2236, + "loss": 0.2182, "step": 2674 }, { "epoch": 0.36593707250341995, - "grad_norm": 1.1170427457598715, + "grad_norm": 1.1176570436338997, "learning_rate": 7.044147495434879e-06, - "loss": 0.1869, + "loss": 0.1864, "step": 2675 }, { "epoch": 0.36607387140902875, - "grad_norm": 1.1181419775128372, + "grad_norm": 1.1043075164731029, "learning_rate": 7.0421862592973885e-06, - "loss": 0.1976, + "loss": 0.1957, "step": 2676 }, { "epoch": 0.3662106703146375, - "grad_norm": 1.2824981189481346, + "grad_norm": 1.2695545559119745, "learning_rate": 7.0402246459697565e-06, - "loss": 0.1876, + "loss": 0.1873, "step": 2677 }, { "epoch": 0.36634746922024625, - "grad_norm": 1.571493491994605, + "grad_norm": 4.436899058058753, "learning_rate": 7.038262655814291e-06, - "loss": 0.2249, + "loss": 0.2212, "step": 2678 }, { "epoch": 0.366484268125855, - "grad_norm": 1.3915730884013955, + "grad_norm": 1.3530205878010688, "learning_rate": 7.036300289193369e-06, - "loss": 0.2004, + "loss": 0.1994, "step": 2679 }, { "epoch": 0.36662106703146374, - "grad_norm": 1.231463976351558, + "grad_norm": 1.2167960481073947, "learning_rate": 7.034337546469436e-06, - "loss": 0.2278, + "loss": 0.2282, "step": 2680 }, { "epoch": 0.3667578659370725, - "grad_norm": 1.1706769242389017, + "grad_norm": 1.1295560781482479, "learning_rate": 7.032374428005014e-06, - "loss": 0.169, + "loss": 0.1668, "step": 2681 }, { "epoch": 0.36689466484268124, - "grad_norm": 1.5301857927905496, + "grad_norm": 1.5317952100782763, "learning_rate": 7.030410934162685e-06, - "loss": 0.2584, + "loss": 0.2599, "step": 2682 }, { "epoch": 0.36703146374829, - "grad_norm": 1.2164551693013013, + "grad_norm": 1.2065268290855409, "learning_rate": 7.028447065305107e-06, - "loss": 0.1942, + "loss": 0.1945, "step": 2683 }, { "epoch": 0.3671682626538988, - "grad_norm": 1.345478566981935, + "grad_norm": 1.2967718938253585, "learning_rate": 7.026482821795003e-06, - "loss": 0.221, + "loss": 0.219, "step": 2684 }, { "epoch": 0.36730506155950754, - "grad_norm": 1.5397925267776302, + "grad_norm": 1.510671684599019, "learning_rate": 7.02451820399517e-06, - "loss": 0.2198, + "loss": 0.2189, "step": 2685 }, { "epoch": 0.3674418604651163, - "grad_norm": 1.4712689709142874, + "grad_norm": 1.3956177981700777, "learning_rate": 7.022553212268469e-06, - "loss": 0.2312, + "loss": 0.2293, "step": 2686 }, { "epoch": 0.36757865937072504, - "grad_norm": 1.2730962721919892, + "grad_norm": 3.434558581673476, "learning_rate": 7.020587846977833e-06, - "loss": 0.2072, + "loss": 0.221, "step": 2687 }, { "epoch": 0.3677154582763338, - "grad_norm": 1.2645390721260221, + "grad_norm": 1.2628542414169257, "learning_rate": 7.018622108486265e-06, - "loss": 0.2055, + "loss": 0.2064, "step": 2688 }, { "epoch": 0.36785225718194253, - "grad_norm": 1.0967566323721147, + "grad_norm": 1.1019522114700284, "learning_rate": 7.016655997156834e-06, - "loss": 0.1903, + "loss": 0.1886, "step": 2689 }, { "epoch": 0.3679890560875513, - "grad_norm": 1.1832803841829997, + "grad_norm": 1.1607376628763628, "learning_rate": 7.014689513352676e-06, - "loss": 0.2044, + "loss": 0.2025, "step": 2690 }, { "epoch": 0.36812585499316003, - "grad_norm": 1.0746268241238903, + "grad_norm": 1.0637644160158946, "learning_rate": 7.012722657437005e-06, - "loss": 0.1849, + "loss": 0.1832, "step": 2691 }, { "epoch": 0.36826265389876883, - "grad_norm": 1.4387664292636826, + "grad_norm": 1.4061685754573598, "learning_rate": 7.010755429773092e-06, - "loss": 0.2252, + "loss": 0.2235, "step": 2692 }, { "epoch": 0.3683994528043776, - "grad_norm": 1.5072292801064597, + "grad_norm": 1.4583797236287706, "learning_rate": 7.008787830724286e-06, - "loss": 0.2305, + "loss": 0.2303, "step": 2693 }, { "epoch": 0.36853625170998633, - "grad_norm": 1.1389106909771467, + "grad_norm": 1.1177792226244465, "learning_rate": 7.006819860654002e-06, - "loss": 0.1935, + "loss": 0.1942, "step": 2694 }, { "epoch": 0.3686730506155951, - "grad_norm": 1.1465227686118946, + "grad_norm": 1.1740953982801525, "learning_rate": 7.0048515199257185e-06, - "loss": 0.1602, + "loss": 0.1657, "step": 2695 }, { "epoch": 0.3688098495212038, - "grad_norm": 1.3498848596218813, + "grad_norm": 1.3171484921794137, "learning_rate": 7.002882808902989e-06, - "loss": 0.1927, + "loss": 0.193, "step": 2696 }, { "epoch": 0.3689466484268126, - "grad_norm": 1.448658072536231, + "grad_norm": 1.4454821454554392, "learning_rate": 7.000913727949431e-06, - "loss": 0.2048, + "loss": 0.2044, "step": 2697 }, { "epoch": 0.3690834473324213, - "grad_norm": 1.4359105785768427, + "grad_norm": 1.4202751808171554, "learning_rate": 6.998944277428734e-06, - "loss": 0.2138, + "loss": 0.2117, "step": 2698 }, { "epoch": 0.36922024623803007, - "grad_norm": 1.1121338272830374, + "grad_norm": 1.099495606147722, "learning_rate": 6.9969744577046536e-06, - "loss": 0.1369, + "loss": 0.135, "step": 2699 }, { "epoch": 0.3693570451436389, - "grad_norm": 1.245425926728154, + "grad_norm": 1.215413382816999, "learning_rate": 6.995004269141013e-06, "loss": 0.193, "step": 2700 }, { "epoch": 0.3693570451436389, - "eval_loss": 0.19416804611682892, - "eval_runtime": 5.9294, - "eval_samples_per_second": 5.059, - "eval_steps_per_second": 1.349, + "eval_loss": 0.1935214102268219, + "eval_runtime": 5.9344, + "eval_samples_per_second": 5.055, + "eval_steps_per_second": 1.348, "step": 2700 }, { "epoch": 0.3694938440492476, - "grad_norm": 1.0990103858514364, + "grad_norm": 1.0883515436396265, "learning_rate": 6.993033712101707e-06, - "loss": 0.2097, + "loss": 0.2067, "step": 2701 }, { "epoch": 0.36963064295485637, - "grad_norm": 1.2704509060551943, + "grad_norm": 1.2298725935047075, "learning_rate": 6.991062786950692e-06, - "loss": 0.1864, + "loss": 0.1838, "step": 2702 }, { "epoch": 0.3697674418604651, - "grad_norm": 1.215666152242995, + "grad_norm": 1.2059749497221577, "learning_rate": 6.989091494051998e-06, - "loss": 0.2136, + "loss": 0.2165, "step": 2703 }, { "epoch": 0.36990424076607387, - "grad_norm": 1.2186534772923208, + "grad_norm": 1.175588896405292, "learning_rate": 6.987119833769722e-06, - "loss": 0.1897, + "loss": 0.1889, "step": 2704 }, { "epoch": 0.3700410396716826, - "grad_norm": 1.2212956973693592, + "grad_norm": 1.1906422484361259, "learning_rate": 6.985147806468027e-06, - "loss": 0.2053, + "loss": 0.2059, "step": 2705 }, { "epoch": 0.37017783857729136, - "grad_norm": 1.5983464510975536, + "grad_norm": 1.5774661547154254, "learning_rate": 6.983175412511145e-06, - "loss": 0.2258, + "loss": 0.2243, "step": 2706 }, { "epoch": 0.3703146374829001, - "grad_norm": 1.1941455815315971, + "grad_norm": 1.1670736675836746, "learning_rate": 6.9812026522633765e-06, - "loss": 0.1989, + "loss": 0.1992, "step": 2707 }, { "epoch": 0.3704514363885089, - "grad_norm": 1.2415067027815903, + "grad_norm": 1.210695283787717, "learning_rate": 6.979229526089087e-06, - "loss": 0.1918, + "loss": 0.191, "step": 2708 }, { "epoch": 0.37058823529411766, - "grad_norm": 1.4682400661095356, + "grad_norm": 1.4224929547251985, "learning_rate": 6.977256034352713e-06, - "loss": 0.2356, + "loss": 0.232, "step": 2709 }, { "epoch": 0.3707250341997264, - "grad_norm": 1.2638134509427954, + "grad_norm": 1.2622102011379905, "learning_rate": 6.975282177418756e-06, - "loss": 0.2338, + "loss": 0.2359, "step": 2710 }, { "epoch": 0.37086183310533516, - "grad_norm": 1.4490686781865674, + "grad_norm": 1.395609659041359, "learning_rate": 6.973307955651787e-06, - "loss": 0.2385, + "loss": 0.2376, "step": 2711 }, { "epoch": 0.3709986320109439, - "grad_norm": 1.3289654966930917, + "grad_norm": 1.2896860668627224, "learning_rate": 6.971333369416439e-06, - "loss": 0.2071, + "loss": 0.203, "step": 2712 }, { "epoch": 0.37113543091655266, - "grad_norm": 1.3982316287236327, + "grad_norm": 1.3847358407280201, "learning_rate": 6.969358419077424e-06, - "loss": 0.1676, + "loss": 0.1667, "step": 2713 }, { "epoch": 0.3712722298221614, - "grad_norm": 1.3320159971506786, + "grad_norm": 1.315847716352278, "learning_rate": 6.967383104999506e-06, - "loss": 0.1843, + "loss": 0.1863, "step": 2714 }, { "epoch": 0.37140902872777015, - "grad_norm": 1.0918624956508525, + "grad_norm": 1.0856774935790516, "learning_rate": 6.965407427547528e-06, - "loss": 0.1962, + "loss": 0.1955, "step": 2715 }, { "epoch": 0.37154582763337896, - "grad_norm": 1.3930286595465884, + "grad_norm": 1.373568916990659, "learning_rate": 6.9634313870863945e-06, - "loss": 0.2101, + "loss": 0.208, "step": 2716 }, { "epoch": 0.3716826265389877, - "grad_norm": 1.195479237117091, + "grad_norm": 1.162071695454725, "learning_rate": 6.961454983981079e-06, - "loss": 0.1761, + "loss": 0.1752, "step": 2717 }, { "epoch": 0.37181942544459645, - "grad_norm": 1.1937953523160028, + "grad_norm": 1.22249193960803, "learning_rate": 6.959478218596625e-06, - "loss": 0.2013, + "loss": 0.2016, "step": 2718 }, { "epoch": 0.3719562243502052, - "grad_norm": 1.1746004752796246, + "grad_norm": 1.1790843799684148, "learning_rate": 6.957501091298133e-06, - "loss": 0.1981, + "loss": 0.1997, "step": 2719 }, { "epoch": 0.37209302325581395, - "grad_norm": 0.9788229744463318, + "grad_norm": 0.9548516939169153, "learning_rate": 6.95552360245078e-06, - "loss": 0.1527, + "loss": 0.1517, "step": 2720 }, { "epoch": 0.3722298221614227, - "grad_norm": 1.2389371252542616, + "grad_norm": 1.2182201444103002, "learning_rate": 6.953545752419808e-06, - "loss": 0.207, + "loss": 0.2063, "step": 2721 }, { "epoch": 0.37236662106703144, - "grad_norm": 1.3749569180741383, + "grad_norm": 1.3667511123562495, "learning_rate": 6.951567541570523e-06, - "loss": 0.1948, + "loss": 0.1951, "step": 2722 }, { "epoch": 0.3725034199726402, - "grad_norm": 1.183325373684398, + "grad_norm": 1.1773160676740575, "learning_rate": 6.949588970268299e-06, "loss": 0.1991, "step": 2723 }, { "epoch": 0.372640218878249, - "grad_norm": 1.2338076565992322, + "grad_norm": 1.2167285637024419, "learning_rate": 6.947610038878575e-06, - "loss": 0.2068, + "loss": 0.2054, "step": 2724 }, { "epoch": 0.37277701778385774, - "grad_norm": 1.179224655916963, + "grad_norm": 1.1649895507797117, "learning_rate": 6.94563074776686e-06, - "loss": 0.2077, + "loss": 0.2072, "step": 2725 }, { "epoch": 0.3729138166894665, - "grad_norm": 1.2422792848183457, + "grad_norm": 1.2146646731738313, "learning_rate": 6.943651097298727e-06, - "loss": 0.2075, + "loss": 0.2064, "step": 2726 }, { "epoch": 0.37305061559507524, - "grad_norm": 1.1010083966406838, + "grad_norm": 1.098906652860567, "learning_rate": 6.941671087839815e-06, - "loss": 0.1644, + "loss": 0.1637, "step": 2727 }, { "epoch": 0.373187414500684, - "grad_norm": 1.2161705641947613, + "grad_norm": 1.2078429124288093, "learning_rate": 6.939690719755832e-06, - "loss": 0.2133, + "loss": 0.2131, "step": 2728 }, { "epoch": 0.37332421340629274, - "grad_norm": 1.328414280846452, + "grad_norm": 1.28946109563606, "learning_rate": 6.937709993412549e-06, - "loss": 0.2165, + "loss": 0.2149, "step": 2729 }, { "epoch": 0.3734610123119015, - "grad_norm": 1.4773750007020994, + "grad_norm": 1.4794063529262214, "learning_rate": 6.935728909175805e-06, - "loss": 0.2655, + "loss": 0.2639, "step": 2730 }, { "epoch": 0.37359781121751023, - "grad_norm": 1.3317559625380586, + "grad_norm": 1.296049300499721, "learning_rate": 6.933747467411504e-06, - "loss": 0.2066, + "loss": 0.2054, "step": 2731 }, { "epoch": 0.37373461012311904, - "grad_norm": 1.2492423793061385, + "grad_norm": 1.2324547777032615, "learning_rate": 6.931765668485616e-06, - "loss": 0.1901, + "loss": 0.1867, "step": 2732 }, { "epoch": 0.3738714090287278, - "grad_norm": 1.3913916824934032, + "grad_norm": 1.3459129584140084, "learning_rate": 6.929783512764181e-06, - "loss": 0.1969, + "loss": 0.1961, "step": 2733 }, { "epoch": 0.37400820793433653, - "grad_norm": 1.4312001268154266, + "grad_norm": 1.4063627541093497, "learning_rate": 6.927801000613298e-06, - "loss": 0.2126, + "loss": 0.2128, "step": 2734 }, { "epoch": 0.3741450068399453, - "grad_norm": 1.5643755833949597, + "grad_norm": 1.5626281415772416, "learning_rate": 6.9258181323991354e-06, - "loss": 0.2335, + "loss": 0.2319, "step": 2735 }, { "epoch": 0.37428180574555403, - "grad_norm": 1.1353117703900253, + "grad_norm": 1.135736074017847, "learning_rate": 6.92383490848793e-06, - "loss": 0.2071, + "loss": 0.2074, "step": 2736 }, { "epoch": 0.3744186046511628, - "grad_norm": 1.201903106735783, + "grad_norm": 1.1645943248490909, "learning_rate": 6.921851329245981e-06, - "loss": 0.1891, + "loss": 0.1864, "step": 2737 }, { "epoch": 0.3745554035567715, - "grad_norm": 1.3701165529554247, + "grad_norm": 1.3487581701971116, "learning_rate": 6.919867395039652e-06, - "loss": 0.1803, + "loss": 0.1783, "step": 2738 }, { "epoch": 0.3746922024623803, - "grad_norm": 1.4505769508131938, + "grad_norm": 1.4605485163003922, "learning_rate": 6.917883106235376e-06, - "loss": 0.1943, + "loss": 0.1942, "step": 2739 }, { "epoch": 0.3748290013679891, - "grad_norm": 1.4406201294340246, + "grad_norm": 1.4597739276112647, "learning_rate": 6.9158984631996475e-06, - "loss": 0.2139, + "loss": 0.2119, "step": 2740 }, { "epoch": 0.3749658002735978, - "grad_norm": 1.3284571313130868, + "grad_norm": 1.32653052431558, "learning_rate": 6.91391346629903e-06, - "loss": 0.2253, + "loss": 0.2267, "step": 2741 }, { "epoch": 0.3751025991792066, - "grad_norm": 1.1551776247694574, + "grad_norm": 1.1733677909916278, "learning_rate": 6.911928115900151e-06, - "loss": 0.1862, + "loss": 0.1883, "step": 2742 }, { "epoch": 0.3752393980848153, - "grad_norm": 1.155647803142568, + "grad_norm": 1.1426402533683817, "learning_rate": 6.909942412369703e-06, - "loss": 0.1788, + "loss": 0.178, "step": 2743 }, { "epoch": 0.37537619699042407, - "grad_norm": 1.189096599385164, + "grad_norm": 1.172956989048201, "learning_rate": 6.907956356074443e-06, - "loss": 0.154, + "loss": 0.1534, "step": 2744 }, { "epoch": 0.3755129958960328, - "grad_norm": 1.2104431299728562, + "grad_norm": 1.1957614204010767, "learning_rate": 6.905969947381194e-06, - "loss": 0.1856, + "loss": 0.1837, "step": 2745 }, { "epoch": 0.37564979480164157, - "grad_norm": 1.0369490778591526, + "grad_norm": 1.026538848887849, "learning_rate": 6.9039831866568445e-06, - "loss": 0.165, + "loss": 0.1638, "step": 2746 }, { "epoch": 0.3757865937072503, - "grad_norm": 1.257436472628689, + "grad_norm": 1.2426091471142564, "learning_rate": 6.901996074268348e-06, - "loss": 0.1967, + "loss": 0.1954, "step": 2747 }, { "epoch": 0.3759233926128591, - "grad_norm": 1.3521788070128622, + "grad_norm": 1.32486036227102, "learning_rate": 6.90000861058272e-06, - "loss": 0.2222, + "loss": 0.2233, "step": 2748 }, { "epoch": 0.37606019151846787, - "grad_norm": 1.1931484683359241, + "grad_norm": 1.2231307728357375, "learning_rate": 6.898020795967049e-06, - "loss": 0.1863, + "loss": 0.1876, "step": 2749 }, { "epoch": 0.3761969904240766, - "grad_norm": 1.0930911812241282, + "grad_norm": 1.0918433758201371, "learning_rate": 6.896032630788476e-06, - "loss": 0.1655, + "loss": 0.166, "step": 2750 }, { "epoch": 0.37633378932968536, - "grad_norm": 1.3721223291539106, + "grad_norm": 1.3607598283208417, "learning_rate": 6.894044115414219e-06, - "loss": 0.2468, + "loss": 0.2476, "step": 2751 }, { "epoch": 0.3764705882352941, - "grad_norm": 1.3240840671854828, + "grad_norm": 1.2978174185866844, "learning_rate": 6.892055250211552e-06, - "loss": 0.1756, + "loss": 0.1733, "step": 2752 }, { "epoch": 0.37660738714090286, - "grad_norm": 1.3544646604818606, + "grad_norm": 1.35003989555592, "learning_rate": 6.890066035547819e-06, - "loss": 0.2099, + "loss": 0.2111, "step": 2753 }, { "epoch": 0.3767441860465116, - "grad_norm": 1.3274417868497592, + "grad_norm": 1.3593063436183173, "learning_rate": 6.888076471790423e-06, - "loss": 0.2251, + "loss": 0.2226, "step": 2754 }, { "epoch": 0.37688098495212036, - "grad_norm": 1.1588292215640275, + "grad_norm": 1.1503745250956077, "learning_rate": 6.8860865593068395e-06, - "loss": 0.1934, + "loss": 0.1918, "step": 2755 }, { "epoch": 0.37701778385772916, - "grad_norm": 1.3425912693499318, + "grad_norm": 1.3264186196998693, "learning_rate": 6.884096298464597e-06, "loss": 0.2031, "step": 2756 }, { "epoch": 0.3771545827633379, - "grad_norm": 1.1983504177385618, + "grad_norm": 1.179953423069347, "learning_rate": 6.8821056896313025e-06, - "loss": 0.1865, + "loss": 0.1871, "step": 2757 }, { "epoch": 0.37729138166894666, - "grad_norm": 0.9467431127621679, + "grad_norm": 0.9495494373397856, "learning_rate": 6.880114733174615e-06, - "loss": 0.1475, + "loss": 0.1483, "step": 2758 }, { "epoch": 0.3774281805745554, - "grad_norm": 1.2703011854398982, + "grad_norm": 1.2769581912506847, "learning_rate": 6.8781234294622644e-06, - "loss": 0.1963, + "loss": 0.1979, "step": 2759 }, { "epoch": 0.37756497948016415, - "grad_norm": 1.1862902595519007, + "grad_norm": 1.1749119787692968, "learning_rate": 6.876131778862044e-06, - "loss": 0.1772, + "loss": 0.1794, "step": 2760 }, { "epoch": 0.3777017783857729, - "grad_norm": 1.3034391937786984, + "grad_norm": 1.2891583779278162, "learning_rate": 6.874139781741808e-06, - "loss": 0.1729, + "loss": 0.1713, "step": 2761 }, { "epoch": 0.37783857729138165, - "grad_norm": 1.2157345738808534, + "grad_norm": 1.225655354580278, "learning_rate": 6.872147438469477e-06, - "loss": 0.1758, + "loss": 0.1798, "step": 2762 }, { "epoch": 0.3779753761969904, - "grad_norm": 1.090663458418277, + "grad_norm": 1.0987238928787035, "learning_rate": 6.870154749413033e-06, - "loss": 0.196, + "loss": 0.1962, "step": 2763 }, { "epoch": 0.3781121751025992, - "grad_norm": 1.1863416894384413, + "grad_norm": 1.2064425116025888, "learning_rate": 6.868161714940528e-06, - "loss": 0.209, + "loss": 0.2062, "step": 2764 }, { "epoch": 0.37824897400820795, - "grad_norm": 1.2482039901555178, + "grad_norm": 1.2483987254971989, "learning_rate": 6.866168335420073e-06, - "loss": 0.1887, + "loss": 0.19, "step": 2765 }, { "epoch": 0.3783857729138167, - "grad_norm": 1.6098768744431637, + "grad_norm": 1.567825860601201, "learning_rate": 6.864174611219841e-06, - "loss": 0.2211, + "loss": 0.2189, "step": 2766 }, { "epoch": 0.37852257181942545, - "grad_norm": 1.374817542921255, + "grad_norm": 1.369077841788501, "learning_rate": 6.8621805427080756e-06, - "loss": 0.232, + "loss": 0.2326, "step": 2767 }, { "epoch": 0.3786593707250342, - "grad_norm": 1.6351010364130332, + "grad_norm": 1.6322588368406146, "learning_rate": 6.860186130253076e-06, - "loss": 0.1903, + "loss": 0.1919, "step": 2768 }, { "epoch": 0.37879616963064294, - "grad_norm": 1.1676227063207525, + "grad_norm": 1.176139190931157, "learning_rate": 6.85819137422321e-06, - "loss": 0.1901, + "loss": 0.1928, "step": 2769 }, { "epoch": 0.3789329685362517, - "grad_norm": 0.9559812134981204, + "grad_norm": 0.96262651375877, "learning_rate": 6.856196274986907e-06, - "loss": 0.1654, + "loss": 0.1678, "step": 2770 }, { "epoch": 0.37906976744186044, - "grad_norm": 1.3117513572335284, + "grad_norm": 1.2586701754682854, "learning_rate": 6.854200832912659e-06, - "loss": 0.1814, + "loss": 0.1802, "step": 2771 }, { "epoch": 0.37920656634746924, - "grad_norm": 1.6393023810275285, + "grad_norm": 1.627357372750214, "learning_rate": 6.852205048369025e-06, - "loss": 0.2642, + "loss": 0.2658, "step": 2772 }, { "epoch": 0.379343365253078, - "grad_norm": 1.291879150390528, + "grad_norm": 1.2812111199567167, "learning_rate": 6.850208921724624e-06, - "loss": 0.1731, + "loss": 0.1757, "step": 2773 }, { "epoch": 0.37948016415868674, - "grad_norm": 1.182809057938539, + "grad_norm": 1.1654896314161753, "learning_rate": 6.848212453348137e-06, - "loss": 0.1759, + "loss": 0.1743, "step": 2774 }, { "epoch": 0.3796169630642955, - "grad_norm": 1.089324190429494, + "grad_norm": 1.1214238806720314, "learning_rate": 6.8462156436083135e-06, - "loss": 0.1742, + "loss": 0.1759, "step": 2775 }, { "epoch": 0.37975376196990424, - "grad_norm": 1.1543108792759782, + "grad_norm": 1.144129151067905, "learning_rate": 6.844218492873961e-06, - "loss": 0.1698, + "loss": 0.1687, "step": 2776 }, { "epoch": 0.379890560875513, - "grad_norm": 1.2753054269503095, + "grad_norm": 1.2654062378697934, "learning_rate": 6.84222100151395e-06, - "loss": 0.1918, + "loss": 0.1911, "step": 2777 }, { "epoch": 0.38002735978112173, - "grad_norm": 1.337551995892083, + "grad_norm": 1.3224727839460153, "learning_rate": 6.840223169897218e-06, - "loss": 0.2239, + "loss": 0.223, "step": 2778 }, { "epoch": 0.3801641586867305, - "grad_norm": 1.3757534658792556, + "grad_norm": 1.362272314995992, "learning_rate": 6.838224998392761e-06, - "loss": 0.2047, + "loss": 0.2062, "step": 2779 }, { "epoch": 0.3803009575923393, - "grad_norm": 1.2482542459095296, + "grad_norm": 1.2518723449911597, "learning_rate": 6.836226487369639e-06, - "loss": 0.2219, + "loss": 0.223, "step": 2780 }, { "epoch": 0.38043775649794803, - "grad_norm": 1.2756501171795296, + "grad_norm": 1.2603240828861724, "learning_rate": 6.83422763719698e-06, - "loss": 0.1937, + "loss": 0.1941, "step": 2781 }, { "epoch": 0.3805745554035568, - "grad_norm": 1.5520682763451106, + "grad_norm": 1.5496858644495488, "learning_rate": 6.832228448243965e-06, - "loss": 0.2385, + "loss": 0.2384, "step": 2782 }, { "epoch": 0.38071135430916553, - "grad_norm": 1.3696370510939369, + "grad_norm": 1.3529757146762038, "learning_rate": 6.8302289208798455e-06, - "loss": 0.2143, + "loss": 0.2119, "step": 2783 }, { "epoch": 0.3808481532147743, - "grad_norm": 1.5450832756400705, + "grad_norm": 1.5198130920558912, "learning_rate": 6.828229055473932e-06, - "loss": 0.2432, + "loss": 0.2439, "step": 2784 }, { "epoch": 0.380984952120383, - "grad_norm": 1.138289212847524, + "grad_norm": 1.1231772262828374, "learning_rate": 6.826228852395596e-06, - "loss": 0.1687, + "loss": 0.168, "step": 2785 }, { "epoch": 0.3811217510259918, - "grad_norm": 1.234033803912425, + "grad_norm": 1.2405655139011575, "learning_rate": 6.824228312014275e-06, - "loss": 0.1899, + "loss": 0.1914, "step": 2786 }, { "epoch": 0.3812585499316005, - "grad_norm": 1.4926512407161387, + "grad_norm": 1.443995216231367, "learning_rate": 6.822227434699468e-06, - "loss": 0.226, + "loss": 0.2226, "step": 2787 }, { "epoch": 0.3813953488372093, - "grad_norm": 1.1307000329382928, + "grad_norm": 1.1152248445670137, "learning_rate": 6.820226220820733e-06, - "loss": 0.1624, + "loss": 0.1611, "step": 2788 }, { "epoch": 0.3815321477428181, - "grad_norm": 1.6763480689842525, + "grad_norm": 1.6475601201356476, "learning_rate": 6.818224670747695e-06, - "loss": 0.2473, + "loss": 0.2466, "step": 2789 }, { "epoch": 0.3816689466484268, - "grad_norm": 1.3546085551775395, + "grad_norm": 1.3509358374017448, "learning_rate": 6.816222784850039e-06, - "loss": 0.233, + "loss": 0.2311, "step": 2790 }, { "epoch": 0.38180574555403557, - "grad_norm": 1.366230494561806, + "grad_norm": 1.3590977600118637, "learning_rate": 6.814220563497509e-06, - "loss": 0.2036, + "loss": 0.2015, "step": 2791 }, { "epoch": 0.3819425444596443, - "grad_norm": 1.1258865263439177, + "grad_norm": 1.1156314497203392, "learning_rate": 6.812218007059915e-06, - "loss": 0.179, + "loss": 0.1779, "step": 2792 }, { "epoch": 0.38207934336525307, - "grad_norm": 1.30570930352002, + "grad_norm": 1.2932958767989855, "learning_rate": 6.810215115907129e-06, - "loss": 0.2149, + "loss": 0.2143, "step": 2793 }, { "epoch": 0.3822161422708618, - "grad_norm": 1.1899614175024444, + "grad_norm": 1.18181813961475, "learning_rate": 6.80821189040908e-06, - "loss": 0.1987, + "loss": 0.1968, "step": 2794 }, { "epoch": 0.38235294117647056, - "grad_norm": 1.322072887815167, + "grad_norm": 1.2991902914908307, "learning_rate": 6.806208330935766e-06, - "loss": 0.2129, + "loss": 0.2089, "step": 2795 }, { "epoch": 0.38248974008207937, - "grad_norm": 1.4825191942498321, + "grad_norm": 1.4777661025259325, "learning_rate": 6.804204437857239e-06, - "loss": 0.2225, + "loss": 0.221, "step": 2796 }, { "epoch": 0.3826265389876881, - "grad_norm": 1.1627101822955084, + "grad_norm": 1.1419671285263777, "learning_rate": 6.8022002115436205e-06, - "loss": 0.174, + "loss": 0.172, "step": 2797 }, { "epoch": 0.38276333789329686, - "grad_norm": 1.1323540410041957, + "grad_norm": 1.1405646548942314, "learning_rate": 6.800195652365087e-06, - "loss": 0.1711, + "loss": 0.1718, "step": 2798 }, { "epoch": 0.3829001367989056, - "grad_norm": 1.4048713383144926, + "grad_norm": 1.4027448531697808, "learning_rate": 6.7981907606918785e-06, - "loss": 0.2038, + "loss": 0.2045, "step": 2799 }, { "epoch": 0.38303693570451436, - "grad_norm": 1.3550206060467078, + "grad_norm": 1.342893559974397, "learning_rate": 6.796185536894298e-06, - "loss": 0.2, + "loss": 0.1974, "step": 2800 }, { "epoch": 0.38303693570451436, - "eval_loss": 0.19436153769493103, - "eval_runtime": 5.9333, - "eval_samples_per_second": 5.056, - "eval_steps_per_second": 1.348, + "eval_loss": 0.19485409557819366, + "eval_runtime": 5.926, + "eval_samples_per_second": 5.062, + "eval_steps_per_second": 1.35, "step": 2800 }, { "epoch": 0.3831737346101231, - "grad_norm": 1.3638255843839786, + "grad_norm": 1.3303928285831816, "learning_rate": 6.794179981342708e-06, - "loss": 0.2501, + "loss": 0.2493, "step": 2801 }, { "epoch": 0.38331053351573185, - "grad_norm": 1.3996298834250642, + "grad_norm": 1.3620656631857053, "learning_rate": 6.792174094407533e-06, - "loss": 0.2367, + "loss": 0.2365, "step": 2802 }, { "epoch": 0.3834473324213406, - "grad_norm": 1.1314238836538655, + "grad_norm": 1.1488243595776733, "learning_rate": 6.79016787645926e-06, - "loss": 0.2049, + "loss": 0.206, "step": 2803 }, { "epoch": 0.3835841313269494, - "grad_norm": 1.5751605645032516, + "grad_norm": 1.5440515677649975, "learning_rate": 6.788161327868435e-06, - "loss": 0.2473, + "loss": 0.2468, "step": 2804 }, { "epoch": 0.38372093023255816, - "grad_norm": 1.5809641841562907, + "grad_norm": 1.5583299282357215, "learning_rate": 6.786154449005664e-06, - "loss": 0.2453, + "loss": 0.243, "step": 2805 }, { "epoch": 0.3838577291381669, - "grad_norm": 1.5013535496475898, + "grad_norm": 1.50068718341572, "learning_rate": 6.78414724024162e-06, - "loss": 0.2209, + "loss": 0.2226, "step": 2806 }, { "epoch": 0.38399452804377565, - "grad_norm": 1.2176416937823749, + "grad_norm": 1.2168465395780788, "learning_rate": 6.782139701947029e-06, - "loss": 0.2028, + "loss": 0.2036, "step": 2807 }, { "epoch": 0.3841313269493844, - "grad_norm": 1.4873443647176106, + "grad_norm": 1.4658681498495176, "learning_rate": 6.780131834492683e-06, - "loss": 0.2394, + "loss": 0.2384, "step": 2808 }, { "epoch": 0.38426812585499315, - "grad_norm": 1.3030940844090604, + "grad_norm": 1.280758184344728, "learning_rate": 6.778123638249434e-06, - "loss": 0.1811, + "loss": 0.1804, "step": 2809 }, { "epoch": 0.3844049247606019, - "grad_norm": 1.1642401599587302, + "grad_norm": 1.0889970845955468, "learning_rate": 6.776115113588195e-06, - "loss": 0.1728, + "loss": 0.171, "step": 2810 }, { "epoch": 0.38454172366621064, - "grad_norm": 1.2858703583103028, + "grad_norm": 1.2538770225970342, "learning_rate": 6.774106260879938e-06, - "loss": 0.1847, + "loss": 0.185, "step": 2811 }, { "epoch": 0.38467852257181945, - "grad_norm": 1.3297735605783036, + "grad_norm": 1.2865981525354968, "learning_rate": 6.772097080495694e-06, - "loss": 0.2104, + "loss": 0.2124, "step": 2812 }, { "epoch": 0.3848153214774282, - "grad_norm": 1.3976682158693725, + "grad_norm": 1.389188986260412, "learning_rate": 6.770087572806561e-06, "loss": 0.1984, "step": 2813 }, { "epoch": 0.38495212038303694, - "grad_norm": 1.110913901598874, + "grad_norm": 1.0773338381675042, "learning_rate": 6.76807773818369e-06, - "loss": 0.1705, + "loss": 0.1697, "step": 2814 }, { "epoch": 0.3850889192886457, - "grad_norm": 1.272658672225736, + "grad_norm": 1.2562291332747153, "learning_rate": 6.766067576998297e-06, - "loss": 0.1825, + "loss": 0.1832, "step": 2815 }, { "epoch": 0.38522571819425444, - "grad_norm": 1.2231627813153014, + "grad_norm": 1.2319659201240434, "learning_rate": 6.764057089621661e-06, - "loss": 0.1599, + "loss": 0.1594, "step": 2816 }, { "epoch": 0.3853625170998632, - "grad_norm": 1.2347462139886183, + "grad_norm": 1.249658709258947, "learning_rate": 6.762046276425112e-06, - "loss": 0.2029, + "loss": 0.2035, "step": 2817 }, { "epoch": 0.38549931600547194, - "grad_norm": 1.5383149551070479, + "grad_norm": 1.509351030865843, "learning_rate": 6.7600351377800466e-06, - "loss": 0.2537, + "loss": 0.2521, "step": 2818 }, { "epoch": 0.3856361149110807, - "grad_norm": 1.183403983113214, + "grad_norm": 1.1639631780725963, "learning_rate": 6.758023674057923e-06, - "loss": 0.1681, + "loss": 0.1682, "step": 2819 }, { "epoch": 0.3857729138166895, - "grad_norm": 1.5284475751979236, + "grad_norm": 1.5254869563084805, "learning_rate": 6.7560118856302535e-06, - "loss": 0.2112, + "loss": 0.2138, "step": 2820 }, { "epoch": 0.38590971272229824, - "grad_norm": 1.3278936148699838, + "grad_norm": 1.3012923915597712, "learning_rate": 6.753999772868617e-06, - "loss": 0.2033, + "loss": 0.2023, "step": 2821 }, { "epoch": 0.386046511627907, - "grad_norm": 1.2933194261595733, + "grad_norm": 1.287611128808451, "learning_rate": 6.7519873361446475e-06, - "loss": 0.2007, + "loss": 0.2005, "step": 2822 }, { "epoch": 0.38618331053351573, - "grad_norm": 1.3964214155717514, + "grad_norm": 1.3402791670059602, "learning_rate": 6.74997457583004e-06, - "loss": 0.2212, + "loss": 0.2178, "step": 2823 }, { "epoch": 0.3863201094391245, - "grad_norm": 1.4075707477123436, + "grad_norm": 1.3397086910469385, "learning_rate": 6.7479614922965506e-06, - "loss": 0.2142, + "loss": 0.2116, "step": 2824 }, { "epoch": 0.38645690834473323, - "grad_norm": 1.2085544330306024, + "grad_norm": 1.1784072851175535, "learning_rate": 6.745948085915996e-06, - "loss": 0.1628, + "loss": 0.161, "step": 2825 }, { "epoch": 0.386593707250342, - "grad_norm": 1.6644580485278961, + "grad_norm": 1.6465415962657084, "learning_rate": 6.743934357060247e-06, - "loss": 0.2843, + "loss": 0.292, "step": 2826 }, { "epoch": 0.3867305061559507, - "grad_norm": 1.3203311324746057, + "grad_norm": 1.2868962046796257, "learning_rate": 6.741920306101239e-06, - "loss": 0.222, + "loss": 0.2164, "step": 2827 }, { "epoch": 0.38686730506155953, - "grad_norm": 1.0606894735532484, + "grad_norm": 1.0506611107610633, "learning_rate": 6.739905933410965e-06, - "loss": 0.1799, + "loss": 0.1806, "step": 2828 }, { "epoch": 0.3870041039671683, - "grad_norm": 1.176435268275883, + "grad_norm": 1.1452143652661386, "learning_rate": 6.73789123936148e-06, - "loss": 0.1718, + "loss": 0.1717, "step": 2829 }, { "epoch": 0.387140902872777, - "grad_norm": 1.3483138069319331, + "grad_norm": 1.2251008807634711, "learning_rate": 6.735876224324895e-06, - "loss": 0.2213, + "loss": 0.2179, "step": 2830 }, { "epoch": 0.3872777017783858, - "grad_norm": 1.3189887548446178, + "grad_norm": 1.3023767866335338, "learning_rate": 6.733860888673382e-06, - "loss": 0.1779, + "loss": 0.1768, "step": 2831 }, { "epoch": 0.3874145006839945, - "grad_norm": 1.414216440638557, + "grad_norm": 1.4261307063325357, "learning_rate": 6.731845232779173e-06, - "loss": 0.2222, + "loss": 0.2221, "step": 2832 }, { "epoch": 0.38755129958960327, - "grad_norm": 1.3525564175441713, + "grad_norm": 1.331385600751386, "learning_rate": 6.7298292570145555e-06, - "loss": 0.2313, + "loss": 0.2301, "step": 2833 }, { "epoch": 0.387688098495212, - "grad_norm": 1.0812180044010207, + "grad_norm": 1.0619963987651742, "learning_rate": 6.727812961751881e-06, - "loss": 0.1859, + "loss": 0.1843, "step": 2834 }, { "epoch": 0.38782489740082077, - "grad_norm": 1.394803188049814, + "grad_norm": 1.360258064540061, "learning_rate": 6.725796347363555e-06, - "loss": 0.1909, + "loss": 0.1895, "step": 2835 }, { "epoch": 0.38796169630642957, - "grad_norm": 1.1185804401586446, + "grad_norm": 1.1878170144931786, "learning_rate": 6.723779414222045e-06, - "loss": 0.1757, + "loss": 0.1785, "step": 2836 }, { "epoch": 0.3880984952120383, - "grad_norm": 1.4754292133007172, + "grad_norm": 1.4207220281765456, "learning_rate": 6.72176216269988e-06, - "loss": 0.2291, + "loss": 0.2269, "step": 2837 }, { "epoch": 0.38823529411764707, - "grad_norm": 1.186533116621521, + "grad_norm": 1.18505428772185, "learning_rate": 6.719744593169642e-06, - "loss": 0.1635, + "loss": 0.1623, "step": 2838 }, { "epoch": 0.3883720930232558, - "grad_norm": 1.2088187725550175, + "grad_norm": 1.2018221800883282, "learning_rate": 6.7177267060039745e-06, - "loss": 0.1796, + "loss": 0.1811, "step": 2839 }, { "epoch": 0.38850889192886456, - "grad_norm": 1.6063755006831124, + "grad_norm": 1.5735474244928214, "learning_rate": 6.715708501575581e-06, - "loss": 0.277, + "loss": 0.2783, "step": 2840 }, { "epoch": 0.3886456908344733, - "grad_norm": 1.2283746339906378, + "grad_norm": 1.2085874505286964, "learning_rate": 6.7136899802572205e-06, - "loss": 0.2106, + "loss": 0.2097, "step": 2841 }, { "epoch": 0.38878248974008206, - "grad_norm": 1.3674536879225687, + "grad_norm": 1.375048877409534, "learning_rate": 6.711671142421714e-06, - "loss": 0.1894, + "loss": 0.1903, "step": 2842 }, { "epoch": 0.3889192886456908, - "grad_norm": 1.3362148600023487, + "grad_norm": 1.3270210328763647, "learning_rate": 6.709651988441939e-06, - "loss": 0.2165, + "loss": 0.2148, "step": 2843 }, { "epoch": 0.3890560875512996, - "grad_norm": 1.4269681034781638, + "grad_norm": 1.403279602878444, "learning_rate": 6.707632518690829e-06, - "loss": 0.2353, + "loss": 0.235, "step": 2844 }, { "epoch": 0.38919288645690836, - "grad_norm": 1.2540593164818818, + "grad_norm": 1.2380605332364976, "learning_rate": 6.70561273354138e-06, - "loss": 0.2285, + "loss": 0.2281, "step": 2845 }, { "epoch": 0.3893296853625171, - "grad_norm": 1.2186383319079002, + "grad_norm": 1.1838960123203082, "learning_rate": 6.703592633366647e-06, - "loss": 0.1766, + "loss": 0.1752, "step": 2846 }, { "epoch": 0.38946648426812586, - "grad_norm": 1.3402907343566703, + "grad_norm": 1.3300917579025273, "learning_rate": 6.701572218539739e-06, - "loss": 0.213, + "loss": 0.2098, "step": 2847 }, { "epoch": 0.3896032831737346, - "grad_norm": 1.1913946464517389, + "grad_norm": 1.185669843246001, "learning_rate": 6.699551489433824e-06, - "loss": 0.191, + "loss": 0.1901, "step": 2848 }, { "epoch": 0.38974008207934335, - "grad_norm": 1.1084602007606157, + "grad_norm": 1.1075115242267233, "learning_rate": 6.697530446422131e-06, - "loss": 0.178, + "loss": 0.1784, "step": 2849 }, { "epoch": 0.3898768809849521, - "grad_norm": 1.1799402954224933, + "grad_norm": 1.1823643414925182, "learning_rate": 6.695509089877943e-06, - "loss": 0.2102, + "loss": 0.2106, "step": 2850 }, { "epoch": 0.39001367989056085, - "grad_norm": 1.061595449411827, + "grad_norm": 1.0637823548278351, "learning_rate": 6.693487420174604e-06, - "loss": 0.1597, + "loss": 0.1593, "step": 2851 }, { "epoch": 0.39015047879616965, - "grad_norm": 1.2246165562007338, + "grad_norm": 1.2202704766822245, "learning_rate": 6.691465437685514e-06, - "loss": 0.185, + "loss": 0.1842, "step": 2852 }, { "epoch": 0.3902872777017784, - "grad_norm": 1.1994376738346808, + "grad_norm": 1.184299615189477, "learning_rate": 6.689443142784132e-06, - "loss": 0.1977, + "loss": 0.1987, "step": 2853 }, { "epoch": 0.39042407660738715, - "grad_norm": 1.2736628591685542, + "grad_norm": 1.260495241469219, "learning_rate": 6.6874205358439745e-06, - "loss": 0.1563, + "loss": 0.1571, "step": 2854 }, { "epoch": 0.3905608755129959, - "grad_norm": 1.0651225209627393, + "grad_norm": 1.0675166365782296, "learning_rate": 6.685397617238616e-06, - "loss": 0.1825, + "loss": 0.1834, "step": 2855 }, { "epoch": 0.39069767441860465, - "grad_norm": 1.4359509024098323, + "grad_norm": 1.4614700702241026, "learning_rate": 6.683374387341688e-06, - "loss": 0.2375, + "loss": 0.2372, "step": 2856 }, { "epoch": 0.3908344733242134, - "grad_norm": 1.4204996976714823, + "grad_norm": 1.423852480064297, "learning_rate": 6.681350846526877e-06, - "loss": 0.1966, + "loss": 0.1969, "step": 2857 }, { "epoch": 0.39097127222982214, - "grad_norm": 1.2291453562853374, + "grad_norm": 1.2035613819109021, "learning_rate": 6.679326995167931e-06, - "loss": 0.1822, + "loss": 0.1832, "step": 2858 }, { "epoch": 0.3911080711354309, - "grad_norm": 1.2360241131329992, + "grad_norm": 1.2226287584570399, "learning_rate": 6.677302833638653e-06, - "loss": 0.2092, + "loss": 0.2058, "step": 2859 }, { "epoch": 0.3912448700410397, - "grad_norm": 1.3951484879324978, + "grad_norm": 1.3869995260788082, "learning_rate": 6.6752783623129045e-06, - "loss": 0.2153, + "loss": 0.2155, "step": 2860 }, { "epoch": 0.39138166894664844, - "grad_norm": 1.2612358878029577, + "grad_norm": 1.2543961182395973, "learning_rate": 6.673253581564605e-06, - "loss": 0.2256, + "loss": 0.2251, "step": 2861 }, { "epoch": 0.3915184678522572, - "grad_norm": 1.2177437695398499, + "grad_norm": 1.1756811377420837, "learning_rate": 6.6712284917677285e-06, - "loss": 0.2028, + "loss": 0.1983, "step": 2862 }, { "epoch": 0.39165526675786594, - "grad_norm": 1.3138813997034302, + "grad_norm": 1.3068508406931194, "learning_rate": 6.669203093296307e-06, - "loss": 0.2241, + "loss": 0.224, "step": 2863 }, { "epoch": 0.3917920656634747, - "grad_norm": 1.3258729246396197, + "grad_norm": 1.3166153079307599, "learning_rate": 6.667177386524431e-06, - "loss": 0.2251, + "loss": 0.2238, "step": 2864 }, { "epoch": 0.39192886456908343, - "grad_norm": 1.394179325069993, + "grad_norm": 1.3909401349350674, "learning_rate": 6.665151371826246e-06, - "loss": 0.2423, + "loss": 0.241, "step": 2865 }, { "epoch": 0.3920656634746922, - "grad_norm": 1.212935792269944, + "grad_norm": 1.2340807703526304, "learning_rate": 6.663125049575956e-06, - "loss": 0.1953, + "loss": 0.194, "step": 2866 }, { "epoch": 0.39220246238030093, - "grad_norm": 1.3204279911171972, + "grad_norm": 1.3180373194690218, "learning_rate": 6.66109842014782e-06, - "loss": 0.199, + "loss": 0.2003, "step": 2867 }, { "epoch": 0.39233926128590974, - "grad_norm": 1.206470601722092, + "grad_norm": 1.1852888159312451, "learning_rate": 6.659071483916155e-06, - "loss": 0.19, + "loss": 0.1893, "step": 2868 }, { "epoch": 0.3924760601915185, - "grad_norm": 1.2010328784681021, + "grad_norm": 1.2064922015726192, "learning_rate": 6.657044241255337e-06, - "loss": 0.2128, + "loss": 0.2172, "step": 2869 }, { "epoch": 0.39261285909712723, - "grad_norm": 1.127636956549922, + "grad_norm": 1.1220499851075343, "learning_rate": 6.655016692539794e-06, - "loss": 0.1801, + "loss": 0.1816, "step": 2870 }, { "epoch": 0.392749658002736, - "grad_norm": 1.332083400590513, + "grad_norm": 1.3396819055935867, "learning_rate": 6.652988838144013e-06, - "loss": 0.1893, + "loss": 0.1898, "step": 2871 }, { "epoch": 0.39288645690834473, - "grad_norm": 1.1578887411322838, + "grad_norm": 1.1527001346492842, "learning_rate": 6.650960678442535e-06, - "loss": 0.1842, + "loss": 0.1796, "step": 2872 }, { "epoch": 0.3930232558139535, - "grad_norm": 1.13708529648217, + "grad_norm": 1.1220679293651619, "learning_rate": 6.648932213809962e-06, "loss": 0.1708, "step": 2873 }, { "epoch": 0.3931600547195622, - "grad_norm": 1.3918372305040976, + "grad_norm": 1.382620446765909, "learning_rate": 6.646903444620949e-06, - "loss": 0.2365, + "loss": 0.2351, "step": 2874 }, { "epoch": 0.393296853625171, - "grad_norm": 1.1917317288744573, + "grad_norm": 1.199445019291797, "learning_rate": 6.644874371250208e-06, - "loss": 0.1926, + "loss": 0.1936, "step": 2875 }, { "epoch": 0.3934336525307798, - "grad_norm": 1.2009322165322622, + "grad_norm": 1.193842740621373, "learning_rate": 6.642844994072506e-06, - "loss": 0.1964, + "loss": 0.197, "step": 2876 }, { "epoch": 0.3935704514363885, - "grad_norm": 1.3643375860365015, + "grad_norm": 1.3680640109107371, "learning_rate": 6.6408153134626705e-06, "loss": 0.2116, "step": 2877 }, { "epoch": 0.3937072503419973, - "grad_norm": 1.4219395147619829, + "grad_norm": 1.3805227143445848, "learning_rate": 6.638785329795579e-06, - "loss": 0.2332, + "loss": 0.2299, "step": 2878 }, { "epoch": 0.393844049247606, - "grad_norm": 1.091021786958043, + "grad_norm": 1.1080386449959605, "learning_rate": 6.636755043446169e-06, - "loss": 0.1833, + "loss": 0.1836, "step": 2879 }, { "epoch": 0.39398084815321477, - "grad_norm": 1.277967598212737, + "grad_norm": 1.278174574181825, "learning_rate": 6.634724454789433e-06, - "loss": 0.1824, + "loss": 0.1856, "step": 2880 }, { "epoch": 0.3941176470588235, - "grad_norm": 1.1614213415791466, + "grad_norm": 1.187919419589184, "learning_rate": 6.6326935642004165e-06, - "loss": 0.2175, + "loss": 0.221, "step": 2881 }, { "epoch": 0.39425444596443227, - "grad_norm": 1.0575907336147807, + "grad_norm": 1.0551129812619817, "learning_rate": 6.630662372054227e-06, - "loss": 0.1899, + "loss": 0.1905, "step": 2882 }, { "epoch": 0.394391244870041, - "grad_norm": 1.4865277466314295, + "grad_norm": 1.3763799299151458, "learning_rate": 6.628630878726023e-06, - "loss": 0.2064, + "loss": 0.1996, "step": 2883 }, { "epoch": 0.3945280437756498, - "grad_norm": 1.2237831906312377, + "grad_norm": 1.2082585726391686, "learning_rate": 6.626599084591016e-06, - "loss": 0.23, + "loss": 0.2275, "step": 2884 }, { "epoch": 0.39466484268125857, - "grad_norm": 1.3916671250395825, + "grad_norm": 1.3680429295150105, "learning_rate": 6.6245669900244835e-06, - "loss": 0.2159, + "loss": 0.218, "step": 2885 }, { "epoch": 0.3948016415868673, - "grad_norm": 1.2354927474198472, + "grad_norm": 1.2292352105524817, "learning_rate": 6.622534595401748e-06, - "loss": 0.2021, + "loss": 0.2029, "step": 2886 }, { "epoch": 0.39493844049247606, - "grad_norm": 1.5857082345302878, + "grad_norm": 1.5711751859123875, "learning_rate": 6.6205019010981885e-06, - "loss": 0.2328, + "loss": 0.2331, "step": 2887 }, { "epoch": 0.3950752393980848, - "grad_norm": 1.304310000014774, + "grad_norm": 1.2995144363613496, "learning_rate": 6.6184689074892464e-06, - "loss": 0.1779, + "loss": 0.1782, "step": 2888 }, { "epoch": 0.39521203830369356, - "grad_norm": 1.3807954007950172, + "grad_norm": 1.3543406120669668, "learning_rate": 6.616435614950411e-06, - "loss": 0.2117, + "loss": 0.2135, "step": 2889 }, { "epoch": 0.3953488372093023, - "grad_norm": 1.2268371233292916, + "grad_norm": 1.2019456895863712, "learning_rate": 6.614402023857231e-06, - "loss": 0.2135, + "loss": 0.2124, "step": 2890 }, { "epoch": 0.39548563611491105, - "grad_norm": 1.2793339025650774, + "grad_norm": 1.2567899388734651, "learning_rate": 6.612368134585311e-06, - "loss": 0.1728, + "loss": 0.1737, "step": 2891 }, { "epoch": 0.39562243502051986, - "grad_norm": 1.313413960817591, + "grad_norm": 1.2835304786172366, "learning_rate": 6.610333947510305e-06, - "loss": 0.21, + "loss": 0.2101, "step": 2892 }, { "epoch": 0.3957592339261286, - "grad_norm": 1.1858329640784762, + "grad_norm": 1.183234285357985, "learning_rate": 6.6082994630079276e-06, - "loss": 0.1838, + "loss": 0.1854, "step": 2893 }, { "epoch": 0.39589603283173735, - "grad_norm": 1.0926474546996088, + "grad_norm": 1.0699408652156017, "learning_rate": 6.606264681453947e-06, - "loss": 0.1835, + "loss": 0.1822, "step": 2894 }, { "epoch": 0.3960328317373461, - "grad_norm": 1.3330767286774732, + "grad_norm": 1.3118007710607802, "learning_rate": 6.6042296032241835e-06, - "loss": 0.1969, + "loss": 0.1936, "step": 2895 }, { "epoch": 0.39616963064295485, - "grad_norm": 1.2420783625619538, + "grad_norm": 1.2568811710766372, "learning_rate": 6.602194228694516e-06, - "loss": 0.1938, + "loss": 0.1952, "step": 2896 }, { "epoch": 0.3963064295485636, - "grad_norm": 1.2541154107790706, + "grad_norm": 1.2155437072637607, "learning_rate": 6.600158558240878e-06, - "loss": 0.2032, + "loss": 0.1988, "step": 2897 }, { "epoch": 0.39644322845417235, - "grad_norm": 1.287340217148119, + "grad_norm": 1.271167662504114, "learning_rate": 6.598122592239255e-06, - "loss": 0.2038, + "loss": 0.2027, "step": 2898 }, { "epoch": 0.3965800273597811, - "grad_norm": 1.4394346950818389, + "grad_norm": 1.4251140606572708, "learning_rate": 6.596086331065686e-06, - "loss": 0.2256, + "loss": 0.2254, "step": 2899 }, { "epoch": 0.3967168262653899, - "grad_norm": 1.4085839423221427, + "grad_norm": 1.430079825197043, "learning_rate": 6.594049775096268e-06, - "loss": 0.2057, + "loss": 0.206, "step": 2900 }, { "epoch": 0.3967168262653899, - "eval_loss": 0.19571372866630554, - "eval_runtime": 5.9202, - "eval_samples_per_second": 5.067, - "eval_steps_per_second": 1.351, + "eval_loss": 0.19530761241912842, + "eval_runtime": 5.9386, + "eval_samples_per_second": 5.052, + "eval_steps_per_second": 1.347, "step": 2900 }, { "epoch": 0.39685362517099865, - "grad_norm": 1.1436435998226975, + "grad_norm": 1.1188909405719185, "learning_rate": 6.592012924707153e-06, - "loss": 0.1834, + "loss": 0.1809, "step": 2901 }, { "epoch": 0.3969904240766074, - "grad_norm": 1.4186210885025, + "grad_norm": 1.4157912911465114, "learning_rate": 6.589975780274545e-06, - "loss": 0.1666, + "loss": 0.17, "step": 2902 }, { "epoch": 0.39712722298221614, - "grad_norm": 1.4693599696312944, + "grad_norm": 1.5434194902511995, "learning_rate": 6.5879383421747e-06, - "loss": 0.2095, + "loss": 0.2118, "step": 2903 }, { "epoch": 0.3972640218878249, - "grad_norm": 1.2768409142707113, + "grad_norm": 1.246299145542028, "learning_rate": 6.585900610783936e-06, - "loss": 0.2045, + "loss": 0.2024, "step": 2904 }, { "epoch": 0.39740082079343364, - "grad_norm": 1.2819383454335533, + "grad_norm": 1.2605301297769804, "learning_rate": 6.583862586478618e-06, - "loss": 0.1957, + "loss": 0.1955, "step": 2905 }, { "epoch": 0.3975376196990424, - "grad_norm": 1.4940093670128172, + "grad_norm": 1.4605629202256005, "learning_rate": 6.581824269635166e-06, - "loss": 0.2243, + "loss": 0.2218, "step": 2906 }, { "epoch": 0.39767441860465114, - "grad_norm": 1.2605008828441682, + "grad_norm": 1.2433415807617447, "learning_rate": 6.579785660630057e-06, - "loss": 0.1925, + "loss": 0.1924, "step": 2907 }, { "epoch": 0.39781121751025994, - "grad_norm": 1.246885251859826, + "grad_norm": 1.2283423396564042, "learning_rate": 6.57774675983982e-06, - "loss": 0.1807, + "loss": 0.1795, "step": 2908 }, { "epoch": 0.3979480164158687, - "grad_norm": 1.2540004350048042, + "grad_norm": 1.2567965655932642, "learning_rate": 6.575707567641038e-06, - "loss": 0.1611, + "loss": 0.1614, "step": 2909 }, { "epoch": 0.39808481532147744, - "grad_norm": 1.0771647550929504, + "grad_norm": 1.0772672429272823, "learning_rate": 6.57366808441035e-06, - "loss": 0.1805, + "loss": 0.181, "step": 2910 }, { "epoch": 0.3982216142270862, - "grad_norm": 1.103479055527922, + "grad_norm": 1.135065944994057, "learning_rate": 6.571628310524445e-06, - "loss": 0.1817, + "loss": 0.188, "step": 2911 }, { "epoch": 0.39835841313269493, - "grad_norm": 1.1192180966722793, + "grad_norm": 1.122429328535754, "learning_rate": 6.569588246360068e-06, - "loss": 0.1727, + "loss": 0.174, "step": 2912 }, { "epoch": 0.3984952120383037, - "grad_norm": 1.1337629520498431, + "grad_norm": 1.1237466382494827, "learning_rate": 6.567547892294017e-06, - "loss": 0.1865, + "loss": 0.187, "step": 2913 }, { "epoch": 0.39863201094391243, - "grad_norm": 1.281169414772251, + "grad_norm": 1.2569911803091063, "learning_rate": 6.565507248703144e-06, - "loss": 0.1817, + "loss": 0.1795, "step": 2914 }, { "epoch": 0.3987688098495212, - "grad_norm": 1.332712139685438, + "grad_norm": 1.3191155508076726, "learning_rate": 6.563466315964355e-06, - "loss": 0.1948, + "loss": 0.1945, "step": 2915 }, { "epoch": 0.39890560875513, - "grad_norm": 1.4086359277317098, + "grad_norm": 1.404109581110005, "learning_rate": 6.561425094454608e-06, - "loss": 0.204, + "loss": 0.2047, "step": 2916 }, { "epoch": 0.39904240766073873, - "grad_norm": 1.1870546466366232, + "grad_norm": 1.1618719806227173, "learning_rate": 6.559383584550914e-06, - "loss": 0.1487, + "loss": 0.1521, "step": 2917 }, { "epoch": 0.3991792065663475, - "grad_norm": 1.277051167359252, + "grad_norm": 1.2822204342284147, "learning_rate": 6.557341786630339e-06, - "loss": 0.213, + "loss": 0.2148, "step": 2918 }, { "epoch": 0.3993160054719562, - "grad_norm": 1.4422795216525406, + "grad_norm": 1.3858081761047327, "learning_rate": 6.555299701070002e-06, - "loss": 0.2339, + "loss": 0.2304, "step": 2919 }, { "epoch": 0.399452804377565, - "grad_norm": 1.3363530578587626, + "grad_norm": 1.319416145642371, "learning_rate": 6.553257328247073e-06, - "loss": 0.1848, + "loss": 0.1852, "step": 2920 }, { "epoch": 0.3995896032831737, - "grad_norm": 1.3993371768828586, + "grad_norm": 1.3736283797505666, "learning_rate": 6.55121466853878e-06, "loss": 0.2288, "step": 2921 }, { "epoch": 0.39972640218878247, - "grad_norm": 1.4926099333940719, + "grad_norm": 1.448233183039607, "learning_rate": 6.5491717223223964e-06, - "loss": 0.2244, + "loss": 0.2249, "step": 2922 }, { "epoch": 0.3998632010943912, - "grad_norm": 1.3425417588497142, + "grad_norm": 1.3286395731308744, "learning_rate": 6.547128489975255e-06, - "loss": 0.2083, + "loss": 0.2069, "step": 2923 }, { "epoch": 0.4, - "grad_norm": 1.0561423336804867, + "grad_norm": 1.0445793415674167, "learning_rate": 6.545084971874738e-06, - "loss": 0.1915, + "loss": 0.1923, "step": 2924 }, { "epoch": 0.40013679890560877, - "grad_norm": 0.988394012086382, + "grad_norm": 0.9837103686768615, "learning_rate": 6.5430411683982835e-06, - "loss": 0.1727, + "loss": 0.1745, "step": 2925 }, { "epoch": 0.4002735978112175, - "grad_norm": 1.4515889397993464, + "grad_norm": 1.429001406401407, "learning_rate": 6.540997079923377e-06, "loss": 0.2357, "step": 2926 }, { "epoch": 0.40041039671682627, - "grad_norm": 1.2501967647314578, + "grad_norm": 1.2284051145722898, "learning_rate": 6.538952706827563e-06, - "loss": 0.1605, + "loss": 0.1606, "step": 2927 }, { "epoch": 0.400547195622435, - "grad_norm": 1.3496104347026348, + "grad_norm": 1.312633303921706, "learning_rate": 6.5369080494884355e-06, - "loss": 0.1997, + "loss": 0.1992, "step": 2928 }, { "epoch": 0.40068399452804376, - "grad_norm": 1.182630299693359, + "grad_norm": 1.1742065910973782, "learning_rate": 6.534863108283639e-06, - "loss": 0.1705, + "loss": 0.1714, "step": 2929 }, { "epoch": 0.4008207934336525, - "grad_norm": 1.2745684503743153, + "grad_norm": 1.26613690503334, "learning_rate": 6.532817883590874e-06, - "loss": 0.1976, + "loss": 0.1989, "step": 2930 }, { "epoch": 0.40095759233926126, - "grad_norm": 1.0164058019757556, + "grad_norm": 0.9889509858412627, "learning_rate": 6.530772375787892e-06, - "loss": 0.1715, + "loss": 0.1692, "step": 2931 }, { "epoch": 0.40109439124487006, - "grad_norm": 1.369249969444141, + "grad_norm": 1.3461842341024524, "learning_rate": 6.528726585252494e-06, - "loss": 0.2287, + "loss": 0.232, "step": 2932 }, { "epoch": 0.4012311901504788, - "grad_norm": 1.1501618551257804, + "grad_norm": 1.1356758628586283, "learning_rate": 6.526680512362539e-06, - "loss": 0.1971, + "loss": 0.1974, "step": 2933 }, { "epoch": 0.40136798905608756, - "grad_norm": 1.4312461083182664, + "grad_norm": 1.4044082335917656, "learning_rate": 6.524634157495935e-06, - "loss": 0.2021, + "loss": 0.2032, "step": 2934 }, { "epoch": 0.4015047879616963, - "grad_norm": 1.1563696181065746, + "grad_norm": 1.1815090381627407, "learning_rate": 6.52258752103064e-06, - "loss": 0.1841, + "loss": 0.1884, "step": 2935 }, { "epoch": 0.40164158686730506, - "grad_norm": 1.405009270319102, + "grad_norm": 1.8064716308832196, "learning_rate": 6.5205406033446675e-06, - "loss": 0.1883, + "loss": 0.192, "step": 2936 }, { "epoch": 0.4017783857729138, - "grad_norm": 1.2753603805284977, + "grad_norm": 1.2311267890043014, "learning_rate": 6.518493404816082e-06, - "loss": 0.2005, + "loss": 0.1982, "step": 2937 }, { "epoch": 0.40191518467852255, - "grad_norm": 1.2889813527900125, + "grad_norm": 1.2770068586759875, "learning_rate": 6.516445925822997e-06, - "loss": 0.2128, + "loss": 0.2114, "step": 2938 }, { "epoch": 0.4020519835841313, - "grad_norm": 1.2573807959351477, + "grad_norm": 1.217173796058278, "learning_rate": 6.514398166743581e-06, - "loss": 0.1826, + "loss": 0.1825, "step": 2939 }, { "epoch": 0.4021887824897401, - "grad_norm": 1.6326034441269224, + "grad_norm": 1.5834040140507897, "learning_rate": 6.512350127956055e-06, - "loss": 0.2426, + "loss": 0.2431, "step": 2940 }, { "epoch": 0.40232558139534885, - "grad_norm": 1.1073319796577672, + "grad_norm": 1.0811184004946695, "learning_rate": 6.510301809838689e-06, - "loss": 0.2063, + "loss": 0.2066, "step": 2941 }, { "epoch": 0.4024623803009576, - "grad_norm": 1.0703442603840319, + "grad_norm": 1.056338875661878, "learning_rate": 6.508253212769807e-06, - "loss": 0.1624, + "loss": 0.1619, "step": 2942 }, { "epoch": 0.40259917920656635, - "grad_norm": 1.2663748203185603, + "grad_norm": 1.2515890413564068, "learning_rate": 6.506204337127783e-06, - "loss": 0.2233, + "loss": 0.2242, "step": 2943 }, { "epoch": 0.4027359781121751, - "grad_norm": 1.2870538170886363, + "grad_norm": 1.2751886811434063, "learning_rate": 6.504155183291041e-06, - "loss": 0.1822, + "loss": 0.186, "step": 2944 }, { "epoch": 0.40287277701778385, - "grad_norm": 1.1484864300383806, + "grad_norm": 1.1279402679810766, "learning_rate": 6.502105751638058e-06, - "loss": 0.2063, + "loss": 0.2056, "step": 2945 }, { "epoch": 0.4030095759233926, - "grad_norm": 1.2339683693463128, + "grad_norm": 1.2094946783539529, "learning_rate": 6.500056042547365e-06, - "loss": 0.1902, + "loss": 0.1881, "step": 2946 }, { "epoch": 0.40314637482900134, - "grad_norm": 1.16983506203796, + "grad_norm": 1.1567093887699706, "learning_rate": 6.4980060563975365e-06, - "loss": 0.1941, + "loss": 0.1938, "step": 2947 }, { "epoch": 0.40328317373461015, - "grad_norm": 1.1617237536217442, + "grad_norm": 1.157138425846471, "learning_rate": 6.495955793567208e-06, - "loss": 0.1645, + "loss": 0.1666, "step": 2948 }, { "epoch": 0.4034199726402189, - "grad_norm": 1.426639720013809, + "grad_norm": 1.4096960719264455, "learning_rate": 6.493905254435061e-06, - "loss": 0.2166, + "loss": 0.2182, "step": 2949 }, { "epoch": 0.40355677154582764, - "grad_norm": 0.9623952659348054, + "grad_norm": 0.9483648640236889, "learning_rate": 6.491854439379827e-06, - "loss": 0.1557, + "loss": 0.1566, "step": 2950 }, { "epoch": 0.4036935704514364, - "grad_norm": 1.0764017820113245, + "grad_norm": 1.0533644321874145, "learning_rate": 6.48980334878029e-06, - "loss": 0.1759, + "loss": 0.1734, "step": 2951 }, { "epoch": 0.40383036935704514, - "grad_norm": 1.280764795074373, + "grad_norm": 1.2708763719366811, "learning_rate": 6.487751983015287e-06, - "loss": 0.2034, + "loss": 0.2031, "step": 2952 }, { "epoch": 0.4039671682626539, - "grad_norm": 1.3728975516094473, + "grad_norm": 1.3102142822703464, "learning_rate": 6.4857003424637e-06, - "loss": 0.2547, + "loss": 0.2498, "step": 2953 }, { "epoch": 0.40410396716826263, - "grad_norm": 1.3328713065988675, + "grad_norm": 1.3367702451195411, "learning_rate": 6.483648427504466e-06, - "loss": 0.176, + "loss": 0.1766, "step": 2954 }, { "epoch": 0.4042407660738714, - "grad_norm": 1.4809834929388788, + "grad_norm": 1.4984337093121853, "learning_rate": 6.481596238516574e-06, - "loss": 0.2328, + "loss": 0.2362, "step": 2955 }, { "epoch": 0.4043775649794802, - "grad_norm": 1.417028155021493, + "grad_norm": 1.3829983566786073, "learning_rate": 6.479543775879061e-06, - "loss": 0.2192, + "loss": 0.2181, "step": 2956 }, { "epoch": 0.40451436388508893, - "grad_norm": 1.2486335046857961, + "grad_norm": 1.2249599700195943, "learning_rate": 6.4774910399710145e-06, - "loss": 0.1852, + "loss": 0.1826, "step": 2957 }, { "epoch": 0.4046511627906977, - "grad_norm": 1.2812769456675661, + "grad_norm": 1.257283577715599, "learning_rate": 6.475438031171574e-06, - "loss": 0.1789, + "loss": 0.1793, "step": 2958 }, { "epoch": 0.40478796169630643, - "grad_norm": 1.0926209250552386, + "grad_norm": 1.0828906094315078, "learning_rate": 6.4733847498599275e-06, - "loss": 0.1878, + "loss": 0.1877, "step": 2959 }, { "epoch": 0.4049247606019152, - "grad_norm": 1.0790480485403262, + "grad_norm": 1.0666884477022764, "learning_rate": 6.471331196415316e-06, "loss": 0.159, "step": 2960 }, { "epoch": 0.4050615595075239, - "grad_norm": 1.6825796045046928, + "grad_norm": 1.4613886992562901, "learning_rate": 6.469277371217026e-06, - "loss": 0.2158, + "loss": 0.2152, "step": 2961 }, { "epoch": 0.4051983584131327, - "grad_norm": 1.082914495690497, + "grad_norm": 1.0961087391258433, "learning_rate": 6.467223274644401e-06, - "loss": 0.1656, + "loss": 0.1675, "step": 2962 }, { "epoch": 0.4053351573187414, - "grad_norm": 1.2113121328111844, + "grad_norm": 1.2210168779554575, "learning_rate": 6.465168907076829e-06, - "loss": 0.1611, + "loss": 0.1607, "step": 2963 }, { "epoch": 0.4054719562243502, - "grad_norm": 1.2699820371302122, + "grad_norm": 1.2463956702515402, "learning_rate": 6.463114268893749e-06, - "loss": 0.1808, + "loss": 0.1796, "step": 2964 }, { "epoch": 0.405608755129959, - "grad_norm": 1.2624530934553544, + "grad_norm": 1.2456949997428766, "learning_rate": 6.461059360474654e-06, - "loss": 0.1824, + "loss": 0.1837, "step": 2965 }, { "epoch": 0.4057455540355677, - "grad_norm": 1.4118334560198047, + "grad_norm": 1.3733495945586538, "learning_rate": 6.459004182199082e-06, - "loss": 0.2029, + "loss": 0.2, "step": 2966 }, { "epoch": 0.40588235294117647, - "grad_norm": 1.2445407650921125, + "grad_norm": 1.2284534964392424, "learning_rate": 6.456948734446624e-06, - "loss": 0.1748, + "loss": 0.1759, "step": 2967 }, { "epoch": 0.4060191518467852, - "grad_norm": 1.287776043799077, + "grad_norm": 1.2514137206968479, "learning_rate": 6.454893017596918e-06, - "loss": 0.1931, + "loss": 0.1918, "step": 2968 }, { "epoch": 0.40615595075239397, - "grad_norm": 1.0525547134155604, + "grad_norm": 1.0405202897138315, "learning_rate": 6.452837032029653e-06, - "loss": 0.158, + "loss": 0.1584, "step": 2969 }, { "epoch": 0.4062927496580027, - "grad_norm": 1.260179273720075, + "grad_norm": 1.2961486581440584, "learning_rate": 6.45078077812457e-06, - "loss": 0.2087, + "loss": 0.2094, "step": 2970 }, { "epoch": 0.40642954856361146, - "grad_norm": 1.2853740974601928, + "grad_norm": 1.2856215907053727, "learning_rate": 6.448724256261456e-06, - "loss": 0.1999, + "loss": 0.2024, "step": 2971 }, { "epoch": 0.40656634746922027, - "grad_norm": 1.2761462195853588, + "grad_norm": 1.2394407120354929, "learning_rate": 6.446667466820148e-06, - "loss": 0.2255, + "loss": 0.2256, "step": 2972 }, { "epoch": 0.406703146374829, - "grad_norm": 1.0736947200644287, + "grad_norm": 1.0646996915976432, "learning_rate": 6.444610410180535e-06, - "loss": 0.1461, + "loss": 0.1459, "step": 2973 }, { "epoch": 0.40683994528043776, - "grad_norm": 1.503757079847265, + "grad_norm": 1.4833055204535581, "learning_rate": 6.442553086722554e-06, - "loss": 0.215, + "loss": 0.2144, "step": 2974 }, { "epoch": 0.4069767441860465, - "grad_norm": 1.2432106226126347, + "grad_norm": 1.2380381088851977, "learning_rate": 6.440495496826189e-06, - "loss": 0.1775, + "loss": 0.1781, "step": 2975 }, { "epoch": 0.40711354309165526, - "grad_norm": 1.3569547095460655, + "grad_norm": 1.3416593694214158, "learning_rate": 6.438437640871475e-06, - "loss": 0.1861, + "loss": 0.1852, "step": 2976 }, { "epoch": 0.407250341997264, - "grad_norm": 1.099079443033769, + "grad_norm": 1.0850807099393034, "learning_rate": 6.436379519238501e-06, - "loss": 0.1798, + "loss": 0.1794, "step": 2977 }, { "epoch": 0.40738714090287276, - "grad_norm": 1.494661681621546, + "grad_norm": 1.434877257813134, "learning_rate": 6.434321132307394e-06, - "loss": 0.228, + "loss": 0.2289, "step": 2978 }, { "epoch": 0.4075239398084815, - "grad_norm": 1.3559406540617946, + "grad_norm": 1.3034059994505534, "learning_rate": 6.432262480458341e-06, - "loss": 0.1961, + "loss": 0.1977, "step": 2979 }, { "epoch": 0.4076607387140903, - "grad_norm": 1.4162642607115246, + "grad_norm": 1.3969797273424622, "learning_rate": 6.43020356407157e-06, - "loss": 0.178, + "loss": 0.1776, "step": 2980 }, { "epoch": 0.40779753761969906, - "grad_norm": 1.387098744805585, + "grad_norm": 1.3785428502924084, "learning_rate": 6.428144383527364e-06, - "loss": 0.2402, + "loss": 0.2399, "step": 2981 }, { "epoch": 0.4079343365253078, - "grad_norm": 1.4916843409382157, + "grad_norm": 1.4809794927211828, "learning_rate": 6.426084939206051e-06, - "loss": 0.2536, + "loss": 0.2522, "step": 2982 }, { "epoch": 0.40807113543091655, - "grad_norm": 1.388352954833823, + "grad_norm": 1.3768355742273284, "learning_rate": 6.424025231488009e-06, - "loss": 0.1959, + "loss": 0.1949, "step": 2983 }, { "epoch": 0.4082079343365253, - "grad_norm": 1.1416087038184417, + "grad_norm": 1.127832595054998, "learning_rate": 6.421965260753662e-06, - "loss": 0.1663, + "loss": 0.1672, "step": 2984 }, { "epoch": 0.40834473324213405, - "grad_norm": 1.4368928309379654, + "grad_norm": 1.4089349762934205, "learning_rate": 6.419905027383488e-06, - "loss": 0.1889, + "loss": 0.1872, "step": 2985 }, { "epoch": 0.4084815321477428, - "grad_norm": 1.145504314434264, + "grad_norm": 1.134528850430615, "learning_rate": 6.417844531758009e-06, - "loss": 0.1758, + "loss": 0.1749, "step": 2986 }, { "epoch": 0.40861833105335155, - "grad_norm": 1.0866034033302623, + "grad_norm": 1.0910991399503838, "learning_rate": 6.415783774257798e-06, "loss": 0.1901, "step": 2987 }, { "epoch": 0.40875512995896035, - "grad_norm": 1.1005427529946743, + "grad_norm": 1.0790863309310856, "learning_rate": 6.413722755263473e-06, - "loss": 0.1574, + "loss": 0.1551, "step": 2988 }, { "epoch": 0.4088919288645691, - "grad_norm": 1.3427779235464348, + "grad_norm": 1.56290017446593, "learning_rate": 6.411661475155705e-06, - "loss": 0.2098, + "loss": 0.2061, "step": 2989 }, { "epoch": 0.40902872777017785, - "grad_norm": 1.081019710303934, + "grad_norm": 1.0876691512989687, "learning_rate": 6.409599934315209e-06, - "loss": 0.1672, + "loss": 0.1684, "step": 2990 }, { "epoch": 0.4091655266757866, - "grad_norm": 1.0916736130411528, + "grad_norm": 1.0679268622961442, "learning_rate": 6.4075381331227505e-06, - "loss": 0.1537, + "loss": 0.1515, "step": 2991 }, { "epoch": 0.40930232558139534, - "grad_norm": 1.4484007906381116, + "grad_norm": 1.4238782311555522, "learning_rate": 6.405476071959142e-06, - "loss": 0.1816, + "loss": 0.1825, "step": 2992 }, { "epoch": 0.4094391244870041, - "grad_norm": 1.6149793114425863, + "grad_norm": 1.5394945501735147, "learning_rate": 6.403413751205246e-06, - "loss": 0.2569, + "loss": 0.251, "step": 2993 }, { "epoch": 0.40957592339261284, - "grad_norm": 1.327543459379374, + "grad_norm": 1.1449561005780442, "learning_rate": 6.4013511712419705e-06, - "loss": 0.1781, + "loss": 0.175, "step": 2994 }, { "epoch": 0.4097127222982216, - "grad_norm": 1.4383322072089224, + "grad_norm": 1.4354800636543024, "learning_rate": 6.399288332450273e-06, - "loss": 0.2132, + "loss": 0.2148, "step": 2995 }, { "epoch": 0.4098495212038304, - "grad_norm": 1.158947174599692, + "grad_norm": 1.167208624566103, "learning_rate": 6.397225235211155e-06, - "loss": 0.1805, + "loss": 0.1835, "step": 2996 }, { "epoch": 0.40998632010943914, - "grad_norm": 0.9172950925779746, + "grad_norm": 0.9100535270774854, "learning_rate": 6.3951618799056735e-06, - "loss": 0.1577, + "loss": 0.1564, "step": 2997 }, { "epoch": 0.4101231190150479, - "grad_norm": 1.0116480495045734, + "grad_norm": 0.994634977804595, "learning_rate": 6.393098266914925e-06, - "loss": 0.1492, + "loss": 0.1501, "step": 2998 }, { "epoch": 0.41025991792065664, - "grad_norm": 1.2835224018875817, + "grad_norm": 1.2827139987089362, "learning_rate": 6.39103439662006e-06, - "loss": 0.1922, + "loss": 0.1954, "step": 2999 }, { "epoch": 0.4103967168262654, - "grad_norm": 1.6030436366620988, + "grad_norm": 1.5869988573576772, "learning_rate": 6.388970269402273e-06, - "loss": 0.2125, + "loss": 0.2133, "step": 3000 }, { "epoch": 0.4103967168262654, - "eval_loss": 0.1935969889163971, - "eval_runtime": 5.9181, - "eval_samples_per_second": 5.069, - "eval_steps_per_second": 1.352, + "eval_loss": 0.19245347380638123, + "eval_runtime": 5.9271, + "eval_samples_per_second": 5.062, + "eval_steps_per_second": 1.35, "step": 3000 }, { "epoch": 0.41053351573187413, - "grad_norm": 1.2631897432226455, + "grad_norm": 1.261991578641819, "learning_rate": 6.386905885642805e-06, - "loss": 0.1962, + "loss": 0.1965, "step": 3001 }, { "epoch": 0.4106703146374829, - "grad_norm": 1.421937095262969, + "grad_norm": 1.404016185688816, "learning_rate": 6.384841245722946e-06, - "loss": 0.2111, + "loss": 0.2126, "step": 3002 }, { "epoch": 0.41080711354309163, - "grad_norm": 1.1782162477600462, + "grad_norm": 1.1867576580303825, "learning_rate": 6.382776350024035e-06, - "loss": 0.2082, + "loss": 0.208, "step": 3003 }, { "epoch": 0.41094391244870043, - "grad_norm": 1.1004624976475523, + "grad_norm": 1.0905606801177576, "learning_rate": 6.380711198927455e-06, - "loss": 0.1693, + "loss": 0.1687, "step": 3004 }, { "epoch": 0.4110807113543092, - "grad_norm": 1.2598231792813595, + "grad_norm": 1.1829955569689738, "learning_rate": 6.378645792814639e-06, - "loss": 0.1951, + "loss": 0.1919, "step": 3005 }, { "epoch": 0.41121751025991793, - "grad_norm": 1.1811604948817653, + "grad_norm": 1.1633657262143584, "learning_rate": 6.376580132067065e-06, - "loss": 0.1604, + "loss": 0.1582, "step": 3006 }, { "epoch": 0.4113543091655267, - "grad_norm": 1.3422691696652673, + "grad_norm": 1.3351677041121894, "learning_rate": 6.374514217066259e-06, - "loss": 0.1635, + "loss": 0.1644, "step": 3007 }, { "epoch": 0.4114911080711354, - "grad_norm": 1.067412756621248, + "grad_norm": 1.0634882692347591, "learning_rate": 6.372448048193795e-06, "loss": 0.2095, "step": 3008 }, { "epoch": 0.4116279069767442, - "grad_norm": 1.5725294315316212, + "grad_norm": 1.551359118612189, "learning_rate": 6.370381625831292e-06, - "loss": 0.2144, + "loss": 0.2155, "step": 3009 }, { "epoch": 0.4117647058823529, - "grad_norm": 1.3792975487041215, + "grad_norm": 1.3697787243069226, "learning_rate": 6.368314950360416e-06, - "loss": 0.2111, + "loss": 0.2118, "step": 3010 }, { "epoch": 0.41190150478796167, - "grad_norm": 0.9884474579569749, + "grad_norm": 0.997968378998592, "learning_rate": 6.366248022162879e-06, - "loss": 0.1411, + "loss": 0.1401, "step": 3011 }, { "epoch": 0.4120383036935705, - "grad_norm": 1.3450676441152414, + "grad_norm": 1.3297879568155264, "learning_rate": 6.364180841620444e-06, - "loss": 0.2062, + "loss": 0.2052, "step": 3012 }, { "epoch": 0.4121751025991792, - "grad_norm": 1.462900501133529, + "grad_norm": 1.468525229754564, "learning_rate": 6.362113409114916e-06, - "loss": 0.2272, + "loss": 0.2304, "step": 3013 }, { "epoch": 0.41231190150478797, - "grad_norm": 1.1459196221821515, + "grad_norm": 1.138660974474528, "learning_rate": 6.360045725028147e-06, - "loss": 0.1893, + "loss": 0.1887, "step": 3014 }, { "epoch": 0.4124487004103967, - "grad_norm": 1.2058251219279783, + "grad_norm": 1.1965488102328605, "learning_rate": 6.357977789742038e-06, - "loss": 0.1988, + "loss": 0.2, "step": 3015 }, { "epoch": 0.41258549931600547, - "grad_norm": 1.3453904363939777, + "grad_norm": 1.3229891258804591, "learning_rate": 6.3559096036385356e-06, - "loss": 0.2183, + "loss": 0.2177, "step": 3016 }, { "epoch": 0.4127222982216142, - "grad_norm": 1.403088155899452, + "grad_norm": 1.5642065519838988, "learning_rate": 6.3538411670996305e-06, - "loss": 0.2316, + "loss": 0.2313, "step": 3017 }, { "epoch": 0.41285909712722296, - "grad_norm": 1.2567527293403622, + "grad_norm": 1.2474872022928671, "learning_rate": 6.3517724805073634e-06, - "loss": 0.1688, + "loss": 0.1705, "step": 3018 }, { "epoch": 0.4129958960328317, - "grad_norm": 1.3749217730217684, + "grad_norm": 1.3912022140036715, "learning_rate": 6.3497035442438156e-06, - "loss": 0.2293, + "loss": 0.2317, "step": 3019 }, { "epoch": 0.4131326949384405, - "grad_norm": 1.0905734698350362, + "grad_norm": 1.0834508125806424, "learning_rate": 6.347634358691121e-06, - "loss": 0.1779, + "loss": 0.1761, "step": 3020 }, { "epoch": 0.41326949384404926, - "grad_norm": 1.350009448132829, + "grad_norm": 1.3358812976723091, "learning_rate": 6.3455649242314535e-06, - "loss": 0.1939, + "loss": 0.1925, "step": 3021 }, { "epoch": 0.413406292749658, - "grad_norm": 1.4216276757957182, + "grad_norm": 1.396289133157578, "learning_rate": 6.34349524124704e-06, - "loss": 0.2244, + "loss": 0.2218, "step": 3022 }, { "epoch": 0.41354309165526676, - "grad_norm": 1.2523362761272134, + "grad_norm": 1.2386962089978242, "learning_rate": 6.341425310120146e-06, - "loss": 0.2075, + "loss": 0.2051, "step": 3023 }, { "epoch": 0.4136798905608755, - "grad_norm": 1.285245805085241, + "grad_norm": 1.253144861149669, "learning_rate": 6.339355131233089e-06, - "loss": 0.1611, + "loss": 0.1598, "step": 3024 }, { "epoch": 0.41381668946648426, - "grad_norm": 1.4801567376783542, + "grad_norm": 1.465258810857528, "learning_rate": 6.337284704968226e-06, - "loss": 0.2207, + "loss": 0.2206, "step": 3025 }, { "epoch": 0.413953488372093, - "grad_norm": 1.1900289888753128, + "grad_norm": 1.1718001139459353, "learning_rate": 6.335214031707966e-06, - "loss": 0.2101, + "loss": 0.2093, "step": 3026 }, { "epoch": 0.41409028727770175, - "grad_norm": 1.1212347525109325, + "grad_norm": 1.0926179284985038, "learning_rate": 6.333143111834757e-06, - "loss": 0.1834, + "loss": 0.1816, "step": 3027 }, { "epoch": 0.41422708618331056, - "grad_norm": 1.3342091631399269, + "grad_norm": 1.3230676519593068, "learning_rate": 6.3310719457311e-06, - "loss": 0.1982, + "loss": 0.1947, "step": 3028 }, { "epoch": 0.4143638850889193, - "grad_norm": 0.9916137612911221, + "grad_norm": 0.9910697110088089, "learning_rate": 6.3290005337795354e-06, - "loss": 0.1594, + "loss": 0.1595, "step": 3029 }, { "epoch": 0.41450068399452805, - "grad_norm": 1.2995139882779039, + "grad_norm": 1.282164507148486, "learning_rate": 6.3269288763626526e-06, - "loss": 0.1972, + "loss": 0.1955, "step": 3030 }, { "epoch": 0.4146374829001368, - "grad_norm": 1.2540788813984933, + "grad_norm": 1.2316802503853697, "learning_rate": 6.324856973863085e-06, - "loss": 0.1947, + "loss": 0.1916, "step": 3031 }, { "epoch": 0.41477428180574555, - "grad_norm": 1.128277884154219, + "grad_norm": 1.1409958264731712, "learning_rate": 6.322784826663512e-06, - "loss": 0.1886, + "loss": 0.1879, "step": 3032 }, { "epoch": 0.4149110807113543, - "grad_norm": 1.0858451723620706, + "grad_norm": 1.0627321468233915, "learning_rate": 6.320712435146654e-06, - "loss": 0.1589, + "loss": 0.1578, "step": 3033 }, { "epoch": 0.41504787961696304, - "grad_norm": 1.0823592603042442, + "grad_norm": 1.049506608001051, "learning_rate": 6.3186397996952845e-06, - "loss": 0.167, + "loss": 0.1663, "step": 3034 }, { "epoch": 0.4151846785225718, - "grad_norm": 1.4090713011680362, + "grad_norm": 1.4205716977199003, "learning_rate": 6.316566920692213e-06, - "loss": 0.1937, + "loss": 0.1955, "step": 3035 }, { "epoch": 0.4153214774281806, - "grad_norm": 1.2164725183320666, + "grad_norm": 1.2110178323337142, "learning_rate": 6.314493798520303e-06, - "loss": 0.1767, + "loss": 0.1779, "step": 3036 }, { "epoch": 0.41545827633378934, - "grad_norm": 1.1462088765176226, + "grad_norm": 1.125581402145334, "learning_rate": 6.312420433562455e-06, - "loss": 0.2042, + "loss": 0.2032, "step": 3037 }, { "epoch": 0.4155950752393981, - "grad_norm": 1.1695382532399663, + "grad_norm": 1.157402145596388, "learning_rate": 6.310346826201621e-06, - "loss": 0.1924, + "loss": 0.19, "step": 3038 }, { "epoch": 0.41573187414500684, - "grad_norm": 1.138360516982514, + "grad_norm": 1.1146962490794525, "learning_rate": 6.308272976820793e-06, - "loss": 0.1726, + "loss": 0.172, "step": 3039 }, { "epoch": 0.4158686730506156, - "grad_norm": 1.3114243557966423, + "grad_norm": 1.2811333269189649, "learning_rate": 6.3061988858030075e-06, - "loss": 0.2199, + "loss": 0.2206, "step": 3040 }, { "epoch": 0.41600547195622434, - "grad_norm": 1.1640435069404387, + "grad_norm": 1.1513951425646392, "learning_rate": 6.304124553531351e-06, - "loss": 0.193, + "loss": 0.1929, "step": 3041 }, { "epoch": 0.4161422708618331, - "grad_norm": 1.0799170490782677, + "grad_norm": 1.0708644359205612, "learning_rate": 6.302049980388948e-06, - "loss": 0.1879, + "loss": 0.1864, "step": 3042 }, { "epoch": 0.41627906976744183, - "grad_norm": 1.239739315490428, + "grad_norm": 1.2320065075089734, "learning_rate": 6.299975166758972e-06, - "loss": 0.1915, + "loss": 0.1921, "step": 3043 }, { "epoch": 0.41641586867305064, - "grad_norm": 1.2617119534766883, + "grad_norm": 1.2538374659721503, "learning_rate": 6.297900113024639e-06, - "loss": 0.1675, + "loss": 0.1652, "step": 3044 }, { "epoch": 0.4165526675786594, - "grad_norm": 1.20251251300186, + "grad_norm": 1.1925677020127992, "learning_rate": 6.29582481956921e-06, - "loss": 0.1921, + "loss": 0.194, "step": 3045 }, { "epoch": 0.41668946648426813, - "grad_norm": 1.3060525983446425, + "grad_norm": 1.302235950872785, "learning_rate": 6.29374928677599e-06, - "loss": 0.1671, + "loss": 0.1678, "step": 3046 }, { "epoch": 0.4168262653898769, - "grad_norm": 1.439430145790756, + "grad_norm": 1.46799273130687, "learning_rate": 6.291673515028327e-06, - "loss": 0.261, + "loss": 0.2631, "step": 3047 }, { "epoch": 0.41696306429548563, - "grad_norm": 1.449722364260765, + "grad_norm": 1.4666606987012636, "learning_rate": 6.289597504709617e-06, - "loss": 0.2045, + "loss": 0.204, "step": 3048 }, { "epoch": 0.4170998632010944, - "grad_norm": 1.1675713185216183, + "grad_norm": 1.1605241767943788, "learning_rate": 6.287521256203294e-06, - "loss": 0.1611, + "loss": 0.1589, "step": 3049 }, { "epoch": 0.4172366621067031, - "grad_norm": 1.0777765449665206, + "grad_norm": 1.0727063978430644, "learning_rate": 6.285444769892841e-06, - "loss": 0.1789, + "loss": 0.178, "step": 3050 }, { "epoch": 0.4173734610123119, - "grad_norm": 1.1384609786632003, + "grad_norm": 1.137494737858053, "learning_rate": 6.283368046161785e-06, - "loss": 0.1957, + "loss": 0.1983, "step": 3051 }, { "epoch": 0.4175102599179207, - "grad_norm": 1.2213339447876146, + "grad_norm": 1.2150157754257827, "learning_rate": 6.2812910853936925e-06, - "loss": 0.1735, + "loss": 0.1726, "step": 3052 }, { "epoch": 0.4176470588235294, - "grad_norm": 1.2771370748210729, + "grad_norm": 1.2521245019281215, "learning_rate": 6.279213887972179e-06, - "loss": 0.1956, + "loss": 0.1927, "step": 3053 }, { "epoch": 0.4177838577291382, - "grad_norm": 1.363438213652839, + "grad_norm": 1.3367733532708592, "learning_rate": 6.277136454280899e-06, - "loss": 0.2146, + "loss": 0.2114, "step": 3054 }, { "epoch": 0.4179206566347469, - "grad_norm": 1.3647052169393645, + "grad_norm": 1.3291403366773156, "learning_rate": 6.275058784703554e-06, - "loss": 0.2224, + "loss": 0.2217, "step": 3055 }, { "epoch": 0.41805745554035567, - "grad_norm": 1.3820421046484925, + "grad_norm": 1.3684546901414223, "learning_rate": 6.272980879623888e-06, - "loss": 0.2141, + "loss": 0.2158, "step": 3056 }, { "epoch": 0.4181942544459644, - "grad_norm": 1.4291206758873298, + "grad_norm": 1.4012656610369583, "learning_rate": 6.270902739425686e-06, - "loss": 0.1919, + "loss": 0.1858, "step": 3057 }, { "epoch": 0.41833105335157317, - "grad_norm": 1.3310174417975769, + "grad_norm": 1.3074274186251613, "learning_rate": 6.2688243644927825e-06, - "loss": 0.2075, + "loss": 0.2111, "step": 3058 }, { "epoch": 0.4184678522571819, - "grad_norm": 1.189560375184643, + "grad_norm": 1.180348425862876, "learning_rate": 6.266745755209049e-06, - "loss": 0.1862, + "loss": 0.187, "step": 3059 }, { "epoch": 0.4186046511627907, - "grad_norm": 1.3854489366318068, + "grad_norm": 1.3623392049457437, "learning_rate": 6.264666911958404e-06, - "loss": 0.2002, + "loss": 0.1979, "step": 3060 }, { "epoch": 0.41874145006839947, - "grad_norm": 1.1614292804748028, + "grad_norm": 1.14112582998465, "learning_rate": 6.262587835124808e-06, - "loss": 0.2236, + "loss": 0.2237, "step": 3061 }, { "epoch": 0.4188782489740082, - "grad_norm": 1.2255152940508074, + "grad_norm": 1.233613330806065, "learning_rate": 6.2605085250922655e-06, "loss": 0.197, "step": 3062 }, { "epoch": 0.41901504787961696, - "grad_norm": 1.3965845971647806, + "grad_norm": 1.3513687714731162, "learning_rate": 6.2584289822448226e-06, - "loss": 0.2084, + "loss": 0.2091, "step": 3063 }, { "epoch": 0.4191518467852257, - "grad_norm": 1.1708973828527203, + "grad_norm": 1.1665911668989761, "learning_rate": 6.25634920696657e-06, - "loss": 0.1611, + "loss": 0.1612, "step": 3064 }, { "epoch": 0.41928864569083446, - "grad_norm": 1.473814381827543, + "grad_norm": 1.443968639776343, "learning_rate": 6.254269199641637e-06, - "loss": 0.2331, + "loss": 0.2312, "step": 3065 }, { "epoch": 0.4194254445964432, - "grad_norm": 1.274858598987884, + "grad_norm": 1.2569000102700487, "learning_rate": 6.252188960654204e-06, - "loss": 0.2354, + "loss": 0.236, "step": 3066 }, { "epoch": 0.41956224350205196, - "grad_norm": 1.3148133582896495, + "grad_norm": 1.2829479737963896, "learning_rate": 6.250108490388487e-06, - "loss": 0.2172, + "loss": 0.2136, "step": 3067 }, { "epoch": 0.41969904240766076, - "grad_norm": 1.219972655040274, + "grad_norm": 1.207146858855832, "learning_rate": 6.248027789228748e-06, - "loss": 0.1911, + "loss": 0.1903, "step": 3068 }, { "epoch": 0.4198358413132695, - "grad_norm": 1.1352226022776517, + "grad_norm": 1.076583405985108, "learning_rate": 6.24594685755929e-06, - "loss": 0.1466, + "loss": 0.145, "step": 3069 }, { "epoch": 0.41997264021887826, - "grad_norm": 1.1916452724066622, + "grad_norm": 1.1670474419609593, "learning_rate": 6.243865695764459e-06, - "loss": 0.1746, + "loss": 0.1754, "step": 3070 }, { "epoch": 0.420109439124487, - "grad_norm": 1.2920806066461243, + "grad_norm": 1.2966098990127666, "learning_rate": 6.241784304228647e-06, - "loss": 0.1968, + "loss": 0.1973, "step": 3071 }, { "epoch": 0.42024623803009575, - "grad_norm": 1.2129353152695916, + "grad_norm": 1.198175334472774, "learning_rate": 6.239702683336281e-06, - "loss": 0.2012, + "loss": 0.2, "step": 3072 }, { "epoch": 0.4203830369357045, - "grad_norm": 1.395517376577143, + "grad_norm": 1.5015228908109446, "learning_rate": 6.237620833471838e-06, - "loss": 0.2345, + "loss": 0.2326, "step": 3073 }, { "epoch": 0.42051983584131325, - "grad_norm": 1.4345364178532705, + "grad_norm": 1.4197228880209138, "learning_rate": 6.235538755019832e-06, - "loss": 0.2509, + "loss": 0.2518, "step": 3074 }, { "epoch": 0.420656634746922, - "grad_norm": 1.0614856800965955, + "grad_norm": 1.0574630014336934, "learning_rate": 6.233456448364823e-06, - "loss": 0.1721, + "loss": 0.1744, "step": 3075 }, { "epoch": 0.4207934336525308, - "grad_norm": 1.0934538252805612, + "grad_norm": 1.0720844227327575, "learning_rate": 6.231373913891409e-06, - "loss": 0.1839, + "loss": 0.1817, "step": 3076 }, { "epoch": 0.42093023255813955, - "grad_norm": 1.4083998423018338, + "grad_norm": 1.420313559211638, "learning_rate": 6.229291151984233e-06, - "loss": 0.2064, + "loss": 0.2063, "step": 3077 }, { "epoch": 0.4210670314637483, - "grad_norm": 1.081179651211749, + "grad_norm": 1.0689452822032985, "learning_rate": 6.227208163027982e-06, - "loss": 0.2021, + "loss": 0.2023, "step": 3078 }, { "epoch": 0.42120383036935705, - "grad_norm": 1.1523921803435933, + "grad_norm": 1.1249282205096234, "learning_rate": 6.2251249474073795e-06, - "loss": 0.2038, + "loss": 0.2005, "step": 3079 }, { "epoch": 0.4213406292749658, - "grad_norm": 0.8922443665788976, + "grad_norm": 0.8919349278791147, "learning_rate": 6.2230415055071945e-06, - "loss": 0.144, + "loss": 0.1441, "step": 3080 }, { "epoch": 0.42147742818057454, - "grad_norm": 0.9430721370318758, + "grad_norm": 0.9338903372820355, "learning_rate": 6.220957837712238e-06, - "loss": 0.1568, + "loss": 0.1577, "step": 3081 }, { "epoch": 0.4216142270861833, - "grad_norm": 1.2624494480849517, + "grad_norm": 1.2523594952015689, "learning_rate": 6.218873944407361e-06, - "loss": 0.2096, + "loss": 0.2128, "step": 3082 }, { "epoch": 0.42175102599179204, - "grad_norm": 1.1515374086207424, + "grad_norm": 1.1869529892958044, "learning_rate": 6.2167898259774576e-06, - "loss": 0.1577, + "loss": 0.1615, "step": 3083 }, { "epoch": 0.42188782489740084, - "grad_norm": 1.4396152327393497, + "grad_norm": 1.3889583982126965, "learning_rate": 6.21470548280746e-06, - "loss": 0.2247, + "loss": 0.2203, "step": 3084 }, { "epoch": 0.4220246238030096, - "grad_norm": 1.2993462861575065, + "grad_norm": 1.2010719040781364, "learning_rate": 6.212620915282348e-06, - "loss": 0.1989, + "loss": 0.1974, "step": 3085 }, { "epoch": 0.42216142270861834, - "grad_norm": 1.3890311926586805, + "grad_norm": 1.3574456761138212, "learning_rate": 6.210536123787138e-06, - "loss": 0.2082, + "loss": 0.2056, "step": 3086 }, { "epoch": 0.4222982216142271, - "grad_norm": 1.2407585647682609, + "grad_norm": 1.2147923570237154, "learning_rate": 6.208451108706889e-06, - "loss": 0.1909, + "loss": 0.1894, "step": 3087 }, { "epoch": 0.42243502051983584, - "grad_norm": 1.269921925186175, + "grad_norm": 1.2619136638685364, "learning_rate": 6.206365870426703e-06, - "loss": 0.1964, + "loss": 0.1948, "step": 3088 }, { "epoch": 0.4225718194254446, - "grad_norm": 1.0552076676454594, + "grad_norm": 1.1155630944914872, "learning_rate": 6.204280409331721e-06, - "loss": 0.1919, + "loss": 0.1957, "step": 3089 }, { "epoch": 0.42270861833105333, - "grad_norm": 1.3059656458902364, + "grad_norm": 1.2913199489907228, "learning_rate": 6.202194725807127e-06, - "loss": 0.1835, + "loss": 0.18, "step": 3090 }, { "epoch": 0.4228454172366621, - "grad_norm": 1.3576844843600753, + "grad_norm": 1.3512866564632973, "learning_rate": 6.200108820238142e-06, - "loss": 0.1769, + "loss": 0.1757, "step": 3091 }, { "epoch": 0.4229822161422709, - "grad_norm": 1.5297591489327074, + "grad_norm": 1.5312809016882163, "learning_rate": 6.198022693010033e-06, - "loss": 0.2409, + "loss": 0.2432, "step": 3092 }, { "epoch": 0.42311901504787963, - "grad_norm": 1.0986991958204355, + "grad_norm": 1.0914072622938422, "learning_rate": 6.195936344508108e-06, - "loss": 0.177, + "loss": 0.1762, "step": 3093 }, { "epoch": 0.4232558139534884, - "grad_norm": 1.2516398614835225, + "grad_norm": 1.2491431963813922, "learning_rate": 6.193849775117709e-06, - "loss": 0.1909, + "loss": 0.1873, "step": 3094 }, { "epoch": 0.42339261285909713, - "grad_norm": 1.4307411145996187, + "grad_norm": 1.4042748591998386, "learning_rate": 6.19176298522423e-06, - "loss": 0.2408, + "loss": 0.2404, "step": 3095 }, { "epoch": 0.4235294117647059, - "grad_norm": 1.1866679236124271, + "grad_norm": 1.1452477331770707, "learning_rate": 6.189675975213094e-06, - "loss": 0.2053, + "loss": 0.2036, "step": 3096 }, { "epoch": 0.4236662106703146, - "grad_norm": 1.4735628844015536, + "grad_norm": 1.5338822654871391, "learning_rate": 6.187588745469774e-06, - "loss": 0.2343, + "loss": 0.2368, "step": 3097 }, { "epoch": 0.4238030095759234, - "grad_norm": 1.1418415019380583, + "grad_norm": 1.1328272757389002, "learning_rate": 6.1855012963797765e-06, - "loss": 0.1698, + "loss": 0.1711, "step": 3098 }, { "epoch": 0.4239398084815321, - "grad_norm": 1.2355221055470857, + "grad_norm": 1.2147617946305185, "learning_rate": 6.183413628328653e-06, - "loss": 0.1718, + "loss": 0.1729, "step": 3099 }, { "epoch": 0.4240766073871409, - "grad_norm": 2.0950350924707895, + "grad_norm": 1.4118368561245824, "learning_rate": 6.181325741701993e-06, - "loss": 0.2202, + "loss": 0.2151, "step": 3100 }, { "epoch": 0.4240766073871409, - "eval_loss": 0.19308552145957947, - "eval_runtime": 5.9021, - "eval_samples_per_second": 5.083, - "eval_steps_per_second": 1.355, + "eval_loss": 0.1928262561559677, + "eval_runtime": 5.9133, + "eval_samples_per_second": 5.073, + "eval_steps_per_second": 1.353, "step": 3100 }, { "epoch": 0.4242134062927497, - "grad_norm": 1.3532971070953659, + "grad_norm": 1.3026472007068117, "learning_rate": 6.179237636885429e-06, - "loss": 0.2409, + "loss": 0.2376, "step": 3101 }, { "epoch": 0.4243502051983584, - "grad_norm": 1.3235387930141622, + "grad_norm": 1.3320466349368838, "learning_rate": 6.177149314264631e-06, - "loss": 0.2202, + "loss": 0.224, "step": 3102 }, { "epoch": 0.42448700410396717, - "grad_norm": 1.2426275857156224, + "grad_norm": 1.2366564553150279, "learning_rate": 6.175060774225312e-06, - "loss": 0.1887, + "loss": 0.1884, "step": 3103 }, { "epoch": 0.4246238030095759, - "grad_norm": 1.1106637219117244, + "grad_norm": 1.1682243993413959, "learning_rate": 6.172972017153224e-06, "loss": 0.1749, "step": 3104 }, { "epoch": 0.42476060191518467, - "grad_norm": 1.376068670016819, + "grad_norm": 1.362304271825297, "learning_rate": 6.170883043434156e-06, - "loss": 0.2632, + "loss": 0.264, "step": 3105 }, { "epoch": 0.4248974008207934, - "grad_norm": 1.1051173039308775, + "grad_norm": 1.077964327788642, "learning_rate": 6.1687938534539425e-06, - "loss": 0.1729, + "loss": 0.1743, "step": 3106 }, { "epoch": 0.42503419972640216, - "grad_norm": 1.1498938004225672, + "grad_norm": 1.1265486140976013, "learning_rate": 6.166704447598453e-06, - "loss": 0.1677, + "loss": 0.165, "step": 3107 }, { "epoch": 0.42517099863201097, - "grad_norm": 1.263911223761578, + "grad_norm": 1.288000793752681, "learning_rate": 6.1646148262535996e-06, - "loss": 0.2026, + "loss": 0.2063, "step": 3108 }, { "epoch": 0.4253077975376197, - "grad_norm": 1.4392171058910572, + "grad_norm": 1.422772435761083, "learning_rate": 6.162524989805334e-06, - "loss": 0.2019, + "loss": 0.2044, "step": 3109 }, { "epoch": 0.42544459644322846, - "grad_norm": 1.3618889541002177, + "grad_norm": 1.408302193907209, "learning_rate": 6.160434938639649e-06, - "loss": 0.1863, + "loss": 0.1877, "step": 3110 }, { "epoch": 0.4255813953488372, - "grad_norm": 1.0985867080781722, + "grad_norm": 1.1093246830774002, "learning_rate": 6.158344673142573e-06, - "loss": 0.1573, + "loss": 0.1579, "step": 3111 }, { "epoch": 0.42571819425444596, - "grad_norm": 1.3760629780065752, + "grad_norm": 1.3397035521870575, "learning_rate": 6.156254193700178e-06, - "loss": 0.2024, + "loss": 0.2013, "step": 3112 }, { "epoch": 0.4258549931600547, - "grad_norm": 1.432743809561242, + "grad_norm": 1.4582523902027726, "learning_rate": 6.154163500698571e-06, - "loss": 0.2176, + "loss": 0.2237, "step": 3113 }, { "epoch": 0.42599179206566345, - "grad_norm": 1.0929249345069405, + "grad_norm": 1.051792040802391, "learning_rate": 6.152072594523905e-06, - "loss": 0.1918, + "loss": 0.1912, "step": 3114 }, { "epoch": 0.4261285909712722, - "grad_norm": 1.2244303625738737, + "grad_norm": 1.2380011188910631, "learning_rate": 6.149981475562366e-06, - "loss": 0.2075, + "loss": 0.2097, "step": 3115 }, { "epoch": 0.426265389876881, - "grad_norm": 1.1920651806667482, + "grad_norm": 1.1963763074712233, "learning_rate": 6.1478901442001825e-06, - "loss": 0.2021, + "loss": 0.2024, "step": 3116 }, { "epoch": 0.42640218878248975, - "grad_norm": 1.1558926133575396, + "grad_norm": 1.1309495818151625, "learning_rate": 6.145798600823622e-06, - "loss": 0.1736, + "loss": 0.1743, "step": 3117 }, { "epoch": 0.4265389876880985, - "grad_norm": 1.2168956809374556, + "grad_norm": 1.2097709676174435, "learning_rate": 6.143706845818993e-06, - "loss": 0.1876, + "loss": 0.1877, "step": 3118 }, { "epoch": 0.42667578659370725, - "grad_norm": 1.2221821754934938, + "grad_norm": 1.23198846474051, "learning_rate": 6.1416148795726375e-06, - "loss": 0.1717, + "loss": 0.172, "step": 3119 }, { "epoch": 0.426812585499316, - "grad_norm": 1.2669663117865337, + "grad_norm": 1.2308432070043593, "learning_rate": 6.139522702470942e-06, - "loss": 0.1726, + "loss": 0.1737, "step": 3120 }, { "epoch": 0.42694938440492475, - "grad_norm": 1.2600160277695183, + "grad_norm": 1.2420899236608358, "learning_rate": 6.137430314900329e-06, - "loss": 0.1718, + "loss": 0.1712, "step": 3121 }, { "epoch": 0.4270861833105335, - "grad_norm": 1.0564250772947859, + "grad_norm": 1.0555336804185222, "learning_rate": 6.1353377172472615e-06, - "loss": 0.1619, + "loss": 0.1631, "step": 3122 }, { "epoch": 0.42722298221614224, - "grad_norm": 1.4396268101329979, + "grad_norm": 1.4373533473265672, "learning_rate": 6.133244909898238e-06, - "loss": 0.1942, + "loss": 0.1941, "step": 3123 }, { "epoch": 0.42735978112175105, - "grad_norm": 1.4696597711206625, + "grad_norm": 1.4591007011761794, "learning_rate": 6.1311518932398015e-06, - "loss": 0.2056, + "loss": 0.2032, "step": 3124 }, { "epoch": 0.4274965800273598, - "grad_norm": 1.251082554560237, + "grad_norm": 1.2296568524797913, "learning_rate": 6.12905866765853e-06, - "loss": 0.1775, + "loss": 0.1733, "step": 3125 }, { "epoch": 0.42763337893296854, - "grad_norm": 1.578506325423231, + "grad_norm": 1.5983701582152072, "learning_rate": 6.126965233541041e-06, - "loss": 0.2363, + "loss": 0.2385, "step": 3126 }, { "epoch": 0.4277701778385773, - "grad_norm": 1.1212530478546567, + "grad_norm": 1.1244520265069953, "learning_rate": 6.1248715912739885e-06, - "loss": 0.1736, + "loss": 0.1735, "step": 3127 }, { "epoch": 0.42790697674418604, - "grad_norm": 1.4596813434707296, + "grad_norm": 1.422585731970573, "learning_rate": 6.122777741244067e-06, - "loss": 0.1831, + "loss": 0.1806, "step": 3128 }, { "epoch": 0.4280437756497948, - "grad_norm": 1.4175863335847796, + "grad_norm": 1.388845488022246, "learning_rate": 6.120683683838009e-06, - "loss": 0.1842, + "loss": 0.1805, "step": 3129 }, { "epoch": 0.42818057455540354, - "grad_norm": 1.3077532511778285, + "grad_norm": 1.3070289941438133, "learning_rate": 6.118589419442584e-06, - "loss": 0.2018, + "loss": 0.2057, "step": 3130 }, { "epoch": 0.4283173734610123, - "grad_norm": 1.2151673158320693, + "grad_norm": 1.2284379455457792, "learning_rate": 6.116494948444605e-06, - "loss": 0.1958, + "loss": 0.1969, "step": 3131 }, { "epoch": 0.4284541723666211, - "grad_norm": 1.475653242043237, + "grad_norm": 1.4587759107209253, "learning_rate": 6.114400271230914e-06, - "loss": 0.2274, + "loss": 0.2242, "step": 3132 }, { "epoch": 0.42859097127222984, - "grad_norm": 1.3765545928342293, + "grad_norm": 1.7553250746460227, "learning_rate": 6.112305388188398e-06, - "loss": 0.1947, + "loss": 0.194, "step": 3133 }, { "epoch": 0.4287277701778386, - "grad_norm": 1.4074365567409055, + "grad_norm": 1.3779564382846776, "learning_rate": 6.110210299703982e-06, - "loss": 0.2349, + "loss": 0.2336, "step": 3134 }, { "epoch": 0.42886456908344733, - "grad_norm": 1.0854116770162148, + "grad_norm": 1.0992232360700451, "learning_rate": 6.1081150061646255e-06, - "loss": 0.1968, + "loss": 0.1976, "step": 3135 }, { "epoch": 0.4290013679890561, - "grad_norm": 1.3094236134768544, + "grad_norm": 1.3197076892684947, "learning_rate": 6.106019507957328e-06, - "loss": 0.1999, + "loss": 0.2015, "step": 3136 }, { "epoch": 0.42913816689466483, - "grad_norm": 1.445559489973292, + "grad_norm": 1.459157314072455, "learning_rate": 6.103923805469127e-06, - "loss": 0.2068, + "loss": 0.206, "step": 3137 }, { "epoch": 0.4292749658002736, - "grad_norm": 1.3963189718500113, + "grad_norm": 1.407581859799565, "learning_rate": 6.101827899087094e-06, - "loss": 0.2612, + "loss": 0.2634, "step": 3138 }, { "epoch": 0.4294117647058823, - "grad_norm": 1.0195262713644588, + "grad_norm": 1.0037279193939495, "learning_rate": 6.099731789198344e-06, - "loss": 0.1612, + "loss": 0.1599, "step": 3139 }, { "epoch": 0.42954856361149113, - "grad_norm": 1.3618541943253482, + "grad_norm": 1.3350343083823983, "learning_rate": 6.097635476190026e-06, - "loss": 0.2168, + "loss": 0.2154, "step": 3140 }, { "epoch": 0.4296853625170999, - "grad_norm": 1.0279934477285202, + "grad_norm": 1.0045911382330202, "learning_rate": 6.0955389604493275e-06, - "loss": 0.1798, + "loss": 0.1779, "step": 3141 }, { "epoch": 0.4298221614227086, - "grad_norm": 1.1975914579889984, + "grad_norm": 1.20007216466347, "learning_rate": 6.093442242363474e-06, - "loss": 0.174, + "loss": 0.1759, "step": 3142 }, { "epoch": 0.4299589603283174, - "grad_norm": 1.157684727564595, + "grad_norm": 1.1580581023762453, "learning_rate": 6.091345322319727e-06, - "loss": 0.1767, + "loss": 0.1769, "step": 3143 }, { "epoch": 0.4300957592339261, - "grad_norm": 1.252119074437994, + "grad_norm": 1.2473473073766448, "learning_rate": 6.089248200705386e-06, - "loss": 0.2078, + "loss": 0.2084, "step": 3144 }, { "epoch": 0.43023255813953487, - "grad_norm": 1.2151201645912435, + "grad_norm": 1.2231675552201278, "learning_rate": 6.087150877907786e-06, - "loss": 0.1842, + "loss": 0.1844, "step": 3145 }, { "epoch": 0.4303693570451436, - "grad_norm": 1.4204497965127092, + "grad_norm": 1.4013645202391183, "learning_rate": 6.085053354314302e-06, - "loss": 0.2402, + "loss": 0.2436, "step": 3146 }, { "epoch": 0.43050615595075237, - "grad_norm": 1.3412346390861036, + "grad_norm": 1.3263174066416172, "learning_rate": 6.082955630312347e-06, - "loss": 0.2012, + "loss": 0.1998, "step": 3147 }, { "epoch": 0.43064295485636117, - "grad_norm": 1.271134046545374, + "grad_norm": 1.260169286985461, "learning_rate": 6.0808577062893644e-06, - "loss": 0.2204, + "loss": 0.2187, "step": 3148 }, { "epoch": 0.4307797537619699, - "grad_norm": 1.3136199800211308, + "grad_norm": 1.2670663814536849, "learning_rate": 6.078759582632845e-06, - "loss": 0.192, + "loss": 0.1903, "step": 3149 }, { "epoch": 0.43091655266757867, - "grad_norm": 1.4053003934033974, + "grad_norm": 1.3961984017100206, "learning_rate": 6.076661259730305e-06, - "loss": 0.2169, + "loss": 0.2177, "step": 3150 }, { "epoch": 0.4310533515731874, - "grad_norm": 1.29879009014704, + "grad_norm": 1.3004549187769683, "learning_rate": 6.0745627379693065e-06, - "loss": 0.2041, + "loss": 0.2055, "step": 3151 }, { "epoch": 0.43119015047879616, - "grad_norm": 1.1828949254362096, + "grad_norm": 1.175287131536144, "learning_rate": 6.072464017737443e-06, - "loss": 0.1861, + "loss": 0.1858, "step": 3152 }, { "epoch": 0.4313269493844049, - "grad_norm": 0.9510603533080373, + "grad_norm": 0.9643084938524166, "learning_rate": 6.070365099422346e-06, - "loss": 0.1504, + "loss": 0.1521, "step": 3153 }, { "epoch": 0.43146374829001366, - "grad_norm": 1.4399884867838961, + "grad_norm": 1.478005243515807, "learning_rate": 6.068265983411685e-06, - "loss": 0.2141, + "loss": 0.2151, "step": 3154 }, { "epoch": 0.4316005471956224, - "grad_norm": 1.2858957037757017, + "grad_norm": 1.2516029849743509, "learning_rate": 6.066166670093167e-06, - "loss": 0.1957, + "loss": 0.196, "step": 3155 }, { "epoch": 0.4317373461012312, - "grad_norm": 1.3513877964993786, + "grad_norm": 1.3402565563758724, "learning_rate": 6.064067159854528e-06, - "loss": 0.2372, + "loss": 0.2357, "step": 3156 }, { "epoch": 0.43187414500683996, - "grad_norm": 1.386529262961155, + "grad_norm": 1.3895705636258984, "learning_rate": 6.061967453083552e-06, - "loss": 0.1934, + "loss": 0.1922, "step": 3157 }, { "epoch": 0.4320109439124487, - "grad_norm": 1.2751013610403894, + "grad_norm": 1.235583232098702, "learning_rate": 6.05986755016805e-06, - "loss": 0.1767, + "loss": 0.1749, "step": 3158 }, { "epoch": 0.43214774281805746, - "grad_norm": 1.3505364119856027, + "grad_norm": 1.350008015781317, "learning_rate": 6.057767451495872e-06, "loss": 0.2167, "step": 3159 }, { "epoch": 0.4322845417236662, - "grad_norm": 1.0480176313446088, + "grad_norm": 1.0365155406761535, "learning_rate": 6.0556671574549055e-06, - "loss": 0.1733, + "loss": 0.173, "step": 3160 }, { "epoch": 0.43242134062927495, - "grad_norm": 1.5144032500344642, + "grad_norm": 1.501414731184165, "learning_rate": 6.053566668433072e-06, - "loss": 0.2378, + "loss": 0.2358, "step": 3161 }, { "epoch": 0.4325581395348837, - "grad_norm": 1.3091534072635718, + "grad_norm": 1.307063277929452, "learning_rate": 6.051465984818332e-06, - "loss": 0.2429, + "loss": 0.2431, "step": 3162 }, { "epoch": 0.43269493844049245, - "grad_norm": 1.4167453085030586, + "grad_norm": 1.4244795566099833, "learning_rate": 6.049365106998679e-06, - "loss": 0.2238, + "loss": 0.2255, "step": 3163 }, { "epoch": 0.43283173734610125, - "grad_norm": 1.264799380560274, + "grad_norm": 1.243590376856035, "learning_rate": 6.047264035362143e-06, - "loss": 0.1917, + "loss": 0.1902, "step": 3164 }, { "epoch": 0.43296853625171, - "grad_norm": 0.945150578989285, + "grad_norm": 1.062892182993091, "learning_rate": 6.045162770296791e-06, - "loss": 0.1469, + "loss": 0.1496, "step": 3165 }, { "epoch": 0.43310533515731875, - "grad_norm": 1.3079993854497114, + "grad_norm": 1.3021771325714482, "learning_rate": 6.043061312190723e-06, - "loss": 0.2344, + "loss": 0.2342, "step": 3166 }, { "epoch": 0.4332421340629275, - "grad_norm": 0.9879491779225611, + "grad_norm": 0.968538677026631, "learning_rate": 6.0409596614320795e-06, - "loss": 0.1787, + "loss": 0.1764, "step": 3167 }, { "epoch": 0.43337893296853625, - "grad_norm": 1.1352767416586544, + "grad_norm": 1.0971121312324803, "learning_rate": 6.038857818409033e-06, - "loss": 0.1761, + "loss": 0.1758, "step": 3168 }, { "epoch": 0.433515731874145, - "grad_norm": 1.3710145894550485, + "grad_norm": 1.352009883622734, "learning_rate": 6.036755783509792e-06, - "loss": 0.2121, + "loss": 0.2119, "step": 3169 }, { "epoch": 0.43365253077975374, - "grad_norm": 1.4147727191195458, + "grad_norm": 1.4009628229793534, "learning_rate": 6.034653557122599e-06, - "loss": 0.2097, + "loss": 0.2115, "step": 3170 }, { "epoch": 0.4337893296853625, - "grad_norm": 1.483757888502293, + "grad_norm": 1.4720276443416533, "learning_rate": 6.032551139635735e-06, - "loss": 0.2234, + "loss": 0.2191, "step": 3171 }, { "epoch": 0.4339261285909713, - "grad_norm": 1.3823851876997268, + "grad_norm": 1.3661842766808008, "learning_rate": 6.0304485314375136e-06, - "loss": 0.2481, + "loss": 0.2457, "step": 3172 }, { "epoch": 0.43406292749658004, - "grad_norm": 0.9510962101395161, + "grad_norm": 0.9481746130796717, "learning_rate": 6.028345732916286e-06, - "loss": 0.1515, + "loss": 0.151, "step": 3173 }, { "epoch": 0.4341997264021888, - "grad_norm": 1.2621997108675291, + "grad_norm": 1.2832027289924004, "learning_rate": 6.026242744460439e-06, - "loss": 0.1927, + "loss": 0.1955, "step": 3174 }, { "epoch": 0.43433652530779754, - "grad_norm": 1.1749657912977927, + "grad_norm": 1.1758392892703324, "learning_rate": 6.024139566458389e-06, - "loss": 0.1676, + "loss": 0.1695, "step": 3175 }, { "epoch": 0.4344733242134063, - "grad_norm": 1.3815024918368253, + "grad_norm": 1.4012820943071154, "learning_rate": 6.022036199298594e-06, - "loss": 0.195, + "loss": 0.1973, "step": 3176 }, { "epoch": 0.43461012311901503, - "grad_norm": 1.250557067810704, + "grad_norm": 1.2300737317536066, "learning_rate": 6.019932643369544e-06, - "loss": 0.2253, + "loss": 0.226, "step": 3177 }, { "epoch": 0.4347469220246238, - "grad_norm": 1.0511824174152329, + "grad_norm": 1.0439902739341056, "learning_rate": 6.017828899059763e-06, - "loss": 0.1446, + "loss": 0.1439, "step": 3178 }, { "epoch": 0.43488372093023253, - "grad_norm": 1.3127798634695038, + "grad_norm": 1.3016162875419306, "learning_rate": 6.015724966757812e-06, - "loss": 0.2347, + "loss": 0.2316, "step": 3179 }, { "epoch": 0.43502051983584133, - "grad_norm": 1.2030653136508223, + "grad_norm": 1.1918373012235501, "learning_rate": 6.013620846852284e-06, - "loss": 0.1904, + "loss": 0.192, "step": 3180 }, { "epoch": 0.4351573187414501, - "grad_norm": 1.3125751363223641, + "grad_norm": 1.2964325517082131, "learning_rate": 6.01151653973181e-06, - "loss": 0.2073, + "loss": 0.2074, "step": 3181 }, { "epoch": 0.43529411764705883, - "grad_norm": 1.4225363804805746, + "grad_norm": 1.4372086933612804, "learning_rate": 6.009412045785051e-06, - "loss": 0.2218, + "loss": 0.2244, "step": 3182 }, { "epoch": 0.4354309165526676, - "grad_norm": 1.5281156991788039, + "grad_norm": 1.4899209141056704, "learning_rate": 6.00730736540071e-06, - "loss": 0.1978, + "loss": 0.1953, "step": 3183 }, { "epoch": 0.4355677154582763, - "grad_norm": 1.0383741644527615, + "grad_norm": 1.030913912193497, "learning_rate": 6.005202498967516e-06, - "loss": 0.1492, + "loss": 0.149, "step": 3184 }, { "epoch": 0.4357045143638851, - "grad_norm": 1.2739434901808864, + "grad_norm": 1.267816189420223, "learning_rate": 6.0030974468742375e-06, - "loss": 0.1865, + "loss": 0.1873, "step": 3185 }, { "epoch": 0.4358413132694938, - "grad_norm": 1.085687826347015, + "grad_norm": 1.0853793502138946, "learning_rate": 6.0009922095096765e-06, - "loss": 0.1532, + "loss": 0.1535, "step": 3186 }, { "epoch": 0.4359781121751026, - "grad_norm": 1.0226727564988276, + "grad_norm": 1.0149182138697606, "learning_rate": 5.9988867872626665e-06, - "loss": 0.1756, + "loss": 0.1767, "step": 3187 }, { "epoch": 0.4361149110807114, - "grad_norm": 1.3144702150526968, + "grad_norm": 1.3041403556190385, "learning_rate": 5.996781180522079e-06, - "loss": 0.2063, + "loss": 0.206, "step": 3188 }, { "epoch": 0.4362517099863201, - "grad_norm": 1.5955507938547413, + "grad_norm": 1.5722991264263013, "learning_rate": 5.994675389676817e-06, - "loss": 0.2229, + "loss": 0.2244, "step": 3189 }, { "epoch": 0.4363885088919289, - "grad_norm": 1.3734817056612894, + "grad_norm": 1.3682788155080836, "learning_rate": 5.992569415115819e-06, - "loss": 0.1767, + "loss": 0.1802, "step": 3190 }, { "epoch": 0.4365253077975376, - "grad_norm": 1.2664176410903616, + "grad_norm": 1.3177267498295935, "learning_rate": 5.9904632572280565e-06, - "loss": 0.1902, + "loss": 0.1949, "step": 3191 }, { "epoch": 0.43666210670314637, - "grad_norm": 1.4962700257088217, + "grad_norm": 1.8456271383489988, "learning_rate": 5.988356916402536e-06, - "loss": 0.2113, + "loss": 0.2273, "step": 3192 }, { "epoch": 0.4367989056087551, - "grad_norm": 1.4562371435626287, + "grad_norm": 1.438574883993082, "learning_rate": 5.986250393028296e-06, - "loss": 0.2373, + "loss": 0.2335, "step": 3193 }, { "epoch": 0.43693570451436387, - "grad_norm": 1.1173270288874588, + "grad_norm": 1.1037193327191213, "learning_rate": 5.9841436874944095e-06, - "loss": 0.1688, + "loss": 0.169, "step": 3194 }, { "epoch": 0.4370725034199726, - "grad_norm": 1.3821815284531824, + "grad_norm": 1.3755703311255651, "learning_rate": 5.982036800189983e-06, - "loss": 0.1993, + "loss": 0.199, "step": 3195 }, { "epoch": 0.4372093023255814, - "grad_norm": 1.230865364615154, + "grad_norm": 1.210727400094613, "learning_rate": 5.979929731504158e-06, - "loss": 0.1948, + "loss": 0.1957, "step": 3196 }, { "epoch": 0.43734610123119017, - "grad_norm": 1.0658583257491256, + "grad_norm": 1.0571587282107175, "learning_rate": 5.9778224818261065e-06, - "loss": 0.165, + "loss": 0.1651, "step": 3197 }, { "epoch": 0.4374829001367989, - "grad_norm": 1.4503874817256377, + "grad_norm": 1.4437718491048204, "learning_rate": 5.975715051545039e-06, - "loss": 0.2176, + "loss": 0.2178, "step": 3198 }, { "epoch": 0.43761969904240766, - "grad_norm": 1.130841409590372, + "grad_norm": 1.1263756905097666, "learning_rate": 5.973607441050194e-06, - "loss": 0.1724, + "loss": 0.1723, "step": 3199 }, { "epoch": 0.4377564979480164, - "grad_norm": 1.2520046210265194, + "grad_norm": 1.2464201417509708, "learning_rate": 5.971499650730847e-06, - "loss": 0.1829, + "loss": 0.1837, "step": 3200 }, { "epoch": 0.4377564979480164, - "eval_loss": 0.1910461187362671, - "eval_runtime": 5.9197, - "eval_samples_per_second": 5.068, - "eval_steps_per_second": 1.351, + "eval_loss": 0.1912027895450592, + "eval_runtime": 5.9371, + "eval_samples_per_second": 5.053, + "eval_steps_per_second": 1.347, "step": 3200 }, { "epoch": 0.43789329685362516, - "grad_norm": 1.231151742408053, + "grad_norm": 1.211979451929056, "learning_rate": 5.969391680976302e-06, - "loss": 0.2209, + "loss": 0.2207, "step": 3201 }, { "epoch": 0.4380300957592339, - "grad_norm": 1.2579916586398923, + "grad_norm": 1.2444441518099247, "learning_rate": 5.967283532175901e-06, - "loss": 0.2009, + "loss": 0.2003, "step": 3202 }, { "epoch": 0.43816689466484265, - "grad_norm": 1.3666298668802948, + "grad_norm": 1.2658251457177743, "learning_rate": 5.965175204719018e-06, - "loss": 0.2059, + "loss": 0.2084, "step": 3203 }, { "epoch": 0.43830369357045146, - "grad_norm": 1.2672152861080543, + "grad_norm": 1.2366612553078995, "learning_rate": 5.96306669899506e-06, - "loss": 0.2232, + "loss": 0.2198, "step": 3204 }, { "epoch": 0.4384404924760602, - "grad_norm": 1.292765937733365, + "grad_norm": 1.2786653556468155, "learning_rate": 5.960958015393465e-06, - "loss": 0.1956, + "loss": 0.1978, "step": 3205 }, { "epoch": 0.43857729138166895, - "grad_norm": 1.1370674139108383, + "grad_norm": 1.1395250815370839, "learning_rate": 5.958849154303704e-06, - "loss": 0.1722, + "loss": 0.1741, "step": 3206 }, { "epoch": 0.4387140902872777, - "grad_norm": 1.2346112947689742, + "grad_norm": 1.235334839226921, "learning_rate": 5.956740116115285e-06, - "loss": 0.1767, + "loss": 0.1772, "step": 3207 }, { "epoch": 0.43885088919288645, - "grad_norm": 1.303080414108567, + "grad_norm": 1.2843782450446108, "learning_rate": 5.954630901217742e-06, - "loss": 0.2098, + "loss": 0.2111, "step": 3208 }, { "epoch": 0.4389876880984952, - "grad_norm": 1.142025064094477, + "grad_norm": 1.1161557894779743, "learning_rate": 5.952521510000651e-06, - "loss": 0.2024, + "loss": 0.2014, "step": 3209 }, { "epoch": 0.43912448700410395, - "grad_norm": 1.1977997029674452, + "grad_norm": 1.1710019670645846, "learning_rate": 5.950411942853608e-06, - "loss": 0.184, + "loss": 0.1837, "step": 3210 }, { "epoch": 0.4392612859097127, - "grad_norm": 1.4537619308733438, + "grad_norm": 1.431683602353245, "learning_rate": 5.948302200166251e-06, - "loss": 0.1972, + "loss": 0.1965, "step": 3211 }, { "epoch": 0.4393980848153215, - "grad_norm": 1.224588833569942, + "grad_norm": 1.1890429268027718, "learning_rate": 5.9461922823282495e-06, - "loss": 0.1897, + "loss": 0.1874, "step": 3212 }, { "epoch": 0.43953488372093025, - "grad_norm": 1.1474107309182449, + "grad_norm": 1.149095694006075, "learning_rate": 5.944082189729301e-06, - "loss": 0.1792, + "loss": 0.176, "step": 3213 }, { "epoch": 0.439671682626539, - "grad_norm": 1.3337292694736518, + "grad_norm": 1.308620580401071, "learning_rate": 5.941971922759141e-06, - "loss": 0.1939, + "loss": 0.1914, "step": 3214 }, { "epoch": 0.43980848153214774, - "grad_norm": 1.1091584948085713, + "grad_norm": 1.094683977505674, "learning_rate": 5.939861481807532e-06, - "loss": 0.1697, + "loss": 0.1701, "step": 3215 }, { "epoch": 0.4399452804377565, - "grad_norm": 1.0191653991737435, + "grad_norm": 1.0070924944076307, "learning_rate": 5.93775086726427e-06, - "loss": 0.1751, + "loss": 0.1743, "step": 3216 }, { "epoch": 0.44008207934336524, - "grad_norm": 1.1750646303064491, + "grad_norm": 1.1535589995154036, "learning_rate": 5.935640079519186e-06, - "loss": 0.219, + "loss": 0.2175, "step": 3217 }, { "epoch": 0.440218878248974, - "grad_norm": 1.3165770114547986, + "grad_norm": 1.3218636421106145, "learning_rate": 5.933529118962138e-06, - "loss": 0.2139, + "loss": 0.2165, "step": 3218 }, { "epoch": 0.44035567715458274, - "grad_norm": 1.635975965442301, + "grad_norm": 1.6086567040619413, "learning_rate": 5.931417985983022e-06, - "loss": 0.2431, + "loss": 0.2428, "step": 3219 }, { "epoch": 0.44049247606019154, - "grad_norm": 1.2330439495033334, + "grad_norm": 1.2317178062151424, "learning_rate": 5.929306680971759e-06, "loss": 0.191, "step": 3220 }, { "epoch": 0.4406292749658003, - "grad_norm": 1.3523419929478944, + "grad_norm": 1.3399855728871841, "learning_rate": 5.927195204318309e-06, - "loss": 0.1865, + "loss": 0.1848, "step": 3221 }, { "epoch": 0.44076607387140904, - "grad_norm": 1.1118824844299626, + "grad_norm": 1.1059614725403089, "learning_rate": 5.925083556412658e-06, - "loss": 0.1835, + "loss": 0.1842, "step": 3222 }, { "epoch": 0.4409028727770178, - "grad_norm": 0.9612196375985245, + "grad_norm": 0.9729643720317759, "learning_rate": 5.922971737644825e-06, - "loss": 0.1885, + "loss": 0.1908, "step": 3223 }, { "epoch": 0.44103967168262653, - "grad_norm": 1.3817568647545895, + "grad_norm": 1.3668040227571205, "learning_rate": 5.920859748404863e-06, - "loss": 0.2061, + "loss": 0.2055, "step": 3224 }, { "epoch": 0.4411764705882353, - "grad_norm": 1.0415220723998992, + "grad_norm": 1.0346428673344326, "learning_rate": 5.918747589082853e-06, - "loss": 0.1657, + "loss": 0.1651, "step": 3225 }, { "epoch": 0.44131326949384403, - "grad_norm": 1.3205498926568655, + "grad_norm": 1.2725679056966273, "learning_rate": 5.916635260068909e-06, - "loss": 0.178, + "loss": 0.1755, "step": 3226 }, { "epoch": 0.4414500683994528, - "grad_norm": 1.0723013812415738, + "grad_norm": 1.056929903430041, "learning_rate": 5.914522761753179e-06, - "loss": 0.176, + "loss": 0.1742, "step": 3227 }, { "epoch": 0.4415868673050616, - "grad_norm": 1.591225961651302, + "grad_norm": 1.5906913416580557, "learning_rate": 5.912410094525835e-06, - "loss": 0.1997, + "loss": 0.2003, "step": 3228 }, { "epoch": 0.44172366621067033, - "grad_norm": 1.3064119641364116, + "grad_norm": 1.3207204242891162, "learning_rate": 5.910297258777091e-06, - "loss": 0.1707, + "loss": 0.171, "step": 3229 }, { "epoch": 0.4418604651162791, - "grad_norm": 1.120583093470388, + "grad_norm": 1.0992097409618518, "learning_rate": 5.908184254897183e-06, - "loss": 0.1834, + "loss": 0.1831, "step": 3230 }, { "epoch": 0.4419972640218878, - "grad_norm": 1.3473694194415904, + "grad_norm": 1.346566829609748, "learning_rate": 5.90607108327638e-06, - "loss": 0.1873, + "loss": 0.1871, "step": 3231 }, { "epoch": 0.4421340629274966, - "grad_norm": 1.1771451154184287, + "grad_norm": 1.1605947181448604, "learning_rate": 5.903957744304984e-06, - "loss": 0.1534, + "loss": 0.1526, "step": 3232 }, { "epoch": 0.4422708618331053, - "grad_norm": 1.274901963733443, + "grad_norm": 1.2876310759491316, "learning_rate": 5.901844238373328e-06, - "loss": 0.2038, + "loss": 0.2043, "step": 3233 }, { "epoch": 0.44240766073871407, - "grad_norm": 1.3581048735606642, + "grad_norm": 1.3574636454490743, "learning_rate": 5.899730565871774e-06, - "loss": 0.1797, + "loss": 0.181, "step": 3234 }, { "epoch": 0.4425444596443228, - "grad_norm": 1.185536174540018, + "grad_norm": 1.1867366653712654, "learning_rate": 5.897616727190715e-06, - "loss": 0.1486, + "loss": 0.1497, "step": 3235 }, { "epoch": 0.4426812585499316, - "grad_norm": 1.415568059113793, + "grad_norm": 1.4302037074284464, "learning_rate": 5.895502722720575e-06, - "loss": 0.2039, + "loss": 0.2059, "step": 3236 }, { "epoch": 0.44281805745554037, - "grad_norm": 1.2533411134069947, + "grad_norm": 1.2565512981688955, "learning_rate": 5.893388552851811e-06, - "loss": 0.1838, + "loss": 0.1843, "step": 3237 }, { "epoch": 0.4429548563611491, - "grad_norm": 1.2986028994074945, + "grad_norm": 1.3234308329940425, "learning_rate": 5.891274217974907e-06, - "loss": 0.1951, + "loss": 0.1968, "step": 3238 }, { "epoch": 0.44309165526675787, - "grad_norm": 1.2121320921532068, + "grad_norm": 1.1897934528405396, "learning_rate": 5.8891597184803796e-06, - "loss": 0.1438, + "loss": 0.1423, "step": 3239 }, { "epoch": 0.4432284541723666, - "grad_norm": 1.0449182222791675, + "grad_norm": 1.0322169191022836, "learning_rate": 5.887045054758775e-06, - "loss": 0.1837, + "loss": 0.1857, "step": 3240 }, { "epoch": 0.44336525307797536, - "grad_norm": 1.306136752102189, + "grad_norm": 1.3021571463699702, "learning_rate": 5.884930227200668e-06, - "loss": 0.1945, + "loss": 0.1908, "step": 3241 }, { "epoch": 0.4435020519835841, - "grad_norm": 1.3186087314078587, + "grad_norm": 1.281408547694619, "learning_rate": 5.882815236196669e-06, - "loss": 0.1878, + "loss": 0.1834, "step": 3242 }, { "epoch": 0.44363885088919286, - "grad_norm": 1.2114553348726365, + "grad_norm": 1.208604194624248, "learning_rate": 5.880700082137412e-06, - "loss": 0.2031, + "loss": 0.2022, "step": 3243 }, { "epoch": 0.44377564979480166, - "grad_norm": 1.2800590283736646, + "grad_norm": 1.2790495708738137, "learning_rate": 5.878584765413564e-06, - "loss": 0.1806, + "loss": 0.1828, "step": 3244 }, { "epoch": 0.4439124487004104, - "grad_norm": 1.288680892238454, + "grad_norm": 1.2886870250607472, "learning_rate": 5.876469286415826e-06, - "loss": 0.1951, + "loss": 0.1979, "step": 3245 }, { "epoch": 0.44404924760601916, - "grad_norm": 1.2644443906692115, + "grad_norm": 1.2664399035347242, "learning_rate": 5.874353645534923e-06, - "loss": 0.1896, + "loss": 0.1889, "step": 3246 }, { "epoch": 0.4441860465116279, - "grad_norm": 1.257020066930711, + "grad_norm": 1.2607059005983847, "learning_rate": 5.872237843161613e-06, - "loss": 0.2006, + "loss": 0.1995, "step": 3247 }, { "epoch": 0.44432284541723666, - "grad_norm": 1.4353605820396007, + "grad_norm": 1.4140327797691261, "learning_rate": 5.870121879686679e-06, - "loss": 0.1837, + "loss": 0.1827, "step": 3248 }, { "epoch": 0.4444596443228454, - "grad_norm": 1.3221514513794566, + "grad_norm": 1.3221891667168624, "learning_rate": 5.868005755500943e-06, - "loss": 0.2264, + "loss": 0.2281, "step": 3249 }, { "epoch": 0.44459644322845415, - "grad_norm": 1.3449355182598184, + "grad_norm": 1.3515600237489684, "learning_rate": 5.865889470995249e-06, - "loss": 0.2254, + "loss": 0.2234, "step": 3250 }, { "epoch": 0.4447332421340629, - "grad_norm": 1.160806412530664, + "grad_norm": 1.1444036585689206, "learning_rate": 5.863773026560473e-06, - "loss": 0.1676, + "loss": 0.1668, "step": 3251 }, { "epoch": 0.4448700410396717, - "grad_norm": 1.2851803586700667, + "grad_norm": 1.2654658416417992, "learning_rate": 5.8616564225875205e-06, - "loss": 0.2224, + "loss": 0.2219, "step": 3252 }, { "epoch": 0.44500683994528045, - "grad_norm": 1.3853029451845513, + "grad_norm": 1.360682530641626, "learning_rate": 5.859539659467327e-06, - "loss": 0.2136, + "loss": 0.2133, "step": 3253 }, { "epoch": 0.4451436388508892, - "grad_norm": 1.4078924685164178, + "grad_norm": 1.3767666189264105, "learning_rate": 5.857422737590858e-06, - "loss": 0.244, + "loss": 0.2426, "step": 3254 }, { "epoch": 0.44528043775649795, - "grad_norm": 1.1569690551816818, + "grad_norm": 1.1329804472611444, "learning_rate": 5.855305657349105e-06, - "loss": 0.1862, + "loss": 0.187, "step": 3255 }, { "epoch": 0.4454172366621067, - "grad_norm": 1.470085749627292, + "grad_norm": 1.4426454641077822, "learning_rate": 5.853188419133092e-06, - "loss": 0.2201, + "loss": 0.2223, "step": 3256 }, { "epoch": 0.44555403556771545, - "grad_norm": 1.363400329511432, + "grad_norm": 1.3137300139039347, "learning_rate": 5.851071023333872e-06, - "loss": 0.188, + "loss": 0.1852, "step": 3257 }, { "epoch": 0.4456908344733242, - "grad_norm": 1.3133518763010368, + "grad_norm": 1.2915749699238106, "learning_rate": 5.848953470342525e-06, - "loss": 0.1888, + "loss": 0.1862, "step": 3258 }, { "epoch": 0.44582763337893294, - "grad_norm": 1.2586732815228905, + "grad_norm": 1.2621620777002422, "learning_rate": 5.846835760550162e-06, - "loss": 0.1775, + "loss": 0.1791, "step": 3259 }, { "epoch": 0.44596443228454175, - "grad_norm": 1.168695174782638, + "grad_norm": 1.1482325337540127, "learning_rate": 5.844717894347922e-06, - "loss": 0.1731, + "loss": 0.1716, "step": 3260 }, { "epoch": 0.4461012311901505, - "grad_norm": 1.1615614455187981, + "grad_norm": 1.151408244240865, "learning_rate": 5.842599872126974e-06, - "loss": 0.1566, + "loss": 0.1547, "step": 3261 }, { "epoch": 0.44623803009575924, - "grad_norm": 1.4656632965556553, + "grad_norm": 1.4459200665380307, "learning_rate": 5.840481694278514e-06, - "loss": 0.2255, + "loss": 0.2247, "step": 3262 }, { "epoch": 0.446374829001368, - "grad_norm": 1.4710761659323819, + "grad_norm": 1.4590418018104045, "learning_rate": 5.8383633611937675e-06, - "loss": 0.2234, + "loss": 0.2261, "step": 3263 }, { "epoch": 0.44651162790697674, - "grad_norm": 1.1647634403496336, + "grad_norm": 1.164949216635791, "learning_rate": 5.836244873263989e-06, - "loss": 0.1874, + "loss": 0.1859, "step": 3264 }, { "epoch": 0.4466484268125855, - "grad_norm": 1.3397058750045405, + "grad_norm": 1.3516900505680425, "learning_rate": 5.834126230880464e-06, - "loss": 0.2011, + "loss": 0.2051, "step": 3265 }, { "epoch": 0.44678522571819423, - "grad_norm": 1.254512795780111, + "grad_norm": 1.2338742154095514, "learning_rate": 5.8320074344345e-06, - "loss": 0.1601, + "loss": 0.1592, "step": 3266 }, { "epoch": 0.446922024623803, - "grad_norm": 1.2277014747721422, + "grad_norm": 1.2048129934169913, "learning_rate": 5.829888484317441e-06, - "loss": 0.1796, + "loss": 0.1775, "step": 3267 }, { "epoch": 0.4470588235294118, - "grad_norm": 1.031048492404429, + "grad_norm": 1.0141628666581477, "learning_rate": 5.82776938092065e-06, - "loss": 0.1902, + "loss": 0.1897, "step": 3268 }, { "epoch": 0.44719562243502053, - "grad_norm": 1.17106814541927, + "grad_norm": 1.156267236945724, "learning_rate": 5.825650124635529e-06, - "loss": 0.1694, + "loss": 0.1682, "step": 3269 }, { "epoch": 0.4473324213406293, - "grad_norm": 1.0729482438646785, + "grad_norm": 1.0398050166800283, "learning_rate": 5.8235307158535e-06, - "loss": 0.1644, + "loss": 0.1645, "step": 3270 }, { "epoch": 0.44746922024623803, - "grad_norm": 1.1999179978364491, + "grad_norm": 1.1745386148409918, "learning_rate": 5.821411154966018e-06, - "loss": 0.1874, + "loss": 0.1869, "step": 3271 }, { "epoch": 0.4476060191518468, - "grad_norm": 1.2985940151486821, + "grad_norm": 1.2239355435582013, "learning_rate": 5.819291442364563e-06, - "loss": 0.1983, + "loss": 0.2013, "step": 3272 }, { "epoch": 0.4477428180574555, - "grad_norm": 1.3398768178955003, + "grad_norm": 1.3530523157977992, "learning_rate": 5.817171578440644e-06, - "loss": 0.1859, + "loss": 0.1865, "step": 3273 }, { "epoch": 0.4478796169630643, - "grad_norm": 1.1233858593358983, + "grad_norm": 1.1142286673306827, "learning_rate": 5.8150515635858e-06, - "loss": 0.1847, + "loss": 0.1846, "step": 3274 }, { "epoch": 0.448016415868673, - "grad_norm": 1.0609845070369173, + "grad_norm": 1.0463888931474088, "learning_rate": 5.812931398191593e-06, - "loss": 0.1615, + "loss": 0.1608, "step": 3275 }, { "epoch": 0.4481532147742818, - "grad_norm": 1.3957670667893753, + "grad_norm": 1.3747624301089172, "learning_rate": 5.810811082649616e-06, - "loss": 0.2346, + "loss": 0.2329, "step": 3276 }, { "epoch": 0.4482900136798906, - "grad_norm": 1.238188109393639, + "grad_norm": 1.229464473289377, "learning_rate": 5.808690617351493e-06, - "loss": 0.1941, + "loss": 0.1957, "step": 3277 }, { "epoch": 0.4484268125854993, - "grad_norm": 1.3985732786377993, + "grad_norm": 1.3797524275461812, "learning_rate": 5.80657000268887e-06, - "loss": 0.219, + "loss": 0.2191, "step": 3278 }, { "epoch": 0.44856361149110807, - "grad_norm": 1.0405025878852399, + "grad_norm": 1.0292908751237693, "learning_rate": 5.804449239053421e-06, - "loss": 0.1807, + "loss": 0.183, "step": 3279 }, { "epoch": 0.4487004103967168, - "grad_norm": 1.20033505218364, + "grad_norm": 1.1914130718749383, "learning_rate": 5.802328326836853e-06, - "loss": 0.2076, + "loss": 0.2094, "step": 3280 }, { "epoch": 0.44883720930232557, - "grad_norm": 1.399250619788059, + "grad_norm": 1.4067163816433017, "learning_rate": 5.800207266430895e-06, - "loss": 0.2352, + "loss": 0.2374, "step": 3281 }, { "epoch": 0.4489740082079343, - "grad_norm": 1.1746933535402508, + "grad_norm": 1.1622189154004974, "learning_rate": 5.798086058227304e-06, - "loss": 0.1651, + "loss": 0.1659, "step": 3282 }, { "epoch": 0.44911080711354306, - "grad_norm": 1.3497512511543464, + "grad_norm": 1.320806805371206, "learning_rate": 5.795964702617869e-06, - "loss": 0.1916, + "loss": 0.1901, "step": 3283 }, { "epoch": 0.44924760601915187, - "grad_norm": 1.2803362632266897, + "grad_norm": 1.2520978978706598, "learning_rate": 5.793843199994396e-06, - "loss": 0.2148, + "loss": 0.2131, "step": 3284 }, { "epoch": 0.4493844049247606, - "grad_norm": 1.192399339623203, + "grad_norm": 1.1733621159866665, "learning_rate": 5.791721550748733e-06, - "loss": 0.1943, + "loss": 0.1939, "step": 3285 }, { "epoch": 0.44952120383036936, - "grad_norm": 1.2475212163282603, + "grad_norm": 1.2399481516798003, "learning_rate": 5.78959975527274e-06, - "loss": 0.1924, + "loss": 0.1926, "step": 3286 }, { "epoch": 0.4496580027359781, - "grad_norm": 1.3599701206591035, + "grad_norm": 1.3179060715242445, "learning_rate": 5.787477813958315e-06, - "loss": 0.1865, + "loss": 0.1837, "step": 3287 }, { "epoch": 0.44979480164158686, - "grad_norm": 1.1362666411877886, + "grad_norm": 1.1295428374407395, "learning_rate": 5.7853557271973796e-06, - "loss": 0.1695, + "loss": 0.168, "step": 3288 }, { "epoch": 0.4499316005471956, - "grad_norm": 1.1650138344633034, + "grad_norm": 1.166427822182913, "learning_rate": 5.783233495381877e-06, - "loss": 0.187, + "loss": 0.1881, "step": 3289 }, { "epoch": 0.45006839945280436, - "grad_norm": 1.3540125404009509, + "grad_norm": 1.3543704388797069, "learning_rate": 5.781111118903785e-06, - "loss": 0.186, + "loss": 0.1862, "step": 3290 }, { "epoch": 0.4502051983584131, - "grad_norm": 1.2459661631802998, + "grad_norm": 1.235137498368794, "learning_rate": 5.778988598155104e-06, - "loss": 0.2129, + "loss": 0.2134, "step": 3291 }, { "epoch": 0.4503419972640219, - "grad_norm": 1.3110768140928764, + "grad_norm": 1.3032793482865013, "learning_rate": 5.776865933527862e-06, - "loss": 0.153, + "loss": 0.1528, "step": 3292 }, { "epoch": 0.45047879616963066, - "grad_norm": 1.3541858568492187, + "grad_norm": 1.3274225405338198, "learning_rate": 5.774743125414113e-06, - "loss": 0.2203, + "loss": 0.2215, "step": 3293 }, { "epoch": 0.4506155950752394, - "grad_norm": 1.534949838329102, + "grad_norm": 1.4953521826138299, "learning_rate": 5.772620174205938e-06, - "loss": 0.1975, + "loss": 0.1946, "step": 3294 }, { "epoch": 0.45075239398084815, - "grad_norm": 1.5136039508440617, + "grad_norm": 1.4877490713203687, "learning_rate": 5.7704970802954456e-06, - "loss": 0.218, + "loss": 0.2173, "step": 3295 }, { "epoch": 0.4508891928864569, - "grad_norm": 1.3635515365257893, + "grad_norm": 1.3503125774789886, "learning_rate": 5.7683738440747685e-06, - "loss": 0.2245, + "loss": 0.2232, "step": 3296 }, { "epoch": 0.45102599179206565, - "grad_norm": 1.2214284103241924, + "grad_norm": 1.2193591755447104, "learning_rate": 5.766250465936067e-06, - "loss": 0.1797, + "loss": 0.1813, "step": 3297 }, { "epoch": 0.4511627906976744, - "grad_norm": 1.201662039067089, + "grad_norm": 1.1856398225789944, "learning_rate": 5.764126946271526e-06, - "loss": 0.1503, + "loss": 0.1491, "step": 3298 }, { "epoch": 0.45129958960328315, - "grad_norm": 1.3358144862983545, + "grad_norm": 1.3294994084142644, "learning_rate": 5.7620032854733584e-06, - "loss": 0.2026, + "loss": 0.2028, "step": 3299 }, { "epoch": 0.45143638850889195, - "grad_norm": 1.222118932050591, + "grad_norm": 1.2003977106445571, "learning_rate": 5.759879483933804e-06, - "loss": 0.1676, + "loss": 0.1663, "step": 3300 }, { "epoch": 0.45143638850889195, - "eval_loss": 0.1882585883140564, - "eval_runtime": 5.926, - "eval_samples_per_second": 5.062, - "eval_steps_per_second": 1.35, + "eval_loss": 0.1882481575012207, + "eval_runtime": 5.9195, + "eval_samples_per_second": 5.068, + "eval_steps_per_second": 1.351, "step": 3300 }, { "epoch": 0.4515731874145007, - "grad_norm": 1.2205041452169196, + "grad_norm": 1.217626490507999, "learning_rate": 5.7577555420451266e-06, - "loss": 0.1871, + "loss": 0.1882, "step": 3301 }, { "epoch": 0.45170998632010945, - "grad_norm": 1.329632720724315, + "grad_norm": 1.334179163360328, "learning_rate": 5.755631460199617e-06, - "loss": 0.2286, + "loss": 0.2298, "step": 3302 }, { "epoch": 0.4518467852257182, - "grad_norm": 1.0802655855986343, + "grad_norm": 1.0892318420297544, "learning_rate": 5.75350723878959e-06, - "loss": 0.1732, + "loss": 0.174, "step": 3303 }, { "epoch": 0.45198358413132694, - "grad_norm": 1.3641136020660805, + "grad_norm": 1.344228575302156, "learning_rate": 5.751382878207388e-06, - "loss": 0.1808, + "loss": 0.1793, "step": 3304 }, { "epoch": 0.4521203830369357, - "grad_norm": 1.1636920474658705, + "grad_norm": 1.150647018790255, "learning_rate": 5.749258378845379e-06, - "loss": 0.1756, + "loss": 0.1757, "step": 3305 }, { "epoch": 0.45225718194254444, - "grad_norm": 1.1862843222684436, + "grad_norm": 1.1777710381943263, "learning_rate": 5.747133741095956e-06, - "loss": 0.186, + "loss": 0.1843, "step": 3306 }, { "epoch": 0.4523939808481532, - "grad_norm": 1.187623397720482, + "grad_norm": 1.1607745832400933, "learning_rate": 5.745008965351539e-06, - "loss": 0.1847, + "loss": 0.1842, "step": 3307 }, { "epoch": 0.452530779753762, - "grad_norm": 1.236099376587365, + "grad_norm": 1.2185990617824933, "learning_rate": 5.7428840520045704e-06, - "loss": 0.2163, + "loss": 0.2175, "step": 3308 }, { "epoch": 0.45266757865937074, - "grad_norm": 1.39973765325131, + "grad_norm": 1.335964859119346, "learning_rate": 5.740759001447522e-06, - "loss": 0.203, + "loss": 0.1988, "step": 3309 }, { "epoch": 0.4528043775649795, - "grad_norm": 1.4760628633851698, + "grad_norm": 1.4695836799875277, "learning_rate": 5.7386338140728885e-06, - "loss": 0.223, + "loss": 0.2273, "step": 3310 }, { "epoch": 0.45294117647058824, - "grad_norm": 1.077110594621712, + "grad_norm": 1.067136294201661, "learning_rate": 5.736508490273189e-06, - "loss": 0.1812, + "loss": 0.1815, "step": 3311 }, { "epoch": 0.453077975376197, - "grad_norm": 1.1884985513814823, + "grad_norm": 1.1798983181651996, "learning_rate": 5.73438303044097e-06, "loss": 0.2286, "step": 3312 }, { "epoch": 0.45321477428180573, - "grad_norm": 1.5213320179971312, + "grad_norm": 1.5054800307528804, "learning_rate": 5.732257434968801e-06, - "loss": 0.233, + "loss": 0.2342, "step": 3313 }, { "epoch": 0.4533515731874145, - "grad_norm": 1.2594242659300987, + "grad_norm": 1.2279487323272855, "learning_rate": 5.730131704249279e-06, - "loss": 0.1896, + "loss": 0.1876, "step": 3314 }, { "epoch": 0.45348837209302323, - "grad_norm": 1.3206427534075111, + "grad_norm": 1.2964992004189995, "learning_rate": 5.728005838675026e-06, - "loss": 0.1916, + "loss": 0.19, "step": 3315 }, { "epoch": 0.45362517099863203, - "grad_norm": 1.5473003110455672, + "grad_norm": 1.520752959750914, "learning_rate": 5.725879838638684e-06, - "loss": 0.2383, + "loss": 0.2369, "step": 3316 }, { "epoch": 0.4537619699042408, - "grad_norm": 1.079182966329896, + "grad_norm": 1.084104347372538, "learning_rate": 5.723753704532926e-06, - "loss": 0.1649, + "loss": 0.1651, "step": 3317 }, { "epoch": 0.45389876880984953, - "grad_norm": 1.2176785705633257, + "grad_norm": 1.2012947840556167, "learning_rate": 5.721627436750449e-06, - "loss": 0.2179, + "loss": 0.2185, "step": 3318 }, { "epoch": 0.4540355677154583, - "grad_norm": 1.4001060235228067, + "grad_norm": 1.3956149413672845, "learning_rate": 5.7195010356839695e-06, - "loss": 0.2192, + "loss": 0.2169, "step": 3319 }, { "epoch": 0.454172366621067, - "grad_norm": 1.2522490587451796, + "grad_norm": 1.248180678259734, "learning_rate": 5.717374501726234e-06, - "loss": 0.1851, + "loss": 0.1859, "step": 3320 }, { "epoch": 0.4543091655266758, - "grad_norm": 1.252812772803981, + "grad_norm": 1.2433646338595412, "learning_rate": 5.7152478352700095e-06, - "loss": 0.1462, + "loss": 0.1474, "step": 3321 }, { "epoch": 0.4544459644322845, - "grad_norm": 1.4751451243289353, + "grad_norm": 1.4286110126818699, "learning_rate": 5.713121036708092e-06, - "loss": 0.2235, + "loss": 0.2216, "step": 3322 }, { "epoch": 0.45458276333789327, - "grad_norm": 1.0072870348934109, + "grad_norm": 1.0051953043818624, "learning_rate": 5.7109941064332985e-06, - "loss": 0.1528, + "loss": 0.1554, "step": 3323 }, { "epoch": 0.4547195622435021, - "grad_norm": 1.076398606977897, + "grad_norm": 1.0660025554049506, "learning_rate": 5.708867044838469e-06, - "loss": 0.1541, + "loss": 0.1526, "step": 3324 }, { "epoch": 0.4548563611491108, - "grad_norm": 1.3039897402643077, + "grad_norm": 1.2846063306074944, "learning_rate": 5.706739852316475e-06, - "loss": 0.1869, + "loss": 0.187, "step": 3325 }, { "epoch": 0.45499316005471957, - "grad_norm": 1.1479034459992552, + "grad_norm": 1.1290031491687416, "learning_rate": 5.704612529260205e-06, - "loss": 0.1837, + "loss": 0.1842, "step": 3326 }, { "epoch": 0.4551299589603283, - "grad_norm": 1.2647415076618063, + "grad_norm": 1.249539455219566, "learning_rate": 5.702485076062572e-06, - "loss": 0.2018, + "loss": 0.2012, "step": 3327 }, { "epoch": 0.45526675786593707, - "grad_norm": 1.2330280807829093, + "grad_norm": 1.2091256154743257, "learning_rate": 5.700357493116517e-06, - "loss": 0.1997, + "loss": 0.2007, "step": 3328 }, { "epoch": 0.4554035567715458, - "grad_norm": 1.348861835933869, + "grad_norm": 1.2965507572078434, "learning_rate": 5.6982297808150015e-06, - "loss": 0.1974, + "loss": 0.1966, "step": 3329 }, { "epoch": 0.45554035567715456, - "grad_norm": 0.9633619512685094, + "grad_norm": 0.9731543806367237, "learning_rate": 5.6961019395510135e-06, - "loss": 0.1468, + "loss": 0.1486, "step": 3330 }, { "epoch": 0.4556771545827633, - "grad_norm": 1.2759857559222791, + "grad_norm": 1.2517109078423398, "learning_rate": 5.693973969717562e-06, - "loss": 0.1775, + "loss": 0.1758, "step": 3331 }, { "epoch": 0.4558139534883721, - "grad_norm": 1.1654437449246984, + "grad_norm": 1.1725315574590167, "learning_rate": 5.691845871707682e-06, - "loss": 0.1813, + "loss": 0.1832, "step": 3332 }, { "epoch": 0.45595075239398086, - "grad_norm": 1.1615986103599667, + "grad_norm": 1.1316236689872272, "learning_rate": 5.6897176459144325e-06, - "loss": 0.2044, + "loss": 0.2052, "step": 3333 }, { "epoch": 0.4560875512995896, - "grad_norm": 1.3764935568116863, + "grad_norm": 1.3736286164559657, "learning_rate": 5.687589292730894e-06, - "loss": 0.208, + "loss": 0.2084, "step": 3334 }, { "epoch": 0.45622435020519836, - "grad_norm": 1.4810335374590327, + "grad_norm": 1.4914830185647907, "learning_rate": 5.685460812550172e-06, - "loss": 0.2204, + "loss": 0.2243, "step": 3335 }, { "epoch": 0.4563611491108071, - "grad_norm": 1.2743319719277602, + "grad_norm": 1.257427157190485, "learning_rate": 5.683332205765395e-06, - "loss": 0.1937, + "loss": 0.1965, "step": 3336 }, { "epoch": 0.45649794801641586, - "grad_norm": 1.5058953260357577, + "grad_norm": 1.494647900906281, "learning_rate": 5.681203472769714e-06, - "loss": 0.258, + "loss": 0.2578, "step": 3337 }, { "epoch": 0.4566347469220246, - "grad_norm": 1.403783839364117, + "grad_norm": 1.3896952138057352, "learning_rate": 5.679074613956307e-06, - "loss": 0.2043, + "loss": 0.2044, "step": 3338 }, { "epoch": 0.45677154582763335, - "grad_norm": 1.176603287506038, + "grad_norm": 1.1683266935624395, "learning_rate": 5.676945629718369e-06, - "loss": 0.2019, + "loss": 0.1987, "step": 3339 }, { "epoch": 0.45690834473324216, - "grad_norm": 1.0610100454176525, + "grad_norm": 1.0614516465599386, "learning_rate": 5.674816520449123e-06, "loss": 0.1672, "step": 3340 }, { "epoch": 0.4570451436388509, - "grad_norm": 1.2535304751488863, + "grad_norm": 1.2495578332035218, "learning_rate": 5.6726872865418145e-06, "loss": 0.2072, "step": 3341 }, { "epoch": 0.45718194254445965, - "grad_norm": 1.568895885448379, + "grad_norm": 1.5562372990125002, "learning_rate": 5.670557928389712e-06, - "loss": 0.2434, + "loss": 0.2449, "step": 3342 }, { "epoch": 0.4573187414500684, - "grad_norm": 1.175155090130706, + "grad_norm": 1.1510789628666367, "learning_rate": 5.668428446386104e-06, - "loss": 0.1866, + "loss": 0.1853, "step": 3343 }, { "epoch": 0.45745554035567715, - "grad_norm": 1.0678139680588519, + "grad_norm": 1.0650856219093583, "learning_rate": 5.666298840924305e-06, - "loss": 0.176, + "loss": 0.1751, "step": 3344 }, { "epoch": 0.4575923392612859, - "grad_norm": 1.413082604203905, + "grad_norm": 1.3744315537637866, "learning_rate": 5.664169112397652e-06, - "loss": 0.2134, + "loss": 0.2154, "step": 3345 }, { "epoch": 0.45772913816689464, - "grad_norm": 1.1967014291762075, + "grad_norm": 1.165599179891146, "learning_rate": 5.662039261199503e-06, - "loss": 0.1785, + "loss": 0.1781, "step": 3346 }, { "epoch": 0.4578659370725034, - "grad_norm": 1.3360201363239623, + "grad_norm": 1.3321692491402297, "learning_rate": 5.659909287723241e-06, - "loss": 0.1756, + "loss": 0.1773, "step": 3347 }, { "epoch": 0.4580027359781122, - "grad_norm": 1.171626970287134, + "grad_norm": 1.1822833022315618, "learning_rate": 5.6577791923622686e-06, - "loss": 0.1929, + "loss": 0.194, "step": 3348 }, { "epoch": 0.45813953488372094, - "grad_norm": 1.5267414643982016, + "grad_norm": 1.5021249860304822, "learning_rate": 5.655648975510014e-06, - "loss": 0.247, + "loss": 0.2505, "step": 3349 }, { "epoch": 0.4582763337893297, - "grad_norm": 1.434395680105078, + "grad_norm": 1.722174949028342, "learning_rate": 5.653518637559927e-06, - "loss": 0.1825, + "loss": 0.1815, "step": 3350 }, { "epoch": 0.45841313269493844, - "grad_norm": 1.085826640226157, + "grad_norm": 1.0743682544232638, "learning_rate": 5.651388178905479e-06, - "loss": 0.163, + "loss": 0.1626, "step": 3351 }, { "epoch": 0.4585499316005472, - "grad_norm": 1.2308621750257398, + "grad_norm": 1.21753847025959, "learning_rate": 5.649257599940164e-06, - "loss": 0.1951, + "loss": 0.1959, "step": 3352 }, { "epoch": 0.45868673050615594, - "grad_norm": 1.2762981765794368, + "grad_norm": 1.269211705108949, "learning_rate": 5.647126901057497e-06, - "loss": 0.222, + "loss": 0.2229, "step": 3353 }, { "epoch": 0.4588235294117647, - "grad_norm": 1.0884080244415573, + "grad_norm": 1.0716170681125372, "learning_rate": 5.644996082651018e-06, - "loss": 0.1672, + "loss": 0.1673, "step": 3354 }, { "epoch": 0.45896032831737343, - "grad_norm": 1.3202538682146574, + "grad_norm": 1.2976580937414879, "learning_rate": 5.642865145114286e-06, - "loss": 0.2223, + "loss": 0.2206, "step": 3355 }, { "epoch": 0.45909712722298224, - "grad_norm": 1.559375331077204, + "grad_norm": 1.553644580434411, "learning_rate": 5.6407340888408835e-06, - "loss": 0.2013, + "loss": 0.2019, "step": 3356 }, { "epoch": 0.459233926128591, - "grad_norm": 1.5137424743088275, + "grad_norm": 1.5033825091520052, "learning_rate": 5.638602914224416e-06, - "loss": 0.2266, + "loss": 0.229, "step": 3357 }, { "epoch": 0.45937072503419973, - "grad_norm": 1.2049657535718559, + "grad_norm": 1.2245709545120955, "learning_rate": 5.6364716216585084e-06, - "loss": 0.1678, + "loss": 0.1689, "step": 3358 }, { "epoch": 0.4595075239398085, - "grad_norm": 1.1534321555586633, + "grad_norm": 1.148432111969797, "learning_rate": 5.63434021153681e-06, - "loss": 0.1993, + "loss": 0.2002, "step": 3359 }, { "epoch": 0.45964432284541723, - "grad_norm": 1.2759552096820324, + "grad_norm": 1.2596650567988588, "learning_rate": 5.632208684252991e-06, - "loss": 0.1885, + "loss": 0.1871, "step": 3360 }, { "epoch": 0.459781121751026, - "grad_norm": 1.1791524616451023, + "grad_norm": 1.1644190878764509, "learning_rate": 5.630077040200741e-06, - "loss": 0.1672, + "loss": 0.1665, "step": 3361 }, { "epoch": 0.4599179206566347, - "grad_norm": 1.362485878365452, + "grad_norm": 1.329747721674827, "learning_rate": 5.627945279773774e-06, - "loss": 0.191, + "loss": 0.1897, "step": 3362 }, { "epoch": 0.4600547195622435, - "grad_norm": 1.2338814576239827, + "grad_norm": 1.214627200329746, "learning_rate": 5.625813403365825e-06, - "loss": 0.1673, + "loss": 0.1666, "step": 3363 }, { "epoch": 0.4601915184678523, - "grad_norm": 1.3238849845676266, + "grad_norm": 1.3083159256025954, "learning_rate": 5.623681411370646e-06, - "loss": 0.185, + "loss": 0.1864, "step": 3364 }, { "epoch": 0.460328317373461, - "grad_norm": 1.1003920046808153, + "grad_norm": 1.0809949935952632, "learning_rate": 5.621549304182019e-06, - "loss": 0.1927, + "loss": 0.1916, "step": 3365 }, { "epoch": 0.4604651162790698, - "grad_norm": 1.4899993713292734, + "grad_norm": 1.4615453480146958, "learning_rate": 5.61941708219374e-06, - "loss": 0.2393, + "loss": 0.2391, "step": 3366 }, { "epoch": 0.4606019151846785, - "grad_norm": 1.417443176347295, + "grad_norm": 1.4184824544457224, "learning_rate": 5.617284745799629e-06, - "loss": 0.2326, + "loss": 0.2327, "step": 3367 }, { "epoch": 0.46073871409028727, - "grad_norm": 1.3161691226943923, + "grad_norm": 1.310163121653236, "learning_rate": 5.615152295393529e-06, - "loss": 0.1906, + "loss": 0.1913, "step": 3368 }, { "epoch": 0.460875512995896, - "grad_norm": 1.2800094971400255, + "grad_norm": 1.2340053267028193, "learning_rate": 5.613019731369299e-06, - "loss": 0.1764, + "loss": 0.1754, "step": 3369 }, { "epoch": 0.46101231190150477, - "grad_norm": 1.4313862900905459, + "grad_norm": 1.3954248281971304, "learning_rate": 5.610887054120823e-06, - "loss": 0.2057, + "loss": 0.2004, "step": 3370 }, { "epoch": 0.4611491108071135, - "grad_norm": 1.3742601074597718, + "grad_norm": 1.374876067892269, "learning_rate": 5.608754264042003e-06, - "loss": 0.1944, + "loss": 0.1948, "step": 3371 }, { "epoch": 0.4612859097127223, - "grad_norm": 1.1637447763335083, + "grad_norm": 1.1433814027457703, "learning_rate": 5.606621361526765e-06, - "loss": 0.1739, + "loss": 0.1722, "step": 3372 }, { "epoch": 0.46142270861833107, - "grad_norm": 0.8443616002112142, + "grad_norm": 0.9366114998907394, "learning_rate": 5.604488346969055e-06, - "loss": 0.1432, + "loss": 0.1447, "step": 3373 }, { "epoch": 0.4615595075239398, - "grad_norm": 1.23628798677863, + "grad_norm": 1.2100886584892157, "learning_rate": 5.602355220762838e-06, - "loss": 0.1578, + "loss": 0.1571, "step": 3374 }, { "epoch": 0.46169630642954856, - "grad_norm": 0.9415898769873317, + "grad_norm": 0.9231829507537271, "learning_rate": 5.600221983302104e-06, - "loss": 0.146, + "loss": 0.1455, "step": 3375 }, { "epoch": 0.4618331053351573, - "grad_norm": 1.129028931004288, + "grad_norm": 1.114323097200365, "learning_rate": 5.598088634980855e-06, - "loss": 0.1859, + "loss": 0.1852, "step": 3376 }, { "epoch": 0.46196990424076606, - "grad_norm": 1.0564542180979877, + "grad_norm": 1.0379309308017788, "learning_rate": 5.595955176193122e-06, - "loss": 0.1608, + "loss": 0.1631, "step": 3377 }, { "epoch": 0.4621067031463748, - "grad_norm": 1.2135970221222188, + "grad_norm": 1.1993841456160164, "learning_rate": 5.593821607332952e-06, - "loss": 0.1826, + "loss": 0.1836, "step": 3378 }, { "epoch": 0.46224350205198356, - "grad_norm": 1.0644464548932704, + "grad_norm": 1.058680856657342, "learning_rate": 5.591687928794414e-06, - "loss": 0.1695, + "loss": 0.1676, "step": 3379 }, { "epoch": 0.46238030095759236, - "grad_norm": 0.9349554440917975, + "grad_norm": 0.9203798487284831, "learning_rate": 5.589554140971595e-06, - "loss": 0.1736, + "loss": 0.1731, "step": 3380 }, { "epoch": 0.4625170998632011, - "grad_norm": 1.3030689968733724, + "grad_norm": 1.289893974837853, "learning_rate": 5.5874202442586045e-06, - "loss": 0.201, + "loss": 0.2024, "step": 3381 }, { "epoch": 0.46265389876880986, - "grad_norm": 1.299251641364658, + "grad_norm": 1.2737037912140985, "learning_rate": 5.585286239049575e-06, - "loss": 0.1875, + "loss": 0.1847, "step": 3382 }, { "epoch": 0.4627906976744186, - "grad_norm": 1.4913191622429935, + "grad_norm": 1.4761215795222256, "learning_rate": 5.583152125738651e-06, - "loss": 0.2396, + "loss": 0.2392, "step": 3383 }, { "epoch": 0.46292749658002735, - "grad_norm": 1.4133557946288207, + "grad_norm": 1.3999343364525938, "learning_rate": 5.581017904720003e-06, - "loss": 0.1764, + "loss": 0.1757, "step": 3384 }, { "epoch": 0.4630642954856361, - "grad_norm": 1.0432173418013564, + "grad_norm": 1.0844262582920936, "learning_rate": 5.5788835763878214e-06, - "loss": 0.1673, + "loss": 0.1717, "step": 3385 }, { "epoch": 0.46320109439124485, - "grad_norm": 1.270858373264158, + "grad_norm": 1.2575712963667283, "learning_rate": 5.576749141136313e-06, - "loss": 0.2235, + "loss": 0.2217, "step": 3386 }, { "epoch": 0.4633378932968536, - "grad_norm": 1.4041606752104363, + "grad_norm": 1.3986809512993883, "learning_rate": 5.574614599359707e-06, - "loss": 0.2138, + "loss": 0.2133, "step": 3387 }, { "epoch": 0.4634746922024624, - "grad_norm": 1.2629902850247705, + "grad_norm": 1.2571863400422738, "learning_rate": 5.57247995145225e-06, - "loss": 0.172, + "loss": 0.173, "step": 3388 }, { "epoch": 0.46361149110807115, - "grad_norm": 1.2886588628232867, + "grad_norm": 1.2727444811014883, "learning_rate": 5.570345197808212e-06, - "loss": 0.1746, + "loss": 0.1734, "step": 3389 }, { "epoch": 0.4637482900136799, - "grad_norm": 1.217982855270148, + "grad_norm": 1.230615366022887, "learning_rate": 5.5682103388218815e-06, - "loss": 0.1708, + "loss": 0.1719, "step": 3390 }, { "epoch": 0.46388508891928865, - "grad_norm": 1.256350081982684, + "grad_norm": 1.219494764201112, "learning_rate": 5.566075374887561e-06, - "loss": 0.2218, + "loss": 0.2217, "step": 3391 }, { "epoch": 0.4640218878248974, - "grad_norm": 1.1863114701446547, + "grad_norm": 1.1486360763004195, "learning_rate": 5.563940306399579e-06, - "loss": 0.184, + "loss": 0.1824, "step": 3392 }, { "epoch": 0.46415868673050614, - "grad_norm": 1.2480415356460266, + "grad_norm": 1.2530381048692107, "learning_rate": 5.561805133752282e-06, - "loss": 0.1736, + "loss": 0.1763, "step": 3393 }, { "epoch": 0.4642954856361149, - "grad_norm": 1.3120154921752216, + "grad_norm": 1.2861796554513647, "learning_rate": 5.559669857340031e-06, - "loss": 0.1957, + "loss": 0.1962, "step": 3394 }, { "epoch": 0.46443228454172364, - "grad_norm": 1.4982033504437375, + "grad_norm": 1.5457712910641155, "learning_rate": 5.557534477557214e-06, - "loss": 0.2271, + "loss": 0.224, "step": 3395 }, { "epoch": 0.46456908344733244, - "grad_norm": 1.3174481495242072, + "grad_norm": 1.2953734019051333, "learning_rate": 5.55539899479823e-06, - "loss": 0.1812, + "loss": 0.1807, "step": 3396 }, { "epoch": 0.4647058823529412, - "grad_norm": 1.2158404722572134, + "grad_norm": 1.2178434964249605, "learning_rate": 5.553263409457504e-06, - "loss": 0.1888, + "loss": 0.189, "step": 3397 }, { "epoch": 0.46484268125854994, - "grad_norm": 1.3381630870921868, + "grad_norm": 1.2874373070974996, "learning_rate": 5.5511277219294765e-06, - "loss": 0.1883, + "loss": 0.1884, "step": 3398 }, { "epoch": 0.4649794801641587, - "grad_norm": 1.124404872813765, + "grad_norm": 1.118169464695528, "learning_rate": 5.548991932608607e-06, - "loss": 0.1849, + "loss": 0.1871, "step": 3399 }, { "epoch": 0.46511627906976744, - "grad_norm": 1.3302818688964002, + "grad_norm": 1.3186128810224198, "learning_rate": 5.546856041889374e-06, - "loss": 0.2089, + "loss": 0.2099, "step": 3400 }, { "epoch": 0.46511627906976744, - "eval_loss": 0.19087722897529602, - "eval_runtime": 5.926, - "eval_samples_per_second": 5.062, - "eval_steps_per_second": 1.35, + "eval_loss": 0.1903015673160553, + "eval_runtime": 5.9144, + "eval_samples_per_second": 5.072, + "eval_steps_per_second": 1.353, "step": 3400 }, { "epoch": 0.4652530779753762, - "grad_norm": 1.302867522349318, + "grad_norm": 1.292492793655125, "learning_rate": 5.544720050166273e-06, - "loss": 0.1939, + "loss": 0.1935, "step": 3401 }, { "epoch": 0.46538987688098493, - "grad_norm": 1.3674220993542303, + "grad_norm": 1.331181966078494, "learning_rate": 5.54258395783382e-06, - "loss": 0.1978, + "loss": 0.1954, "step": 3402 }, { "epoch": 0.4655266757865937, - "grad_norm": 1.4277747533161615, + "grad_norm": 1.4245331340627383, "learning_rate": 5.540447765286553e-06, - "loss": 0.2021, + "loss": 0.2017, "step": 3403 }, { "epoch": 0.4656634746922025, - "grad_norm": 1.4783769879516324, + "grad_norm": 1.4576119967828933, "learning_rate": 5.538311472919022e-06, - "loss": 0.2159, + "loss": 0.2143, "step": 3404 }, { "epoch": 0.46580027359781123, - "grad_norm": 1.421967059640124, + "grad_norm": 1.4262914695704938, "learning_rate": 5.5361750811258e-06, - "loss": 0.2366, + "loss": 0.238, "step": 3405 }, { "epoch": 0.46593707250342, - "grad_norm": 1.2515534487121491, + "grad_norm": 1.2288206708874059, "learning_rate": 5.534038590301476e-06, - "loss": 0.1854, + "loss": 0.1818, "step": 3406 }, { "epoch": 0.46607387140902873, - "grad_norm": 1.27778650683951, + "grad_norm": 1.262939234494832, "learning_rate": 5.531902000840659e-06, - "loss": 0.2047, + "loss": 0.2043, "step": 3407 }, { "epoch": 0.4662106703146375, - "grad_norm": 1.3841534273520166, + "grad_norm": 1.3548662575558188, "learning_rate": 5.529765313137974e-06, - "loss": 0.2043, + "loss": 0.2031, "step": 3408 }, { "epoch": 0.4663474692202462, - "grad_norm": 1.2930476146139935, + "grad_norm": 1.278895362800587, "learning_rate": 5.5276285275880645e-06, - "loss": 0.1826, + "loss": 0.1817, "step": 3409 }, { "epoch": 0.466484268125855, - "grad_norm": 1.2002049169376037, + "grad_norm": 1.2032297540465093, "learning_rate": 5.5254916445855975e-06, - "loss": 0.1576, + "loss": 0.1577, "step": 3410 }, { "epoch": 0.4666210670314637, - "grad_norm": 1.068326634318436, + "grad_norm": 1.0595602179263763, "learning_rate": 5.5233546645252504e-06, "loss": 0.19, "step": 3411 }, { "epoch": 0.4667578659370725, - "grad_norm": 1.2011741420696285, + "grad_norm": 1.2043822337188737, "learning_rate": 5.521217587801719e-06, - "loss": 0.1974, + "loss": 0.1979, "step": 3412 }, { "epoch": 0.4668946648426813, - "grad_norm": 1.2346794139870307, + "grad_norm": 1.2169561720861817, "learning_rate": 5.519080414809725e-06, - "loss": 0.1545, + "loss": 0.1553, "step": 3413 }, { "epoch": 0.46703146374829, - "grad_norm": 1.2420467360867327, + "grad_norm": 1.216109464789311, "learning_rate": 5.516943145944002e-06, - "loss": 0.2064, + "loss": 0.2059, "step": 3414 }, { "epoch": 0.46716826265389877, - "grad_norm": 1.057197920906719, + "grad_norm": 1.0486574637595516, "learning_rate": 5.514805781599298e-06, - "loss": 0.1595, + "loss": 0.1613, "step": 3415 }, { "epoch": 0.4673050615595075, - "grad_norm": 1.2724981969554212, + "grad_norm": 1.2360385756106524, "learning_rate": 5.512668322170385e-06, - "loss": 0.1975, + "loss": 0.1961, "step": 3416 }, { "epoch": 0.46744186046511627, - "grad_norm": 1.3331730669033002, + "grad_norm": 1.315863394352558, "learning_rate": 5.510530768052047e-06, - "loss": 0.2243, + "loss": 0.2248, "step": 3417 }, { "epoch": 0.467578659370725, - "grad_norm": 1.1570408321707335, + "grad_norm": 1.1436782185950336, "learning_rate": 5.508393119639094e-06, - "loss": 0.1777, + "loss": 0.178, "step": 3418 }, { "epoch": 0.46771545827633376, - "grad_norm": 1.162968519857941, + "grad_norm": 1.193318374687194, "learning_rate": 5.5062553773263435e-06, - "loss": 0.1583, + "loss": 0.1552, "step": 3419 }, { "epoch": 0.46785225718194257, - "grad_norm": 1.3333600859560573, + "grad_norm": 1.3170376776162507, "learning_rate": 5.504117541508636e-06, - "loss": 0.1906, + "loss": 0.1921, "step": 3420 }, { "epoch": 0.4679890560875513, - "grad_norm": 1.0594001697716804, + "grad_norm": 1.0363532407591696, "learning_rate": 5.501979612580829e-06, - "loss": 0.182, + "loss": 0.1806, "step": 3421 }, { "epoch": 0.46812585499316006, - "grad_norm": 1.4141454243991316, + "grad_norm": 1.3473587206508701, "learning_rate": 5.499841590937796e-06, - "loss": 0.205, + "loss": 0.2016, "step": 3422 }, { "epoch": 0.4682626538987688, - "grad_norm": 1.261987674732445, + "grad_norm": 1.250146859953726, "learning_rate": 5.497703476974426e-06, - "loss": 0.1968, + "loss": 0.1958, "step": 3423 }, { "epoch": 0.46839945280437756, - "grad_norm": 1.3658355292936573, + "grad_norm": 1.3312072312895304, "learning_rate": 5.495565271085628e-06, - "loss": 0.1948, + "loss": 0.1924, "step": 3424 }, { "epoch": 0.4685362517099863, - "grad_norm": 1.2897552745959975, + "grad_norm": 1.2536752358193193, "learning_rate": 5.493426973666328e-06, - "loss": 0.1917, + "loss": 0.1889, "step": 3425 }, { "epoch": 0.46867305061559505, - "grad_norm": 1.1400853736096428, + "grad_norm": 1.1094329211169973, "learning_rate": 5.491288585111467e-06, "loss": 0.1843, "step": 3426 }, { "epoch": 0.4688098495212038, - "grad_norm": 1.111874090914276, + "grad_norm": 1.0828448499409697, "learning_rate": 5.489150105816003e-06, - "loss": 0.1937, + "loss": 0.1924, "step": 3427 }, { "epoch": 0.4689466484268126, - "grad_norm": 1.644443499260094, + "grad_norm": 1.6019223415762789, "learning_rate": 5.487011536174912e-06, - "loss": 0.2084, + "loss": 0.2086, "step": 3428 }, { "epoch": 0.46908344733242135, - "grad_norm": 1.1104889098106812, + "grad_norm": 1.0877333465166363, "learning_rate": 5.484872876583186e-06, - "loss": 0.1746, + "loss": 0.1739, "step": 3429 }, { "epoch": 0.4692202462380301, - "grad_norm": 1.3972547568856521, + "grad_norm": 1.4262564965326392, "learning_rate": 5.482734127435835e-06, - "loss": 0.2546, + "loss": 0.2543, "step": 3430 }, { "epoch": 0.46935704514363885, - "grad_norm": 1.3303105270530693, + "grad_norm": 1.3381813414188635, "learning_rate": 5.4805952891278815e-06, - "loss": 0.1968, + "loss": 0.1958, "step": 3431 }, { "epoch": 0.4694938440492476, - "grad_norm": 1.3926472272173522, + "grad_norm": 1.3898669291411168, "learning_rate": 5.4784563620543685e-06, - "loss": 0.2183, + "loss": 0.2216, "step": 3432 }, { "epoch": 0.46963064295485635, - "grad_norm": 1.3272435156850224, + "grad_norm": 1.2954942372243143, "learning_rate": 5.476317346610355e-06, - "loss": 0.1717, + "loss": 0.1691, "step": 3433 }, { "epoch": 0.4697674418604651, - "grad_norm": 1.3897150284428685, + "grad_norm": 1.3702223312741677, "learning_rate": 5.4741782431909144e-06, - "loss": 0.2183, + "loss": 0.2174, "step": 3434 }, { "epoch": 0.46990424076607384, - "grad_norm": 1.4643835214940917, + "grad_norm": 1.4626043811790177, "learning_rate": 5.472039052191138e-06, - "loss": 0.2151, + "loss": 0.2163, "step": 3435 }, { "epoch": 0.47004103967168265, - "grad_norm": 1.1207720023853525, + "grad_norm": 1.0970206804754454, "learning_rate": 5.469899774006131e-06, - "loss": 0.179, + "loss": 0.1787, "step": 3436 }, { "epoch": 0.4701778385772914, - "grad_norm": 1.2309764981192237, + "grad_norm": 1.2167133802864616, "learning_rate": 5.467760409031018e-06, - "loss": 0.1984, + "loss": 0.196, "step": 3437 }, { "epoch": 0.47031463748290014, - "grad_norm": 1.235279256001014, + "grad_norm": 1.2136056154638788, "learning_rate": 5.4656209576609385e-06, - "loss": 0.2034, + "loss": 0.2067, "step": 3438 }, { "epoch": 0.4704514363885089, - "grad_norm": 1.4040274691726256, + "grad_norm": 1.3902635471325995, "learning_rate": 5.463481420291045e-06, - "loss": 0.2132, + "loss": 0.2119, "step": 3439 }, { "epoch": 0.47058823529411764, - "grad_norm": 1.286009263254743, + "grad_norm": 1.241622068417493, "learning_rate": 5.46134179731651e-06, - "loss": 0.1891, + "loss": 0.1881, "step": 3440 }, { "epoch": 0.4707250341997264, - "grad_norm": 1.4122309596419538, + "grad_norm": 1.3858734472450804, "learning_rate": 5.459202089132521e-06, - "loss": 0.2509, + "loss": 0.2522, "step": 3441 }, { "epoch": 0.47086183310533514, - "grad_norm": 1.0541317603380391, + "grad_norm": 1.0460250677842344, "learning_rate": 5.457062296134279e-06, - "loss": 0.1523, + "loss": 0.1521, "step": 3442 }, { "epoch": 0.4709986320109439, - "grad_norm": 1.1307557486532511, + "grad_norm": 1.1063804307833318, "learning_rate": 5.454922418717002e-06, - "loss": 0.1938, + "loss": 0.195, "step": 3443 }, { "epoch": 0.4711354309165527, - "grad_norm": 1.1778338122158887, + "grad_norm": 1.1792811710772872, "learning_rate": 5.4527824572759234e-06, - "loss": 0.1817, + "loss": 0.1834, "step": 3444 }, { "epoch": 0.47127222982216144, - "grad_norm": 1.515826831286948, + "grad_norm": 1.512345348057205, "learning_rate": 5.450642412206294e-06, - "loss": 0.2156, + "loss": 0.213, "step": 3445 }, { "epoch": 0.4714090287277702, - "grad_norm": 1.3270929866421233, + "grad_norm": 1.2374830196675095, "learning_rate": 5.448502283903377e-06, - "loss": 0.2037, + "loss": 0.2014, "step": 3446 }, { "epoch": 0.47154582763337893, - "grad_norm": 1.1847793995867524, + "grad_norm": 1.1750322048309347, "learning_rate": 5.446362072762451e-06, - "loss": 0.165, + "loss": 0.1614, "step": 3447 }, { "epoch": 0.4716826265389877, - "grad_norm": 1.1422483082012413, + "grad_norm": 1.1371989275696783, "learning_rate": 5.444221779178815e-06, - "loss": 0.1717, + "loss": 0.1734, "step": 3448 }, { "epoch": 0.47181942544459643, - "grad_norm": 1.214032512623294, + "grad_norm": 1.1797287677609622, "learning_rate": 5.4420814035477766e-06, - "loss": 0.1759, + "loss": 0.1732, "step": 3449 }, { "epoch": 0.4719562243502052, - "grad_norm": 1.3974549716401614, + "grad_norm": 1.3946780677139734, "learning_rate": 5.4399409462646625e-06, - "loss": 0.2197, + "loss": 0.2203, "step": 3450 }, { "epoch": 0.4720930232558139, - "grad_norm": 1.3718826842782772, + "grad_norm": 1.3488361421987805, "learning_rate": 5.437800407724812e-06, - "loss": 0.2069, + "loss": 0.2049, "step": 3451 }, { "epoch": 0.47222982216142273, - "grad_norm": 1.389391309739686, + "grad_norm": 1.3810841516127408, "learning_rate": 5.435659788323583e-06, - "loss": 0.2213, + "loss": 0.2205, "step": 3452 }, { "epoch": 0.4723666210670315, - "grad_norm": 1.1798400181862443, + "grad_norm": 1.1745231973335843, "learning_rate": 5.433519088456345e-06, - "loss": 0.136, + "loss": 0.1355, "step": 3453 }, { "epoch": 0.4725034199726402, - "grad_norm": 1.185322826898655, + "grad_norm": 1.1750703223687848, "learning_rate": 5.431378308518482e-06, - "loss": 0.1944, + "loss": 0.1975, "step": 3454 }, { "epoch": 0.472640218878249, - "grad_norm": 1.4526531205409936, + "grad_norm": 1.4000670317677515, "learning_rate": 5.429237448905398e-06, - "loss": 0.2105, + "loss": 0.2092, "step": 3455 }, { "epoch": 0.4727770177838577, - "grad_norm": 1.2549028777107574, + "grad_norm": 1.2110157265129782, "learning_rate": 5.427096510012505e-06, - "loss": 0.1987, + "loss": 0.2009, "step": 3456 }, { "epoch": 0.47291381668946647, - "grad_norm": 1.3656089994423404, + "grad_norm": 1.3488861029230281, "learning_rate": 5.424955492235234e-06, - "loss": 0.1916, + "loss": 0.191, "step": 3457 }, { "epoch": 0.4730506155950752, - "grad_norm": 1.459021657431532, + "grad_norm": 1.447045110127747, "learning_rate": 5.422814395969029e-06, - "loss": 0.2058, + "loss": 0.2044, "step": 3458 }, { "epoch": 0.47318741450068397, - "grad_norm": 1.3474592807416395, + "grad_norm": 1.3249356509315633, "learning_rate": 5.420673221609349e-06, - "loss": 0.2137, + "loss": 0.2117, "step": 3459 }, { "epoch": 0.47332421340629277, - "grad_norm": 1.3203781626629318, + "grad_norm": 1.3945516359063455, "learning_rate": 5.418531969551666e-06, - "loss": 0.1981, + "loss": 0.1993, "step": 3460 }, { "epoch": 0.4734610123119015, - "grad_norm": 1.2598435601032687, + "grad_norm": 1.2522910871037176, "learning_rate": 5.416390640191468e-06, - "loss": 0.1894, + "loss": 0.1905, "step": 3461 }, { "epoch": 0.47359781121751027, - "grad_norm": 1.307526679258209, + "grad_norm": 1.3483106316785411, "learning_rate": 5.414249233924258e-06, - "loss": 0.1706, + "loss": 0.172, "step": 3462 }, { "epoch": 0.473734610123119, - "grad_norm": 1.2793647317537482, + "grad_norm": 1.3440719533815875, "learning_rate": 5.41210775114555e-06, - "loss": 0.1972, + "loss": 0.1992, "step": 3463 }, { "epoch": 0.47387140902872776, - "grad_norm": 1.4933001091777736, + "grad_norm": 1.460129437633282, "learning_rate": 5.409966192250878e-06, - "loss": 0.2006, + "loss": 0.1991, "step": 3464 }, { "epoch": 0.4740082079343365, - "grad_norm": 1.3809475587557363, + "grad_norm": 1.3610182453954545, "learning_rate": 5.4078245576357806e-06, - "loss": 0.1749, + "loss": 0.1735, "step": 3465 }, { "epoch": 0.47414500683994526, - "grad_norm": 1.3168153056889802, + "grad_norm": 1.312269316770182, "learning_rate": 5.40568284769582e-06, - "loss": 0.1676, + "loss": 0.168, "step": 3466 }, { "epoch": 0.474281805745554, - "grad_norm": 1.209783512149821, + "grad_norm": 1.2172345478955149, "learning_rate": 5.403541062826567e-06, - "loss": 0.1734, + "loss": 0.1742, "step": 3467 }, { "epoch": 0.4744186046511628, - "grad_norm": 1.2818426162651664, + "grad_norm": 1.2843311046669856, "learning_rate": 5.4013992034236065e-06, - "loss": 0.196, + "loss": 0.198, "step": 3468 }, { "epoch": 0.47455540355677156, - "grad_norm": 1.4116280164085682, + "grad_norm": 1.40644609423514, "learning_rate": 5.399257269882539e-06, - "loss": 0.2062, + "loss": 0.2056, "step": 3469 }, { "epoch": 0.4746922024623803, - "grad_norm": 1.210729434597752, + "grad_norm": 1.1917721090286695, "learning_rate": 5.397115262598979e-06, - "loss": 0.1792, + "loss": 0.1787, "step": 3470 }, { "epoch": 0.47482900136798906, - "grad_norm": 1.103972395097023, + "grad_norm": 1.0824860064310713, "learning_rate": 5.394973181968553e-06, - "loss": 0.1776, + "loss": 0.1758, "step": 3471 }, { "epoch": 0.4749658002735978, - "grad_norm": 1.4309631121429058, + "grad_norm": 1.5181991016104066, "learning_rate": 5.392831028386902e-06, - "loss": 0.2576, + "loss": 0.2587, "step": 3472 }, { "epoch": 0.47510259917920655, - "grad_norm": 1.1110562156377812, + "grad_norm": 1.106638965380517, "learning_rate": 5.39068880224968e-06, - "loss": 0.1675, + "loss": 0.1678, "step": 3473 }, { "epoch": 0.4752393980848153, - "grad_norm": 1.1856216132355417, + "grad_norm": 1.173891006618933, "learning_rate": 5.388546503952551e-06, - "loss": 0.2006, + "loss": 0.2007, "step": 3474 }, { "epoch": 0.47537619699042405, - "grad_norm": 1.2186974274919142, + "grad_norm": 1.2074511551611438, "learning_rate": 5.386404133891198e-06, - "loss": 0.1889, + "loss": 0.1908, "step": 3475 }, { "epoch": 0.47551299589603285, - "grad_norm": 1.2290844003507564, + "grad_norm": 1.2032110586761642, "learning_rate": 5.384261692461318e-06, - "loss": 0.189, + "loss": 0.1877, "step": 3476 }, { "epoch": 0.4756497948016416, - "grad_norm": 1.5476529035175568, + "grad_norm": 1.5245564039153838, "learning_rate": 5.382119180058615e-06, - "loss": 0.2649, + "loss": 0.2637, "step": 3477 }, { "epoch": 0.47578659370725035, - "grad_norm": 1.2185799000961708, + "grad_norm": 1.187303389493905, "learning_rate": 5.379976597078809e-06, - "loss": 0.1684, + "loss": 0.17, "step": 3478 }, { "epoch": 0.4759233926128591, - "grad_norm": 1.072770805202979, + "grad_norm": 1.0744265079690587, "learning_rate": 5.377833943917635e-06, - "loss": 0.1716, + "loss": 0.1688, "step": 3479 }, { "epoch": 0.47606019151846785, - "grad_norm": 1.3341142948927294, + "grad_norm": 1.3191138703026817, "learning_rate": 5.375691220970839e-06, - "loss": 0.1694, + "loss": 0.1684, "step": 3480 }, { "epoch": 0.4761969904240766, - "grad_norm": 1.0815718828901844, + "grad_norm": 1.077336064445407, "learning_rate": 5.37354842863418e-06, - "loss": 0.1477, + "loss": 0.1486, "step": 3481 }, { "epoch": 0.47633378932968534, - "grad_norm": 1.342416202888006, + "grad_norm": 1.333407306553623, "learning_rate": 5.371405567303428e-06, - "loss": 0.1892, + "loss": 0.1893, "step": 3482 }, { "epoch": 0.4764705882352941, - "grad_norm": 1.0507281677278857, + "grad_norm": 1.0389720706003425, "learning_rate": 5.36926263737437e-06, - "loss": 0.1647, + "loss": 0.1643, "step": 3483 }, { "epoch": 0.4766073871409029, - "grad_norm": 1.2477513793821733, + "grad_norm": 1.2229380824066873, "learning_rate": 5.367119639242804e-06, - "loss": 0.1861, + "loss": 0.1837, "step": 3484 }, { "epoch": 0.47674418604651164, - "grad_norm": 1.0091419616403372, + "grad_norm": 0.997834204902163, "learning_rate": 5.364976573304538e-06, - "loss": 0.1653, + "loss": 0.1659, "step": 3485 }, { "epoch": 0.4768809849521204, - "grad_norm": 1.4336614972169903, + "grad_norm": 1.4009285759737973, "learning_rate": 5.362833439955396e-06, - "loss": 0.2281, + "loss": 0.2238, "step": 3486 }, { "epoch": 0.47701778385772914, - "grad_norm": 1.3377670918561595, + "grad_norm": 1.3403486555062127, "learning_rate": 5.360690239591213e-06, - "loss": 0.2003, + "loss": 0.201, "step": 3487 }, { "epoch": 0.4771545827633379, - "grad_norm": 1.1671169168480007, + "grad_norm": 1.1409115450962233, "learning_rate": 5.358546972607835e-06, - "loss": 0.1621, + "loss": 0.1586, "step": 3488 }, { "epoch": 0.47729138166894663, - "grad_norm": 1.2361066716730866, + "grad_norm": 1.217638013283649, "learning_rate": 5.356403639401123e-06, - "loss": 0.2093, + "loss": 0.2081, "step": 3489 }, { "epoch": 0.4774281805745554, - "grad_norm": 1.3811825199200045, + "grad_norm": 1.371265296819214, "learning_rate": 5.354260240366948e-06, - "loss": 0.2017, + "loss": 0.2016, "step": 3490 }, { "epoch": 0.47756497948016413, - "grad_norm": 1.0962963871873361, + "grad_norm": 1.0706385194071208, "learning_rate": 5.3521167759011934e-06, - "loss": 0.1589, + "loss": 0.1591, "step": 3491 }, { "epoch": 0.47770177838577293, - "grad_norm": 1.3212283757221677, + "grad_norm": 1.3340747363209704, "learning_rate": 5.349973246399757e-06, - "loss": 0.2085, + "loss": 0.2129, "step": 3492 }, { "epoch": 0.4778385772913817, - "grad_norm": 1.0772310887562437, + "grad_norm": 1.0715311536645633, "learning_rate": 5.347829652258549e-06, - "loss": 0.1699, + "loss": 0.1686, "step": 3493 }, { "epoch": 0.47797537619699043, - "grad_norm": 1.482076030340433, + "grad_norm": 1.4665813179891196, "learning_rate": 5.345685993873484e-06, - "loss": 0.2399, + "loss": 0.2377, "step": 3494 }, { "epoch": 0.4781121751025992, - "grad_norm": 1.1249907454055037, + "grad_norm": 1.1071449614902387, "learning_rate": 5.343542271640499e-06, - "loss": 0.1868, + "loss": 0.1862, "step": 3495 }, { "epoch": 0.4782489740082079, - "grad_norm": 1.3727615838416813, + "grad_norm": 1.3565370849893035, "learning_rate": 5.341398485955533e-06, - "loss": 0.1869, + "loss": 0.1851, "step": 3496 }, { "epoch": 0.4783857729138167, - "grad_norm": 1.4857698492261595, + "grad_norm": 1.4832448714896551, "learning_rate": 5.3392546372145445e-06, - "loss": 0.2141, + "loss": 0.2154, "step": 3497 }, { "epoch": 0.4785225718194254, - "grad_norm": 1.3946521950881599, + "grad_norm": 1.3531249792101305, "learning_rate": 5.337110725813501e-06, - "loss": 0.1838, + "loss": 0.1781, "step": 3498 }, { "epoch": 0.4786593707250342, - "grad_norm": 1.5851188589847538, + "grad_norm": 1.5175229543145257, "learning_rate": 5.334966752148381e-06, - "loss": 0.2493, + "loss": 0.2458, "step": 3499 }, { "epoch": 0.478796169630643, - "grad_norm": 1.6095450625445138, + "grad_norm": 1.6028669611099389, "learning_rate": 5.332822716615173e-06, - "loss": 0.2321, + "loss": 0.2328, "step": 3500 }, { "epoch": 0.478796169630643, - "eval_loss": 0.19048380851745605, - "eval_runtime": 5.9173, + "eval_loss": 0.18975399434566498, + "eval_runtime": 5.9169, "eval_samples_per_second": 5.07, "eval_steps_per_second": 1.352, "step": 3500 }, { "epoch": 0.4789329685362517, - "grad_norm": 1.5237831709431637, + "grad_norm": 1.5137866211088564, "learning_rate": 5.33067861960988e-06, - "loss": 0.2243, + "loss": 0.2249, "step": 3501 }, { "epoch": 0.4790697674418605, - "grad_norm": 1.1401808275329217, + "grad_norm": 1.1199593682698392, "learning_rate": 5.328534461528515e-06, - "loss": 0.1586, + "loss": 0.1578, "step": 3502 }, { "epoch": 0.4792065663474692, - "grad_norm": 1.2659026290102378, + "grad_norm": 1.2366502389344771, "learning_rate": 5.326390242767102e-06, - "loss": 0.1782, + "loss": 0.1769, "step": 3503 }, { "epoch": 0.47934336525307797, - "grad_norm": 1.140996272488185, + "grad_norm": 1.104329970518257, "learning_rate": 5.324245963721676e-06, - "loss": 0.1679, + "loss": 0.1666, "step": 3504 }, { "epoch": 0.4794801641586867, - "grad_norm": 1.0678235358633306, + "grad_norm": 1.020819666822043, "learning_rate": 5.322101624788283e-06, - "loss": 0.1626, + "loss": 0.1623, "step": 3505 }, { "epoch": 0.47961696306429547, - "grad_norm": 1.3684610272665414, + "grad_norm": 1.3489516142951372, "learning_rate": 5.319957226362983e-06, - "loss": 0.2141, + "loss": 0.2106, "step": 3506 }, { "epoch": 0.4797537619699042, - "grad_norm": 1.0436713643290059, + "grad_norm": 1.0134886745562783, "learning_rate": 5.317812768841844e-06, - "loss": 0.1447, + "loss": 0.1441, "step": 3507 }, { "epoch": 0.479890560875513, - "grad_norm": 1.087165690109747, + "grad_norm": 1.1083411215811123, "learning_rate": 5.315668252620942e-06, - "loss": 0.1919, + "loss": 0.1923, "step": 3508 }, { "epoch": 0.48002735978112177, - "grad_norm": 1.5174596302362622, + "grad_norm": 1.4886995599009458, "learning_rate": 5.313523678096373e-06, - "loss": 0.2489, + "loss": 0.2538, "step": 3509 }, { "epoch": 0.4801641586867305, - "grad_norm": 1.457739164036942, + "grad_norm": 1.440160440814823, "learning_rate": 5.3113790456642345e-06, - "loss": 0.2298, + "loss": 0.2282, "step": 3510 }, { "epoch": 0.48030095759233926, - "grad_norm": 1.4474417929089496, + "grad_norm": 1.4257204851902812, "learning_rate": 5.30923435572064e-06, - "loss": 0.2234, + "loss": 0.2244, "step": 3511 }, { "epoch": 0.480437756497948, - "grad_norm": 1.3708768277311114, + "grad_norm": 1.3547343804678045, "learning_rate": 5.307089608661709e-06, - "loss": 0.1927, + "loss": 0.1923, "step": 3512 }, { "epoch": 0.48057455540355676, - "grad_norm": 1.3087668755248516, + "grad_norm": 1.284338372716432, "learning_rate": 5.304944804883578e-06, "loss": 0.1977, "step": 3513 }, { "epoch": 0.4807113543091655, - "grad_norm": 1.3052931889034818, + "grad_norm": 1.2951392803132333, "learning_rate": 5.302799944782391e-06, - "loss": 0.2163, + "loss": 0.219, "step": 3514 }, { "epoch": 0.48084815321477425, - "grad_norm": 1.2979434270201762, + "grad_norm": 1.2896115503232242, "learning_rate": 5.300655028754298e-06, - "loss": 0.185, + "loss": 0.1845, "step": 3515 }, { "epoch": 0.48098495212038306, - "grad_norm": 1.1939155154122523, + "grad_norm": 1.1762084961571195, "learning_rate": 5.298510057195466e-06, - "loss": 0.1893, + "loss": 0.1882, "step": 3516 }, { "epoch": 0.4811217510259918, - "grad_norm": 1.3797102911781174, + "grad_norm": 1.3620788070605343, "learning_rate": 5.29636503050207e-06, - "loss": 0.2026, + "loss": 0.2011, "step": 3517 }, { "epoch": 0.48125854993160055, - "grad_norm": 1.621114197630376, + "grad_norm": 1.6706231803895712, "learning_rate": 5.294219949070293e-06, - "loss": 0.2343, + "loss": 0.2442, "step": 3518 }, { "epoch": 0.4813953488372093, - "grad_norm": 1.0197824460992093, + "grad_norm": 1.0186516556605962, "learning_rate": 5.292074813296331e-06, - "loss": 0.1765, + "loss": 0.1773, "step": 3519 }, { "epoch": 0.48153214774281805, - "grad_norm": 1.1198442989662358, + "grad_norm": 1.100287477236626, "learning_rate": 5.289929623576387e-06, - "loss": 0.1814, + "loss": 0.1801, "step": 3520 }, { "epoch": 0.4816689466484268, - "grad_norm": 1.1966359335308419, + "grad_norm": 1.1892652417496823, "learning_rate": 5.287784380306678e-06, - "loss": 0.2171, + "loss": 0.2175, "step": 3521 }, { "epoch": 0.48180574555403555, - "grad_norm": 1.277917414547145, + "grad_norm": 1.257401538763741, "learning_rate": 5.2856390838834285e-06, - "loss": 0.1899, + "loss": 0.1897, "step": 3522 }, { "epoch": 0.4819425444596443, - "grad_norm": 0.9529567039991939, + "grad_norm": 0.9395102919361187, "learning_rate": 5.283493734702871e-06, - "loss": 0.1874, + "loss": 0.1859, "step": 3523 }, { "epoch": 0.4820793433652531, - "grad_norm": 1.3187307895682912, + "grad_norm": 1.3014006106941607, "learning_rate": 5.281348333161251e-06, - "loss": 0.1786, + "loss": 0.1794, "step": 3524 }, { "epoch": 0.48221614227086185, - "grad_norm": 1.3096817923639872, + "grad_norm": 1.302647934558121, "learning_rate": 5.2792028796548236e-06, - "loss": 0.1964, + "loss": 0.1978, "step": 3525 }, { "epoch": 0.4823529411764706, - "grad_norm": 1.4846407881645902, + "grad_norm": 1.4831686368805033, "learning_rate": 5.27705737457985e-06, - "loss": 0.2088, + "loss": 0.2093, "step": 3526 }, { "epoch": 0.48248974008207934, - "grad_norm": 1.2941137246630303, + "grad_norm": 1.2803380334370706, "learning_rate": 5.274911818332606e-06, - "loss": 0.1985, + "loss": 0.1955, "step": 3527 }, { "epoch": 0.4826265389876881, - "grad_norm": 1.0013897527368993, + "grad_norm": 1.0101011070679653, "learning_rate": 5.272766211309371e-06, - "loss": 0.1764, + "loss": 0.1752, "step": 3528 }, { "epoch": 0.48276333789329684, - "grad_norm": 1.319308259598112, + "grad_norm": 1.2934533749209514, "learning_rate": 5.270620553906439e-06, - "loss": 0.2002, + "loss": 0.1979, "step": 3529 }, { "epoch": 0.4829001367989056, - "grad_norm": 1.209007417617153, + "grad_norm": 1.227487200949749, "learning_rate": 5.268474846520112e-06, "loss": 0.1663, "step": 3530 }, { "epoch": 0.48303693570451434, - "grad_norm": 1.5026124025168144, + "grad_norm": 1.5043163974786355, "learning_rate": 5.266329089546698e-06, "loss": 0.2209, "step": 3531 }, { "epoch": 0.48317373461012314, - "grad_norm": 1.2358037323940114, + "grad_norm": 1.2136940972676247, "learning_rate": 5.264183283382516e-06, - "loss": 0.1634, + "loss": 0.164, "step": 3532 }, { "epoch": 0.4833105335157319, - "grad_norm": 1.1917835070448524, + "grad_norm": 1.1829760997473246, "learning_rate": 5.262037428423897e-06, - "loss": 0.194, + "loss": 0.1967, "step": 3533 }, { "epoch": 0.48344733242134064, - "grad_norm": 1.3842476830776593, + "grad_norm": 1.3720117279748587, "learning_rate": 5.259891525067179e-06, - "loss": 0.2194, + "loss": 0.218, "step": 3534 }, { "epoch": 0.4835841313269494, - "grad_norm": 1.1841181095590914, + "grad_norm": 1.1683249578086998, "learning_rate": 5.257745573708707e-06, - "loss": 0.1786, + "loss": 0.1779, "step": 3535 }, { "epoch": 0.48372093023255813, - "grad_norm": 1.5179392175397246, + "grad_norm": 1.485987721499471, "learning_rate": 5.255599574744836e-06, - "loss": 0.2489, + "loss": 0.2473, "step": 3536 }, { "epoch": 0.4838577291381669, - "grad_norm": 1.1532302754736559, + "grad_norm": 1.1312770685398335, "learning_rate": 5.2534535285719325e-06, - "loss": 0.1598, + "loss": 0.1582, "step": 3537 }, { "epoch": 0.48399452804377563, - "grad_norm": 1.3371510568376332, + "grad_norm": 1.282395283897915, "learning_rate": 5.251307435586368e-06, - "loss": 0.2076, + "loss": 0.205, "step": 3538 }, { "epoch": 0.4841313269493844, - "grad_norm": 1.4077007316793126, + "grad_norm": 1.3958293216757516, "learning_rate": 5.249161296184525e-06, - "loss": 0.2408, + "loss": 0.2433, "step": 3539 }, { "epoch": 0.4842681258549932, - "grad_norm": 1.1618435504152325, + "grad_norm": 1.1490993350074938, "learning_rate": 5.247015110762791e-06, - "loss": 0.1692, + "loss": 0.1685, "step": 3540 }, { "epoch": 0.48440492476060193, - "grad_norm": 1.4405639307379003, + "grad_norm": 1.4719377583023348, "learning_rate": 5.244868879717568e-06, - "loss": 0.2194, + "loss": 0.2233, "step": 3541 }, { "epoch": 0.4845417236662107, - "grad_norm": 1.4485807343258825, + "grad_norm": 1.433156997318999, "learning_rate": 5.242722603445261e-06, - "loss": 0.2298, + "loss": 0.2304, "step": 3542 }, { "epoch": 0.4846785225718194, - "grad_norm": 1.3602059250317926, + "grad_norm": 1.3606079765092474, "learning_rate": 5.240576282342288e-06, - "loss": 0.1881, + "loss": 0.1896, "step": 3543 }, { "epoch": 0.4848153214774282, - "grad_norm": 1.5264031718445332, + "grad_norm": 1.5210571393515546, "learning_rate": 5.238429916805071e-06, - "loss": 0.2574, + "loss": 0.2552, "step": 3544 }, { "epoch": 0.4849521203830369, - "grad_norm": 1.3387044475862844, + "grad_norm": 1.31627653868968, "learning_rate": 5.236283507230042e-06, - "loss": 0.1872, + "loss": 0.1871, "step": 3545 }, { "epoch": 0.48508891928864567, - "grad_norm": 1.5172686724552558, + "grad_norm": 1.4648948067339116, "learning_rate": 5.23413705401364e-06, - "loss": 0.2336, + "loss": 0.2313, "step": 3546 }, { "epoch": 0.4852257181942544, - "grad_norm": 1.1250057766286672, + "grad_norm": 1.142766823803481, "learning_rate": 5.231990557552317e-06, - "loss": 0.1954, + "loss": 0.1967, "step": 3547 }, { "epoch": 0.4853625170998632, - "grad_norm": 1.2060296277620808, + "grad_norm": 1.1773630210181798, "learning_rate": 5.229844018242522e-06, - "loss": 0.1934, + "loss": 0.1928, "step": 3548 }, { "epoch": 0.48549931600547197, - "grad_norm": 1.0880043182533974, + "grad_norm": 1.0870012570390806, "learning_rate": 5.227697436480726e-06, - "loss": 0.1806, + "loss": 0.1822, "step": 3549 }, { "epoch": 0.4856361149110807, - "grad_norm": 1.4777996294421991, + "grad_norm": 1.4771484453853378, "learning_rate": 5.225550812663399e-06, - "loss": 0.2407, + "loss": 0.2416, "step": 3550 }, { "epoch": 0.48577291381668947, - "grad_norm": 1.5287032139407313, + "grad_norm": 1.4607120590891474, "learning_rate": 5.22340414718702e-06, - "loss": 0.2285, + "loss": 0.225, "step": 3551 }, { "epoch": 0.4859097127222982, - "grad_norm": 1.0468087830219597, + "grad_norm": 1.0417687273433607, "learning_rate": 5.221257440448078e-06, - "loss": 0.1725, + "loss": 0.1721, "step": 3552 }, { "epoch": 0.48604651162790696, - "grad_norm": 1.538118464021185, + "grad_norm": 1.5241836285000654, "learning_rate": 5.2191106928430646e-06, - "loss": 0.2483, + "loss": 0.2472, "step": 3553 }, { "epoch": 0.4861833105335157, - "grad_norm": 1.4783326200907212, + "grad_norm": 1.4622607935330754, "learning_rate": 5.216963904768485e-06, - "loss": 0.2256, + "loss": 0.2268, "step": 3554 }, { "epoch": 0.48632010943912446, - "grad_norm": 1.008266315084772, + "grad_norm": 0.9906870314878014, "learning_rate": 5.214817076620848e-06, - "loss": 0.1798, + "loss": 0.1763, "step": 3555 }, { "epoch": 0.48645690834473326, - "grad_norm": 0.9248446914665611, + "grad_norm": 0.913789584445552, "learning_rate": 5.2126702087966705e-06, "loss": 0.1268, "step": 3556 }, { "epoch": 0.486593707250342, - "grad_norm": 1.11598380657129, + "grad_norm": 1.0974602310644483, "learning_rate": 5.210523301692479e-06, - "loss": 0.1657, + "loss": 0.1638, "step": 3557 }, { "epoch": 0.48673050615595076, - "grad_norm": 1.1773412501404954, + "grad_norm": 1.211842717491577, "learning_rate": 5.208376355704806e-06, - "loss": 0.1655, + "loss": 0.17, "step": 3558 }, { "epoch": 0.4868673050615595, - "grad_norm": 1.235414337703974, + "grad_norm": 1.2261030607420735, "learning_rate": 5.206229371230188e-06, - "loss": 0.2008, + "loss": 0.2025, "step": 3559 }, { "epoch": 0.48700410396716826, - "grad_norm": 1.2647238664723275, + "grad_norm": 1.2486203837254441, "learning_rate": 5.204082348665174e-06, - "loss": 0.1624, + "loss": 0.1628, "step": 3560 }, { "epoch": 0.487140902872777, - "grad_norm": 1.1311636140990065, + "grad_norm": 1.1255788001363387, "learning_rate": 5.201935288406316e-06, - "loss": 0.2012, + "loss": 0.2013, "step": 3561 }, { "epoch": 0.48727770177838575, - "grad_norm": 1.2988859027872885, + "grad_norm": 1.2696443427462187, "learning_rate": 5.199788190850173e-06, - "loss": 0.1829, + "loss": 0.1843, "step": 3562 }, { "epoch": 0.4874145006839945, - "grad_norm": 1.1407173222286056, + "grad_norm": 1.1235710985731497, "learning_rate": 5.1976410563933145e-06, - "loss": 0.1843, + "loss": 0.1857, "step": 3563 }, { "epoch": 0.4875512995896033, - "grad_norm": 1.2505793848990945, + "grad_norm": 1.2419319146773555, "learning_rate": 5.1954938854323114e-06, - "loss": 0.2211, + "loss": 0.2203, "step": 3564 }, { "epoch": 0.48768809849521205, - "grad_norm": 1.2081821123474685, + "grad_norm": 1.1958358818590469, "learning_rate": 5.193346678363749e-06, - "loss": 0.1962, + "loss": 0.1954, "step": 3565 }, { "epoch": 0.4878248974008208, - "grad_norm": 1.177523055288356, + "grad_norm": 1.157473641636697, "learning_rate": 5.191199435584211e-06, - "loss": 0.1707, + "loss": 0.1701, "step": 3566 }, { "epoch": 0.48796169630642955, - "grad_norm": 1.3533089890451309, + "grad_norm": 1.3255540486657604, "learning_rate": 5.189052157490294e-06, - "loss": 0.2433, + "loss": 0.2418, "step": 3567 }, { "epoch": 0.4880984952120383, - "grad_norm": 1.2610158231382398, + "grad_norm": 1.2496112535837738, "learning_rate": 5.186904844478596e-06, - "loss": 0.1763, + "loss": 0.1757, "step": 3568 }, { "epoch": 0.48823529411764705, - "grad_norm": 1.263508809349497, + "grad_norm": 1.232110407670747, "learning_rate": 5.184757496945726e-06, - "loss": 0.1621, + "loss": 0.1613, "step": 3569 }, { "epoch": 0.4883720930232558, - "grad_norm": 1.2923855467210499, + "grad_norm": 1.2417958113279008, "learning_rate": 5.182610115288296e-06, - "loss": 0.2078, + "loss": 0.2035, "step": 3570 }, { "epoch": 0.48850889192886454, - "grad_norm": 1.4545850347703466, + "grad_norm": 1.4165692587849998, "learning_rate": 5.180462699902925e-06, - "loss": 0.1937, + "loss": 0.1916, "step": 3571 }, { "epoch": 0.48864569083447335, - "grad_norm": 1.3716282466557679, + "grad_norm": 1.3326226288036767, "learning_rate": 5.17831525118624e-06, - "loss": 0.2282, + "loss": 0.2243, "step": 3572 }, { "epoch": 0.4887824897400821, - "grad_norm": 1.2430882376728274, + "grad_norm": 1.2259051917092452, "learning_rate": 5.176167769534873e-06, - "loss": 0.1882, + "loss": 0.1897, "step": 3573 }, { "epoch": 0.48891928864569084, - "grad_norm": 1.3304901488774963, + "grad_norm": 1.3340641518095813, "learning_rate": 5.174020255345464e-06, - "loss": 0.1882, + "loss": 0.1899, "step": 3574 }, { "epoch": 0.4890560875512996, - "grad_norm": 1.0364727753677454, + "grad_norm": 1.0151813319181455, "learning_rate": 5.171872709014655e-06, - "loss": 0.1737, + "loss": 0.1726, "step": 3575 }, { "epoch": 0.48919288645690834, - "grad_norm": 1.3321713254542715, + "grad_norm": 1.3031927760664237, "learning_rate": 5.169725130939095e-06, - "loss": 0.155, + "loss": 0.1557, "step": 3576 }, { "epoch": 0.4893296853625171, - "grad_norm": 1.4637961907078645, + "grad_norm": 1.4448284313648794, "learning_rate": 5.167577521515442e-06, - "loss": 0.2091, + "loss": 0.2131, "step": 3577 }, { "epoch": 0.48946648426812583, - "grad_norm": 1.0468939490871325, + "grad_norm": 1.0390189864306345, "learning_rate": 5.165429881140356e-06, - "loss": 0.1753, + "loss": 0.1728, "step": 3578 }, { "epoch": 0.4896032831737346, - "grad_norm": 1.4359273173868672, + "grad_norm": 1.438207049745794, "learning_rate": 5.163282210210506e-06, - "loss": 0.2392, + "loss": 0.244, "step": 3579 }, { "epoch": 0.4897400820793434, - "grad_norm": 1.1723361301923902, + "grad_norm": 1.1558937331808175, "learning_rate": 5.161134509122564e-06, "loss": 0.1798, "step": 3580 }, { "epoch": 0.48987688098495213, - "grad_norm": 1.2871572074517776, + "grad_norm": 1.2751072815672468, "learning_rate": 5.15898677827321e-06, - "loss": 0.1951, + "loss": 0.1944, "step": 3581 }, { "epoch": 0.4900136798905609, - "grad_norm": 1.245472732747631, + "grad_norm": 1.2411073945070679, "learning_rate": 5.1568390180591265e-06, - "loss": 0.1882, + "loss": 0.1851, "step": 3582 }, { "epoch": 0.49015047879616963, - "grad_norm": 1.263781613081436, + "grad_norm": 1.2429080634924874, "learning_rate": 5.154691228877004e-06, - "loss": 0.1954, + "loss": 0.195, "step": 3583 }, { "epoch": 0.4902872777017784, - "grad_norm": 1.0899606370251131, + "grad_norm": 1.063160175181755, "learning_rate": 5.152543411123538e-06, - "loss": 0.1725, + "loss": 0.1715, "step": 3584 }, { "epoch": 0.4904240766073871, - "grad_norm": 1.3205957237021753, + "grad_norm": 1.3011758417366028, "learning_rate": 5.150395565195426e-06, - "loss": 0.182, + "loss": 0.181, "step": 3585 }, { "epoch": 0.4905608755129959, - "grad_norm": 1.2015084344075246, + "grad_norm": 1.1908619370344125, "learning_rate": 5.148247691489378e-06, - "loss": 0.2061, + "loss": 0.2024, "step": 3586 }, { "epoch": 0.4906976744186046, - "grad_norm": 1.3267883413530839, + "grad_norm": 1.3164405948409093, "learning_rate": 5.1460997904021e-06, - "loss": 0.1758, + "loss": 0.1751, "step": 3587 }, { "epoch": 0.4908344733242134, - "grad_norm": 1.389664373817528, + "grad_norm": 1.3708323319387472, "learning_rate": 5.143951862330308e-06, - "loss": 0.2186, + "loss": 0.2194, "step": 3588 }, { "epoch": 0.4909712722298222, - "grad_norm": 1.3308234052182522, + "grad_norm": 1.3159642212503115, "learning_rate": 5.141803907670727e-06, - "loss": 0.1861, + "loss": 0.188, "step": 3589 }, { "epoch": 0.4911080711354309, - "grad_norm": 1.4778510892655208, + "grad_norm": 1.4747089935246218, "learning_rate": 5.139655926820078e-06, - "loss": 0.2089, + "loss": 0.2099, "step": 3590 }, { "epoch": 0.49124487004103967, - "grad_norm": 1.4991059104037256, + "grad_norm": 1.4777543899899288, "learning_rate": 5.137507920175095e-06, - "loss": 0.232, + "loss": 0.2328, "step": 3591 }, { "epoch": 0.4913816689466484, - "grad_norm": 1.55078527121467, + "grad_norm": 1.5406044442218894, "learning_rate": 5.135359888132508e-06, - "loss": 0.2438, + "loss": 0.2437, "step": 3592 }, { "epoch": 0.49151846785225717, - "grad_norm": 1.1842425444907723, + "grad_norm": 1.1831917236420284, "learning_rate": 5.13321183108906e-06, - "loss": 0.1821, + "loss": 0.1831, "step": 3593 }, { "epoch": 0.4916552667578659, - "grad_norm": 1.4788747653540617, + "grad_norm": 1.47714186577284, "learning_rate": 5.131063749441496e-06, - "loss": 0.2292, + "loss": 0.2291, "step": 3594 }, { "epoch": 0.49179206566347466, - "grad_norm": 1.2130440974830068, + "grad_norm": 1.195184551501135, "learning_rate": 5.128915643586564e-06, - "loss": 0.181, + "loss": 0.1804, "step": 3595 }, { "epoch": 0.49192886456908347, - "grad_norm": 1.4995156776921645, + "grad_norm": 1.453029369993586, "learning_rate": 5.126767513921016e-06, - "loss": 0.1778, + "loss": 0.1739, "step": 3596 }, { "epoch": 0.4920656634746922, - "grad_norm": 1.5349237271952079, + "grad_norm": 1.511643989984655, "learning_rate": 5.124619360841613e-06, - "loss": 0.2035, + "loss": 0.2048, "step": 3597 }, { "epoch": 0.49220246238030096, - "grad_norm": 1.103886160627787, + "grad_norm": 1.1056049020420817, "learning_rate": 5.122471184745115e-06, - "loss": 0.1746, + "loss": 0.1783, "step": 3598 }, { "epoch": 0.4923392612859097, - "grad_norm": 1.132604879278037, + "grad_norm": 1.1361345936339335, "learning_rate": 5.120322986028288e-06, - "loss": 0.1517, + "loss": 0.1531, "step": 3599 }, { "epoch": 0.49247606019151846, - "grad_norm": 1.330814420317926, + "grad_norm": 1.3001360454396236, "learning_rate": 5.118174765087904e-06, - "loss": 0.1843, + "loss": 0.1812, "step": 3600 }, { "epoch": 0.49247606019151846, - "eval_loss": 0.18695440888404846, - "eval_runtime": 5.9288, - "eval_samples_per_second": 5.06, - "eval_steps_per_second": 1.349, + "eval_loss": 0.18666821718215942, + "eval_runtime": 5.9242, + "eval_samples_per_second": 5.064, + "eval_steps_per_second": 1.35, "step": 3600 }, { "epoch": 0.4926128590971272, - "grad_norm": 1.421851104633736, + "grad_norm": 1.3952178179542187, "learning_rate": 5.116026522320735e-06, - "loss": 0.2365, + "loss": 0.2347, "step": 3601 }, { "epoch": 0.49274965800273596, - "grad_norm": 1.17946552968077, + "grad_norm": 1.162175790868679, "learning_rate": 5.1138782581235634e-06, - "loss": 0.2241, + "loss": 0.2252, "step": 3602 }, { "epoch": 0.4928864569083447, - "grad_norm": 1.2667947917879625, + "grad_norm": 1.2485555727944195, "learning_rate": 5.11172997289317e-06, - "loss": 0.1691, + "loss": 0.1681, "step": 3603 }, { "epoch": 0.4930232558139535, - "grad_norm": 1.1835722039239158, + "grad_norm": 1.1535935948494522, "learning_rate": 5.109581667026341e-06, - "loss": 0.1854, + "loss": 0.1828, "step": 3604 }, { "epoch": 0.49316005471956226, - "grad_norm": 1.2696401899676812, + "grad_norm": 1.2486385376781208, "learning_rate": 5.1074333409198674e-06, - "loss": 0.2022, + "loss": 0.202, "step": 3605 }, { "epoch": 0.493296853625171, - "grad_norm": 1.282664168856483, + "grad_norm": 1.2708785895498427, "learning_rate": 5.105284994970543e-06, - "loss": 0.2061, + "loss": 0.2066, "step": 3606 }, { "epoch": 0.49343365253077975, - "grad_norm": 1.4720161197155242, + "grad_norm": 1.4271340910035382, "learning_rate": 5.103136629575165e-06, - "loss": 0.2084, + "loss": 0.2055, "step": 3607 }, { "epoch": 0.4935704514363885, - "grad_norm": 1.0844015480085893, + "grad_norm": 1.0737832693227616, "learning_rate": 5.100988245130535e-06, "loss": 0.1415, "step": 3608 }, { "epoch": 0.49370725034199725, - "grad_norm": 1.3186660261377687, + "grad_norm": 1.257666973784243, "learning_rate": 5.098839842033459e-06, - "loss": 0.1831, + "loss": 0.1756, "step": 3609 }, { "epoch": 0.493844049247606, - "grad_norm": 1.2045857322532874, + "grad_norm": 1.1960915550341467, "learning_rate": 5.096691420680745e-06, - "loss": 0.2265, + "loss": 0.226, "step": 3610 }, { "epoch": 0.49398084815321475, - "grad_norm": 1.2830191594398614, + "grad_norm": 1.2518877050676336, "learning_rate": 5.094542981469204e-06, - "loss": 0.191, + "loss": 0.1917, "step": 3611 }, { "epoch": 0.49411764705882355, - "grad_norm": 1.3077289490153048, + "grad_norm": 1.3018695290251197, "learning_rate": 5.09239452479565e-06, - "loss": 0.2028, + "loss": 0.2045, "step": 3612 }, { "epoch": 0.4942544459644323, - "grad_norm": 1.350264427856219, + "grad_norm": 1.3349430516985696, "learning_rate": 5.090246051056903e-06, - "loss": 0.2174, + "loss": 0.2148, "step": 3613 }, { "epoch": 0.49439124487004105, - "grad_norm": 1.049877379925961, + "grad_norm": 1.035781888366622, "learning_rate": 5.088097560649784e-06, - "loss": 0.1815, + "loss": 0.183, "step": 3614 }, { "epoch": 0.4945280437756498, - "grad_norm": 1.280071009382722, + "grad_norm": 1.2498114213448195, "learning_rate": 5.085949053971116e-06, - "loss": 0.177, + "loss": 0.1758, "step": 3615 }, { "epoch": 0.49466484268125854, - "grad_norm": 1.3792978453238844, + "grad_norm": 1.35654899881889, "learning_rate": 5.083800531417729e-06, - "loss": 0.2169, + "loss": 0.2153, "step": 3616 }, { "epoch": 0.4948016415868673, - "grad_norm": 1.340301277562125, + "grad_norm": 1.3276806442216817, "learning_rate": 5.081651993386452e-06, - "loss": 0.1894, + "loss": 0.1897, "step": 3617 }, { "epoch": 0.49493844049247604, - "grad_norm": 1.4404091587082999, + "grad_norm": 1.4174262877462398, "learning_rate": 5.0795034402741185e-06, - "loss": 0.2252, + "loss": 0.2228, "step": 3618 }, { "epoch": 0.4950752393980848, - "grad_norm": 1.2134194601501769, + "grad_norm": 1.2146041484362016, "learning_rate": 5.077354872477564e-06, - "loss": 0.1915, + "loss": 0.192, "step": 3619 }, { "epoch": 0.4952120383036936, - "grad_norm": 1.0621791052872671, + "grad_norm": 1.0557808586523416, "learning_rate": 5.075206290393625e-06, - "loss": 0.1357, + "loss": 0.136, "step": 3620 }, { "epoch": 0.49534883720930234, - "grad_norm": 1.1706073790893072, + "grad_norm": 1.1749606403372777, "learning_rate": 5.0730576944191465e-06, - "loss": 0.1711, + "loss": 0.1732, "step": 3621 }, { "epoch": 0.4954856361149111, - "grad_norm": 1.4412113925977348, + "grad_norm": 1.4276080812116132, "learning_rate": 5.070909084950973e-06, "loss": 0.2103, "step": 3622 }, { "epoch": 0.49562243502051984, - "grad_norm": 1.314646580345949, + "grad_norm": 1.2818268055219444, "learning_rate": 5.068760462385947e-06, - "loss": 0.1969, + "loss": 0.1952, "step": 3623 }, { "epoch": 0.4957592339261286, - "grad_norm": 1.1404686085132694, + "grad_norm": 1.1438078454669138, "learning_rate": 5.06661182712092e-06, - "loss": 0.1824, + "loss": 0.1828, "step": 3624 }, { "epoch": 0.49589603283173733, - "grad_norm": 1.3292767273078934, + "grad_norm": 1.3081417988315887, "learning_rate": 5.064463179552743e-06, - "loss": 0.2318, + "loss": 0.2284, "step": 3625 }, { "epoch": 0.4960328317373461, - "grad_norm": 1.3544864611911864, + "grad_norm": 1.3495486693923444, "learning_rate": 5.06231452007827e-06, - "loss": 0.2153, + "loss": 0.2123, "step": 3626 }, { "epoch": 0.49616963064295483, - "grad_norm": 1.2409934573148331, + "grad_norm": 1.176513847116654, "learning_rate": 5.060165849094355e-06, - "loss": 0.1882, + "loss": 0.1881, "step": 3627 }, { "epoch": 0.49630642954856363, - "grad_norm": 1.1399200657174609, + "grad_norm": 1.1333423597189967, "learning_rate": 5.058017166997855e-06, "loss": 0.1719, "step": 3628 }, { "epoch": 0.4964432284541724, - "grad_norm": 1.1272497682428089, + "grad_norm": 1.1111972694546457, "learning_rate": 5.055868474185633e-06, - "loss": 0.163, + "loss": 0.1611, "step": 3629 }, { "epoch": 0.49658002735978113, - "grad_norm": 1.1781272373498624, + "grad_norm": 1.1512102947596152, "learning_rate": 5.053719771054548e-06, - "loss": 0.173, + "loss": 0.1725, "step": 3630 }, { "epoch": 0.4967168262653899, - "grad_norm": 1.0499431231731944, + "grad_norm": 1.0345530382428711, "learning_rate": 5.051571058001466e-06, - "loss": 0.1607, + "loss": 0.1611, "step": 3631 }, { "epoch": 0.4968536251709986, - "grad_norm": 1.0832827770749553, + "grad_norm": 1.080791096647178, "learning_rate": 5.049422335423252e-06, - "loss": 0.1943, + "loss": 0.1938, "step": 3632 }, { "epoch": 0.4969904240766074, - "grad_norm": 1.169209105306984, + "grad_norm": 1.151475213088777, "learning_rate": 5.047273603716773e-06, - "loss": 0.1737, + "loss": 0.1736, "step": 3633 }, { "epoch": 0.4971272229822161, - "grad_norm": 1.5772383362180973, + "grad_norm": 1.547020256093984, "learning_rate": 5.045124863278898e-06, - "loss": 0.2395, + "loss": 0.2382, "step": 3634 }, { "epoch": 0.49726402188782487, - "grad_norm": 1.1960243101377768, + "grad_norm": 1.2802417438273095, "learning_rate": 5.042976114506496e-06, - "loss": 0.1724, + "loss": 0.1782, "step": 3635 }, { "epoch": 0.4974008207934337, - "grad_norm": 1.3886875528642846, + "grad_norm": 1.3581039923448233, "learning_rate": 5.040827357796441e-06, - "loss": 0.1966, + "loss": 0.1953, "step": 3636 }, { "epoch": 0.4975376196990424, - "grad_norm": 1.2982386263437498, + "grad_norm": 1.2868089418643232, "learning_rate": 5.038678593545606e-06, - "loss": 0.223, + "loss": 0.2232, "step": 3637 }, { "epoch": 0.49767441860465117, - "grad_norm": 1.5387916499507972, + "grad_norm": 1.488785194224549, "learning_rate": 5.036529822150865e-06, - "loss": 0.2091, + "loss": 0.2076, "step": 3638 }, { "epoch": 0.4978112175102599, - "grad_norm": 1.0823162307510288, + "grad_norm": 1.1074096125740638, "learning_rate": 5.034381044009098e-06, - "loss": 0.1844, + "loss": 0.1864, "step": 3639 }, { "epoch": 0.49794801641586867, - "grad_norm": 1.4633000402922276, + "grad_norm": 1.4375784011675319, "learning_rate": 5.032232259517179e-06, - "loss": 0.223, + "loss": 0.2215, "step": 3640 }, { "epoch": 0.4980848153214774, - "grad_norm": 1.147533469265644, + "grad_norm": 1.1516919606355054, "learning_rate": 5.030083469071988e-06, - "loss": 0.1846, + "loss": 0.1866, "step": 3641 }, { "epoch": 0.49822161422708616, - "grad_norm": 1.2911584474590077, + "grad_norm": 1.3094469410779388, "learning_rate": 5.027934673070405e-06, - "loss": 0.206, + "loss": 0.2077, "step": 3642 }, { "epoch": 0.4983584131326949, - "grad_norm": 0.986442475705516, + "grad_norm": 0.9799513280950158, "learning_rate": 5.02578587190931e-06, - "loss": 0.1627, + "loss": 0.1634, "step": 3643 }, { "epoch": 0.4984952120383037, - "grad_norm": 1.2339839500731875, + "grad_norm": 1.2151620415730495, "learning_rate": 5.023637065985585e-06, - "loss": 0.1928, + "loss": 0.1926, "step": 3644 }, { "epoch": 0.49863201094391246, - "grad_norm": 1.2831908125958773, + "grad_norm": 1.2894340885240254, "learning_rate": 5.021488255696113e-06, - "loss": 0.2156, + "loss": 0.217, "step": 3645 }, { "epoch": 0.4987688098495212, - "grad_norm": 1.2873615204656272, + "grad_norm": 1.2547102142876925, "learning_rate": 5.019339441437779e-06, - "loss": 0.1947, + "loss": 0.1945, "step": 3646 }, { "epoch": 0.49890560875512996, - "grad_norm": 1.2618481949276654, + "grad_norm": 1.2500145226454369, "learning_rate": 5.017190623607465e-06, - "loss": 0.1995, + "loss": 0.1999, "step": 3647 }, { "epoch": 0.4990424076607387, - "grad_norm": 1.2494667974890306, + "grad_norm": 1.2191695714965727, "learning_rate": 5.0150418026020575e-06, - "loss": 0.194, + "loss": 0.192, "step": 3648 }, { "epoch": 0.49917920656634746, - "grad_norm": 1.1414004468093217, + "grad_norm": 1.1267898576052913, "learning_rate": 5.012892978818441e-06, - "loss": 0.1639, + "loss": 0.1644, "step": 3649 }, { "epoch": 0.4993160054719562, - "grad_norm": 1.4168155362367785, + "grad_norm": 1.4108809965199536, "learning_rate": 5.010744152653501e-06, - "loss": 0.1983, + "loss": 0.1995, "step": 3650 }, { "epoch": 0.49945280437756495, - "grad_norm": 1.1947307045415467, + "grad_norm": 1.1898630026329626, "learning_rate": 5.008595324504125e-06, - "loss": 0.167, + "loss": 0.1672, "step": 3651 }, { "epoch": 0.49958960328317376, - "grad_norm": 1.4132868689071942, + "grad_norm": 1.4037164073775714, "learning_rate": 5.0064464947672e-06, - "loss": 0.2236, + "loss": 0.2221, "step": 3652 }, { "epoch": 0.4997264021887825, - "grad_norm": 1.193120943176155, + "grad_norm": 1.1761799600269736, "learning_rate": 5.004297663839612e-06, - "loss": 0.1615, + "loss": 0.1607, "step": 3653 }, { "epoch": 0.49986320109439125, - "grad_norm": 1.4193312126960018, + "grad_norm": 1.4048830951906421, "learning_rate": 5.002148832118251e-06, - "loss": 0.2186, + "loss": 0.2187, "step": 3654 }, { "epoch": 0.5, - "grad_norm": 1.3066938858390116, + "grad_norm": 1.28330491050822, "learning_rate": 5e-06, - "loss": 0.1834, + "loss": 0.1831, "step": 3655 }, { "epoch": 0.5001367989056088, - "grad_norm": 1.368068177996611, + "grad_norm": 1.3636168455459878, "learning_rate": 4.99785116788175e-06, - "loss": 0.212, + "loss": 0.2103, "step": 3656 }, { "epoch": 0.5002735978112175, - "grad_norm": 1.2489427471312156, + "grad_norm": 1.252310006093387, "learning_rate": 4.995702336160389e-06, - "loss": 0.2146, + "loss": 0.2152, "step": 3657 }, { "epoch": 0.5004103967168263, - "grad_norm": 1.4497008625954297, + "grad_norm": 1.4159584582268367, "learning_rate": 4.9935535052328e-06, - "loss": 0.2285, + "loss": 0.2284, "step": 3658 }, { "epoch": 0.500547195622435, - "grad_norm": 1.2981106012507473, + "grad_norm": 1.30350366142358, "learning_rate": 4.991404675495875e-06, - "loss": 0.1981, + "loss": 0.198, "step": 3659 }, { "epoch": 0.5006839945280438, - "grad_norm": 1.0240205074101827, + "grad_norm": 1.0316630644516542, "learning_rate": 4.9892558473464994e-06, - "loss": 0.1269, + "loss": 0.1297, "step": 3660 }, { "epoch": 0.5008207934336525, - "grad_norm": 1.2301728063523434, + "grad_norm": 1.2084082686413438, "learning_rate": 4.9871070211815615e-06, - "loss": 0.1615, + "loss": 0.1611, "step": 3661 }, { "epoch": 0.5009575923392613, - "grad_norm": 1.1542724183929958, + "grad_norm": 1.1471435686341942, "learning_rate": 4.984958197397944e-06, - "loss": 0.1793, + "loss": 0.1778, "step": 3662 }, { "epoch": 0.5010943912448701, - "grad_norm": 1.2416588165054208, + "grad_norm": 1.2343042841380623, "learning_rate": 4.982809376392535e-06, - "loss": 0.1812, + "loss": 0.1811, "step": 3663 }, { "epoch": 0.5012311901504788, - "grad_norm": 1.2597232627757613, + "grad_norm": 1.2588906636943604, "learning_rate": 4.980660558562222e-06, - "loss": 0.1952, + "loss": 0.1946, "step": 3664 }, { "epoch": 0.5013679890560876, - "grad_norm": 1.0366923548198446, + "grad_norm": 1.0358635661599624, "learning_rate": 4.9785117443038876e-06, - "loss": 0.1402, + "loss": 0.1418, "step": 3665 }, { "epoch": 0.5015047879616963, - "grad_norm": 1.375591906503857, + "grad_norm": 1.372587417964486, "learning_rate": 4.9763629340144156e-06, - "loss": 0.2107, + "loss": 0.2117, "step": 3666 }, { "epoch": 0.5016415868673051, - "grad_norm": 1.4159431276860033, + "grad_norm": 1.3915738567730902, "learning_rate": 4.974214128090691e-06, - "loss": 0.2081, + "loss": 0.2048, "step": 3667 }, { "epoch": 0.5017783857729138, - "grad_norm": 1.3032732554258764, + "grad_norm": 1.2765900234777574, "learning_rate": 4.9720653269295975e-06, - "loss": 0.2028, + "loss": 0.202, "step": 3668 }, { "epoch": 0.5019151846785226, - "grad_norm": 1.2100367893544093, + "grad_norm": 1.2033296869110897, "learning_rate": 4.969916530928014e-06, - "loss": 0.1848, + "loss": 0.186, "step": 3669 }, { "epoch": 0.5020519835841313, - "grad_norm": 1.0266863307584428, + "grad_norm": 1.003425687517025, "learning_rate": 4.967767740482824e-06, - "loss": 0.1633, + "loss": 0.1599, "step": 3670 }, { "epoch": 0.5021887824897401, - "grad_norm": 1.3510095586462907, + "grad_norm": 1.3314116038954487, "learning_rate": 4.965618955990904e-06, - "loss": 0.1933, + "loss": 0.1909, "step": 3671 }, { "epoch": 0.5023255813953489, - "grad_norm": 1.376217003574964, + "grad_norm": 1.3784043002263617, "learning_rate": 4.963470177849135e-06, - "loss": 0.1807, + "loss": 0.1787, "step": 3672 }, { "epoch": 0.5024623803009576, - "grad_norm": 1.3101576411863616, + "grad_norm": 1.3125884672206658, "learning_rate": 4.961321406454396e-06, - "loss": 0.1892, + "loss": 0.193, "step": 3673 }, { "epoch": 0.5025991792065664, - "grad_norm": 1.2624901319396404, + "grad_norm": 1.2305932815361516, "learning_rate": 4.959172642203561e-06, - "loss": 0.1733, + "loss": 0.1709, "step": 3674 }, { "epoch": 0.5027359781121751, - "grad_norm": 1.4298908580543637, + "grad_norm": 1.4197353238530965, "learning_rate": 4.957023885493506e-06, - "loss": 0.2118, + "loss": 0.2124, "step": 3675 }, { "epoch": 0.5028727770177839, - "grad_norm": 1.1142009114161444, + "grad_norm": 1.109797371040299, "learning_rate": 4.954875136721104e-06, "loss": 0.1689, "step": 3676 }, { "epoch": 0.5030095759233926, - "grad_norm": 1.1357343981548822, + "grad_norm": 1.1132140129600718, "learning_rate": 4.95272639628323e-06, - "loss": 0.1676, + "loss": 0.1669, "step": 3677 }, { "epoch": 0.5031463748290014, - "grad_norm": 1.1590007152377717, + "grad_norm": 1.1486180044346384, "learning_rate": 4.950577664576749e-06, - "loss": 0.184, + "loss": 0.1841, "step": 3678 }, { "epoch": 0.5032831737346102, - "grad_norm": 1.1761764134823798, + "grad_norm": 1.1471516036496863, "learning_rate": 4.948428941998534e-06, - "loss": 0.1832, + "loss": 0.1813, "step": 3679 }, { "epoch": 0.5034199726402189, - "grad_norm": 1.1972693316089995, + "grad_norm": 1.18751142007399, "learning_rate": 4.946280228945453e-06, - "loss": 0.183, + "loss": 0.1843, "step": 3680 }, { "epoch": 0.5035567715458277, - "grad_norm": 1.0315624778703139, + "grad_norm": 1.0174267935779144, "learning_rate": 4.944131525814368e-06, - "loss": 0.1523, + "loss": 0.152, "step": 3681 }, { "epoch": 0.5036935704514364, - "grad_norm": 1.1455257476521257, + "grad_norm": 1.127829686799137, "learning_rate": 4.9419828330021465e-06, - "loss": 0.1681, + "loss": 0.1677, "step": 3682 }, { "epoch": 0.5038303693570452, - "grad_norm": 1.2854543521585988, + "grad_norm": 1.2632155176998487, "learning_rate": 4.939834150905647e-06, - "loss": 0.1799, + "loss": 0.1805, "step": 3683 }, { "epoch": 0.5039671682626539, - "grad_norm": 1.779363652263885, + "grad_norm": 1.7720306649753041, "learning_rate": 4.9376854799217325e-06, - "loss": 0.2545, + "loss": 0.255, "step": 3684 }, { "epoch": 0.5041039671682627, - "grad_norm": 1.0654782183762603, + "grad_norm": 1.046346882264357, "learning_rate": 4.935536820447258e-06, - "loss": 0.1597, + "loss": 0.1595, "step": 3685 }, { "epoch": 0.5042407660738714, - "grad_norm": 1.4965599562103786, + "grad_norm": 1.4761132046327097, "learning_rate": 4.933388172879083e-06, - "loss": 0.2045, + "loss": 0.2053, "step": 3686 }, { "epoch": 0.5043775649794802, - "grad_norm": 1.0817026240482412, + "grad_norm": 1.069524419314553, "learning_rate": 4.931239537614055e-06, - "loss": 0.1648, + "loss": 0.1655, "step": 3687 }, { "epoch": 0.504514363885089, - "grad_norm": 1.4127930719970467, + "grad_norm": 1.3888606456642325, "learning_rate": 4.929090915049029e-06, - "loss": 0.1923, + "loss": 0.1918, "step": 3688 }, { "epoch": 0.5046511627906977, - "grad_norm": 1.067868529350199, + "grad_norm": 1.0516507938791335, "learning_rate": 4.926942305580854e-06, - "loss": 0.1444, + "loss": 0.145, "step": 3689 }, { "epoch": 0.5047879616963065, - "grad_norm": 1.323563737195012, + "grad_norm": 1.3155267805684032, "learning_rate": 4.9247937096063754e-06, - "loss": 0.1925, + "loss": 0.1936, "step": 3690 }, { "epoch": 0.5049247606019152, - "grad_norm": 1.1373560412217547, + "grad_norm": 1.1202538457686526, "learning_rate": 4.92264512752244e-06, - "loss": 0.1809, + "loss": 0.1803, "step": 3691 }, { "epoch": 0.505061559507524, - "grad_norm": 0.9308369312965743, + "grad_norm": 0.9446746662716277, "learning_rate": 4.920496559725884e-06, - "loss": 0.1586, + "loss": 0.1592, "step": 3692 }, { "epoch": 0.5051983584131327, - "grad_norm": 1.2045611927158186, + "grad_norm": 1.167770579878093, "learning_rate": 4.918348006613551e-06, - "loss": 0.1817, + "loss": 0.1806, "step": 3693 }, { "epoch": 0.5053351573187415, - "grad_norm": 1.3603288416904327, + "grad_norm": 1.384993619637206, "learning_rate": 4.916199468582272e-06, - "loss": 0.1972, + "loss": 0.1989, "step": 3694 }, { "epoch": 0.5054719562243503, - "grad_norm": 1.3631496694911567, + "grad_norm": 1.3493107682825807, "learning_rate": 4.914050946028884e-06, - "loss": 0.1984, + "loss": 0.1972, "step": 3695 }, { "epoch": 0.505608755129959, - "grad_norm": 1.2120740960359286, + "grad_norm": 1.1973352593240985, "learning_rate": 4.911902439350217e-06, - "loss": 0.1828, + "loss": 0.1804, "step": 3696 }, { "epoch": 0.5057455540355678, - "grad_norm": 1.312194536657953, + "grad_norm": 1.3010041122040326, "learning_rate": 4.909753948943097e-06, - "loss": 0.2215, + "loss": 0.2222, "step": 3697 }, { "epoch": 0.5058823529411764, - "grad_norm": 1.3216150001740767, + "grad_norm": 1.3092465664526056, "learning_rate": 4.907605475204352e-06, - "loss": 0.2069, + "loss": 0.2074, "step": 3698 }, { "epoch": 0.5060191518467853, - "grad_norm": 1.1943706885177634, + "grad_norm": 1.18065569208671, "learning_rate": 4.905457018530798e-06, - "loss": 0.1706, + "loss": 0.1705, "step": 3699 }, { "epoch": 0.506155950752394, - "grad_norm": 1.133115227931803, + "grad_norm": 1.1512307570878413, "learning_rate": 4.903308579319258e-06, - "loss": 0.1601, + "loss": 0.1645, "step": 3700 }, { "epoch": 0.506155950752394, - "eval_loss": 0.1868218332529068, - "eval_runtime": 5.892, - "eval_samples_per_second": 5.092, - "eval_steps_per_second": 1.358, + "eval_loss": 0.18661421537399292, + "eval_runtime": 5.9157, + "eval_samples_per_second": 5.071, + "eval_steps_per_second": 1.352, "step": 3700 }, { "epoch": 0.5062927496580027, - "grad_norm": 1.0810221533866073, + "grad_norm": 1.075015131313996, "learning_rate": 4.901160157966542e-06, - "loss": 0.1426, + "loss": 0.1432, "step": 3701 }, { "epoch": 0.5064295485636114, - "grad_norm": 1.5800257700314777, + "grad_norm": 1.5576683613439206, "learning_rate": 4.899011754869466e-06, - "loss": 0.1952, + "loss": 0.1933, "step": 3702 }, { "epoch": 0.5065663474692202, - "grad_norm": 1.3647508138343503, + "grad_norm": 1.3361196512827878, "learning_rate": 4.896863370424836e-06, "loss": 0.22, "step": 3703 }, { "epoch": 0.506703146374829, - "grad_norm": 1.349186476587248, + "grad_norm": 1.3368766630819244, "learning_rate": 4.894715005029459e-06, - "loss": 0.2154, + "loss": 0.2161, "step": 3704 }, { "epoch": 0.5068399452804377, - "grad_norm": 1.3912638310794796, + "grad_norm": 1.390554531176618, "learning_rate": 4.892566659080134e-06, - "loss": 0.2589, + "loss": 0.2626, "step": 3705 }, { "epoch": 0.5069767441860465, - "grad_norm": 2.1207911337338987, + "grad_norm": 1.516876970242321, "learning_rate": 4.89041833297366e-06, - "loss": 0.2789, + "loss": 0.2722, "step": 3706 }, { "epoch": 0.5071135430916552, - "grad_norm": 1.1583633299388587, + "grad_norm": 1.1539821337831235, "learning_rate": 4.888270027106832e-06, - "loss": 0.1881, + "loss": 0.1886, "step": 3707 }, { "epoch": 0.507250341997264, - "grad_norm": 1.353992218866261, + "grad_norm": 1.3351670996965634, "learning_rate": 4.886121741876438e-06, - "loss": 0.1919, + "loss": 0.1883, "step": 3708 }, { "epoch": 0.5073871409028727, - "grad_norm": 1.2739907965662574, + "grad_norm": 1.2566910404218126, "learning_rate": 4.883973477679266e-06, - "loss": 0.1951, + "loss": 0.1959, "step": 3709 }, { "epoch": 0.5075239398084815, - "grad_norm": 1.1994326425047153, + "grad_norm": 1.2056013751442032, "learning_rate": 4.881825234912099e-06, - "loss": 0.1938, + "loss": 0.1957, "step": 3710 }, { "epoch": 0.5076607387140903, - "grad_norm": 1.2248717657990131, + "grad_norm": 1.2163100531365532, "learning_rate": 4.879677013971713e-06, - "loss": 0.1919, + "loss": 0.1927, "step": 3711 }, { "epoch": 0.507797537619699, - "grad_norm": 1.0729480353052678, + "grad_norm": 1.063269286188066, "learning_rate": 4.877528815254887e-06, - "loss": 0.176, + "loss": 0.1748, "step": 3712 }, { "epoch": 0.5079343365253078, - "grad_norm": 1.149277340958679, + "grad_norm": 1.1198853088483056, "learning_rate": 4.875380639158387e-06, - "loss": 0.1633, + "loss": 0.1627, "step": 3713 }, { "epoch": 0.5080711354309165, - "grad_norm": 1.0962488586080226, + "grad_norm": 1.0901958445070816, "learning_rate": 4.8732324860789855e-06, - "loss": 0.177, + "loss": 0.1774, "step": 3714 }, { "epoch": 0.5082079343365253, - "grad_norm": 1.364395611270344, + "grad_norm": 1.3208009388318995, "learning_rate": 4.871084356413438e-06, - "loss": 0.2097, + "loss": 0.1985, "step": 3715 }, { "epoch": 0.508344733242134, - "grad_norm": 1.327346059301053, + "grad_norm": 1.299130410129059, "learning_rate": 4.8689362505585065e-06, - "loss": 0.1923, + "loss": 0.1955, "step": 3716 }, { "epoch": 0.5084815321477428, - "grad_norm": 1.0691468783073033, + "grad_norm": 1.0442544387443202, "learning_rate": 4.866788168910942e-06, - "loss": 0.1486, + "loss": 0.1479, "step": 3717 }, { "epoch": 0.5086183310533515, - "grad_norm": 1.4165200901670265, + "grad_norm": 1.3911393598877664, "learning_rate": 4.864640111867494e-06, - "loss": 0.2214, + "loss": 0.2206, "step": 3718 }, { "epoch": 0.5087551299589603, - "grad_norm": 0.9653914652765522, + "grad_norm": 0.9440891821815581, "learning_rate": 4.862492079824908e-06, - "loss": 0.1543, + "loss": 0.1528, "step": 3719 }, { "epoch": 0.5088919288645691, - "grad_norm": 1.1884911691304818, + "grad_norm": 1.1667195377459922, "learning_rate": 4.860344073179922e-06, - "loss": 0.1705, + "loss": 0.1701, "step": 3720 }, { "epoch": 0.5090287277701778, - "grad_norm": 1.1419611132011254, + "grad_norm": 1.1329048282798015, "learning_rate": 4.858196092329274e-06, - "loss": 0.1766, + "loss": 0.1757, "step": 3721 }, { "epoch": 0.5091655266757866, - "grad_norm": 1.3587160587605016, + "grad_norm": 1.3265033069441656, "learning_rate": 4.856048137669691e-06, - "loss": 0.2071, + "loss": 0.2058, "step": 3722 }, { "epoch": 0.5093023255813953, - "grad_norm": 1.0944867256247368, + "grad_norm": 1.0800635723997882, "learning_rate": 4.853900209597902e-06, - "loss": 0.1649, + "loss": 0.1656, "step": 3723 }, { "epoch": 0.5094391244870041, - "grad_norm": 1.254409606216617, + "grad_norm": 1.242934839833238, "learning_rate": 4.851752308510624e-06, - "loss": 0.1887, + "loss": 0.1879, "step": 3724 }, { "epoch": 0.5095759233926128, - "grad_norm": 1.1706516352191942, + "grad_norm": 1.147875094142848, "learning_rate": 4.849604434804575e-06, - "loss": 0.155, + "loss": 0.1539, "step": 3725 }, { "epoch": 0.5097127222982216, - "grad_norm": 1.2788605308191292, + "grad_norm": 1.2413518078043235, "learning_rate": 4.847456588876463e-06, - "loss": 0.1789, + "loss": 0.1778, "step": 3726 }, { "epoch": 0.5098495212038304, - "grad_norm": 1.2990054758601506, + "grad_norm": 1.279205730265485, "learning_rate": 4.845308771122996e-06, - "loss": 0.2134, + "loss": 0.2132, "step": 3727 }, { "epoch": 0.5099863201094391, - "grad_norm": 1.2797945933192785, + "grad_norm": 1.2567657141083375, "learning_rate": 4.843160981940875e-06, - "loss": 0.1982, + "loss": 0.1963, "step": 3728 }, { "epoch": 0.5101231190150479, - "grad_norm": 1.351354643950698, + "grad_norm": 1.3325681778626555, "learning_rate": 4.841013221726791e-06, - "loss": 0.1956, + "loss": 0.1962, "step": 3729 }, { "epoch": 0.5102599179206566, - "grad_norm": 0.9854317221717903, + "grad_norm": 0.9479288803022057, "learning_rate": 4.8388654908774375e-06, - "loss": 0.1387, + "loss": 0.1373, "step": 3730 }, { "epoch": 0.5103967168262654, - "grad_norm": 1.2271582417223899, + "grad_norm": 1.2046848187470982, "learning_rate": 4.8367177897894955e-06, - "loss": 0.1677, + "loss": 0.1645, "step": 3731 }, { "epoch": 0.5105335157318741, - "grad_norm": 1.4716857458612846, + "grad_norm": 1.4578126252092527, "learning_rate": 4.834570118859646e-06, - "loss": 0.2051, + "loss": 0.2055, "step": 3732 }, { "epoch": 0.5106703146374829, - "grad_norm": 1.3404112882873749, + "grad_norm": 1.3259209351067036, "learning_rate": 4.832422478484559e-06, - "loss": 0.2024, + "loss": 0.2029, "step": 3733 }, { "epoch": 0.5108071135430916, - "grad_norm": 1.6088594827351033, + "grad_norm": 1.568068297876996, "learning_rate": 4.830274869060907e-06, - "loss": 0.2416, + "loss": 0.2398, "step": 3734 }, { "epoch": 0.5109439124487004, - "grad_norm": 0.9821561101569357, + "grad_norm": 0.9548789359733137, "learning_rate": 4.828127290985347e-06, - "loss": 0.1787, + "loss": 0.1752, "step": 3735 }, { "epoch": 0.5110807113543092, - "grad_norm": 1.1478838741549333, + "grad_norm": 1.1496523621635253, "learning_rate": 4.8259797446545365e-06, - "loss": 0.1885, + "loss": 0.1898, "step": 3736 }, { "epoch": 0.5112175102599179, - "grad_norm": 1.5392130121777468, + "grad_norm": 1.5103706368212388, "learning_rate": 4.8238322304651275e-06, - "loss": 0.2386, + "loss": 0.2379, "step": 3737 }, { "epoch": 0.5113543091655267, - "grad_norm": 1.112909502210732, + "grad_norm": 1.1025922030041617, "learning_rate": 4.821684748813761e-06, - "loss": 0.1719, + "loss": 0.173, "step": 3738 }, { "epoch": 0.5114911080711354, - "grad_norm": 1.122618570140562, + "grad_norm": 1.116496582525125, "learning_rate": 4.819537300097077e-06, - "loss": 0.1456, + "loss": 0.1465, "step": 3739 }, { "epoch": 0.5116279069767442, - "grad_norm": 1.2023591906616773, + "grad_norm": 1.1921530998313998, "learning_rate": 4.817389884711706e-06, "loss": 0.1831, "step": 3740 }, { "epoch": 0.5117647058823529, - "grad_norm": 1.1130796380924237, + "grad_norm": 1.0871192833701282, "learning_rate": 4.815242503054277e-06, - "loss": 0.1684, + "loss": 0.1671, "step": 3741 }, { "epoch": 0.5119015047879617, - "grad_norm": 1.3915105848062586, + "grad_norm": 1.3688090061034137, "learning_rate": 4.813095155521405e-06, - "loss": 0.2365, + "loss": 0.2366, "step": 3742 }, { "epoch": 0.5120383036935705, - "grad_norm": 1.0442092841354433, + "grad_norm": 1.1343140810560681, "learning_rate": 4.810947842509707e-06, - "loss": 0.1627, + "loss": 0.1626, "step": 3743 }, { "epoch": 0.5121751025991792, - "grad_norm": 1.3199432185707385, + "grad_norm": 1.3056261780284395, "learning_rate": 4.80880056441579e-06, - "loss": 0.202, + "loss": 0.2005, "step": 3744 }, { "epoch": 0.512311901504788, - "grad_norm": 1.4490311305690347, + "grad_norm": 1.4215615979029974, "learning_rate": 4.806653321636252e-06, - "loss": 0.234, + "loss": 0.236, "step": 3745 }, { "epoch": 0.5124487004103967, - "grad_norm": 1.2155330061328669, + "grad_norm": 1.1830961490434377, "learning_rate": 4.804506114567689e-06, - "loss": 0.185, + "loss": 0.1846, "step": 3746 }, { "epoch": 0.5125854993160055, - "grad_norm": 1.1997336073477245, + "grad_norm": 1.1996831379651456, "learning_rate": 4.802358943606687e-06, - "loss": 0.1887, + "loss": 0.1881, "step": 3747 }, { "epoch": 0.5127222982216142, - "grad_norm": 1.5518562748873472, + "grad_norm": 1.5230105601351709, "learning_rate": 4.800211809149829e-06, - "loss": 0.1822, + "loss": 0.1808, "step": 3748 }, { "epoch": 0.512859097127223, - "grad_norm": 1.2074475580580686, + "grad_norm": 1.18208726553718, "learning_rate": 4.798064711593686e-06, - "loss": 0.171, + "loss": 0.1695, "step": 3749 }, { "epoch": 0.5129958960328317, - "grad_norm": 1.0668120717061775, + "grad_norm": 1.040855105232108, "learning_rate": 4.795917651334829e-06, - "loss": 0.1793, + "loss": 0.1785, "step": 3750 }, { "epoch": 0.5131326949384405, - "grad_norm": 1.2511988970450298, + "grad_norm": 1.252147422035739, "learning_rate": 4.793770628769813e-06, - "loss": 0.2004, + "loss": 0.2012, "step": 3751 }, { "epoch": 0.5132694938440493, - "grad_norm": 1.079858376986981, + "grad_norm": 1.0692885472583777, "learning_rate": 4.791623644295195e-06, - "loss": 0.1722, + "loss": 0.1708, "step": 3752 }, { "epoch": 0.513406292749658, - "grad_norm": 1.5899811421627594, + "grad_norm": 1.5594530350478213, "learning_rate": 4.7894766983075225e-06, - "loss": 0.2124, + "loss": 0.2105, "step": 3753 }, { "epoch": 0.5135430916552668, - "grad_norm": 1.0735889164313077, + "grad_norm": 1.0648663763902972, "learning_rate": 4.78732979120333e-06, - "loss": 0.1571, + "loss": 0.1559, "step": 3754 }, { "epoch": 0.5136798905608755, - "grad_norm": 1.4044222773298671, + "grad_norm": 1.3821614006538545, "learning_rate": 4.785182923379154e-06, - "loss": 0.1861, + "loss": 0.1859, "step": 3755 }, { "epoch": 0.5138166894664843, - "grad_norm": 1.3855877040721867, + "grad_norm": 1.4475937322192576, "learning_rate": 4.783036095231516e-06, - "loss": 0.1853, + "loss": 0.1857, "step": 3756 }, { "epoch": 0.513953488372093, - "grad_norm": 1.2334736490229865, + "grad_norm": 1.223368687456814, "learning_rate": 4.780889307156938e-06, - "loss": 0.1885, + "loss": 0.1898, "step": 3757 }, { "epoch": 0.5140902872777018, - "grad_norm": 1.4412288898248657, + "grad_norm": 1.429950016842877, "learning_rate": 4.7787425595519245e-06, - "loss": 0.1989, + "loss": 0.1975, "step": 3758 }, { "epoch": 0.5142270861833106, - "grad_norm": 1.192011581885965, + "grad_norm": 1.180381436018353, "learning_rate": 4.77659585281298e-06, - "loss": 0.1801, + "loss": 0.1812, "step": 3759 }, { "epoch": 0.5143638850889193, - "grad_norm": 1.2846202607790072, + "grad_norm": 1.236040020127443, "learning_rate": 4.774449187336603e-06, - "loss": 0.2087, + "loss": 0.2068, "step": 3760 }, { "epoch": 0.5145006839945281, - "grad_norm": 1.570666012816418, + "grad_norm": 1.542030283307262, "learning_rate": 4.772302563519274e-06, - "loss": 0.2309, + "loss": 0.2292, "step": 3761 }, { "epoch": 0.5146374829001368, - "grad_norm": 1.29742952179723, + "grad_norm": 1.2879456221773629, "learning_rate": 4.770155981757479e-06, - "loss": 0.2088, + "loss": 0.2084, "step": 3762 }, { "epoch": 0.5147742818057456, - "grad_norm": 1.0593505448249108, + "grad_norm": 1.038625553784798, "learning_rate": 4.768009442447685e-06, - "loss": 0.1696, + "loss": 0.1687, "step": 3763 }, { "epoch": 0.5149110807113543, - "grad_norm": 1.0868149216767595, + "grad_norm": 1.066641621874555, "learning_rate": 4.765862945986362e-06, - "loss": 0.1667, + "loss": 0.166, "step": 3764 }, { "epoch": 0.5150478796169631, - "grad_norm": 1.2771843516228374, + "grad_norm": 1.2461934338774516, "learning_rate": 4.76371649276996e-06, - "loss": 0.1833, + "loss": 0.1814, "step": 3765 }, { "epoch": 0.5151846785225718, - "grad_norm": 1.3029834181612743, + "grad_norm": 1.291558357909403, "learning_rate": 4.761570083194932e-06, - "loss": 0.1798, + "loss": 0.1796, "step": 3766 }, { "epoch": 0.5153214774281806, - "grad_norm": 1.3153590186818471, + "grad_norm": 1.294736066145996, "learning_rate": 4.759423717657713e-06, - "loss": 0.1859, + "loss": 0.184, "step": 3767 }, { "epoch": 0.5154582763337894, - "grad_norm": 1.1069778787028797, + "grad_norm": 1.073335235735289, "learning_rate": 4.757277396554739e-06, - "loss": 0.1525, + "loss": 0.1517, "step": 3768 }, { "epoch": 0.5155950752393981, - "grad_norm": 1.3478017336616892, + "grad_norm": 1.3124700572525652, "learning_rate": 4.755131120282433e-06, - "loss": 0.1898, + "loss": 0.1902, "step": 3769 }, { "epoch": 0.5157318741450069, - "grad_norm": 1.235405556815796, + "grad_norm": 1.2186486680186284, "learning_rate": 4.752984889237209e-06, - "loss": 0.1774, + "loss": 0.1771, "step": 3770 }, { "epoch": 0.5158686730506156, - "grad_norm": 1.1332854238200725, + "grad_norm": 1.1370820310489682, "learning_rate": 4.750838703815477e-06, - "loss": 0.1984, + "loss": 0.2003, "step": 3771 }, { "epoch": 0.5160054719562244, - "grad_norm": 1.3404244362149453, + "grad_norm": 1.3007387884004418, "learning_rate": 4.748692564413633e-06, - "loss": 0.1939, + "loss": 0.1923, "step": 3772 }, { "epoch": 0.5161422708618331, - "grad_norm": 1.2539830539755794, + "grad_norm": 1.2542936110168579, "learning_rate": 4.746546471428069e-06, - "loss": 0.1715, + "loss": 0.171, "step": 3773 }, { "epoch": 0.5162790697674419, - "grad_norm": 1.363677935506472, + "grad_norm": 1.35645218065452, "learning_rate": 4.744400425255165e-06, - "loss": 0.1906, + "loss": 0.1911, "step": 3774 }, { "epoch": 0.5164158686730507, - "grad_norm": 1.385272677355931, + "grad_norm": 1.3628177595051258, "learning_rate": 4.742254426291294e-06, - "loss": 0.2166, + "loss": 0.2145, "step": 3775 }, { "epoch": 0.5165526675786594, - "grad_norm": 1.2223903537090057, + "grad_norm": 1.2023755254612083, "learning_rate": 4.740108474932822e-06, - "loss": 0.1815, + "loss": 0.1825, "step": 3776 }, { "epoch": 0.5166894664842682, - "grad_norm": 1.1875019716892445, + "grad_norm": 1.1615083388795244, "learning_rate": 4.737962571576104e-06, - "loss": 0.1624, + "loss": 0.1617, "step": 3777 }, { "epoch": 0.5168262653898769, - "grad_norm": 1.091923437442557, + "grad_norm": 1.0730269616193298, "learning_rate": 4.735816716617486e-06, - "loss": 0.1852, + "loss": 0.1832, "step": 3778 }, { "epoch": 0.5169630642954857, - "grad_norm": 1.5765408415337911, + "grad_norm": 1.547364209051581, "learning_rate": 4.733670910453304e-06, - "loss": 0.2306, + "loss": 0.2282, "step": 3779 }, { "epoch": 0.5170998632010944, - "grad_norm": 1.1092161453432376, + "grad_norm": 1.0954273630183176, "learning_rate": 4.731525153479891e-06, - "loss": 0.1828, + "loss": 0.1824, "step": 3780 }, { "epoch": 0.5172366621067032, - "grad_norm": 1.1877191902963402, + "grad_norm": 1.1718640820337893, "learning_rate": 4.729379446093562e-06, - "loss": 0.1819, + "loss": 0.1845, "step": 3781 }, { "epoch": 0.5173734610123119, - "grad_norm": 0.9162297890760606, + "grad_norm": 0.8907432268256701, "learning_rate": 4.727233788690631e-06, - "loss": 0.1367, + "loss": 0.1354, "step": 3782 }, { "epoch": 0.5175102599179207, - "grad_norm": 1.2982265826867347, + "grad_norm": 1.2559841239474383, "learning_rate": 4.725088181667397e-06, - "loss": 0.2065, + "loss": 0.2077, "step": 3783 }, { "epoch": 0.5176470588235295, - "grad_norm": 1.30240312028828, + "grad_norm": 1.2779913552535138, "learning_rate": 4.7229426254201504e-06, - "loss": 0.1866, + "loss": 0.1847, "step": 3784 }, { "epoch": 0.5177838577291382, - "grad_norm": 1.356012001867754, + "grad_norm": 1.338939425552428, "learning_rate": 4.720797120345178e-06, - "loss": 0.2071, + "loss": 0.2061, "step": 3785 }, { "epoch": 0.517920656634747, - "grad_norm": 0.906254482640454, + "grad_norm": 0.9002364635474122, "learning_rate": 4.71865166683875e-06, - "loss": 0.15, + "loss": 0.1497, "step": 3786 }, { "epoch": 0.5180574555403556, - "grad_norm": 1.342626944104748, + "grad_norm": 1.320957500344943, "learning_rate": 4.716506265297131e-06, - "loss": 0.1878, + "loss": 0.1866, "step": 3787 }, { "epoch": 0.5181942544459645, - "grad_norm": 1.337612593383367, + "grad_norm": 1.3276828422065763, "learning_rate": 4.714360916116574e-06, - "loss": 0.2008, + "loss": 0.2018, "step": 3788 }, { "epoch": 0.5183310533515731, - "grad_norm": 1.0644775559812814, + "grad_norm": 1.0777987578846033, "learning_rate": 4.712215619693325e-06, - "loss": 0.1636, + "loss": 0.166, "step": 3789 }, { "epoch": 0.518467852257182, - "grad_norm": 1.1487920290380909, + "grad_norm": 1.1213695149716638, "learning_rate": 4.710070376423615e-06, - "loss": 0.1695, + "loss": 0.1698, "step": 3790 }, { "epoch": 0.5186046511627908, - "grad_norm": 1.0404679962652543, + "grad_norm": 1.0277835738369174, "learning_rate": 4.707925186703671e-06, - "loss": 0.1862, + "loss": 0.1868, "step": 3791 }, { "epoch": 0.5187414500683994, - "grad_norm": 1.1770144024116842, + "grad_norm": 1.1646573956666335, "learning_rate": 4.705780050929709e-06, - "loss": 0.1638, + "loss": 0.1639, "step": 3792 }, { "epoch": 0.5188782489740082, - "grad_norm": 1.0756639687910674, + "grad_norm": 1.0649195267024134, "learning_rate": 4.70363496949793e-06, - "loss": 0.1796, + "loss": 0.1771, "step": 3793 }, { "epoch": 0.5190150478796169, - "grad_norm": 1.2874034249911934, + "grad_norm": 1.2780303179136043, "learning_rate": 4.7014899428045356e-06, - "loss": 0.1878, + "loss": 0.1887, "step": 3794 }, { "epoch": 0.5191518467852257, - "grad_norm": 1.5120181635607042, + "grad_norm": 1.4787362142619447, "learning_rate": 4.699344971245703e-06, - "loss": 0.1884, + "loss": 0.1877, "step": 3795 }, { "epoch": 0.5192886456908344, - "grad_norm": 1.3044831214463941, + "grad_norm": 1.2715817505301534, "learning_rate": 4.6972000552176125e-06, - "loss": 0.2021, + "loss": 0.201, "step": 3796 }, { "epoch": 0.5194254445964432, - "grad_norm": 1.1238632551199867, + "grad_norm": 1.1195375927977727, "learning_rate": 4.695055195116423e-06, - "loss": 0.1748, + "loss": 0.1746, "step": 3797 }, { "epoch": 0.5195622435020519, - "grad_norm": 1.1883804171688253, + "grad_norm": 1.1664883576621758, "learning_rate": 4.692910391338292e-06, - "loss": 0.1827, + "loss": 0.1839, "step": 3798 }, { "epoch": 0.5196990424076607, - "grad_norm": 1.3947273494320245, + "grad_norm": 1.359040495300216, "learning_rate": 4.690765644279362e-06, - "loss": 0.1819, + "loss": 0.1791, "step": 3799 }, { "epoch": 0.5198358413132695, - "grad_norm": 1.159118618160745, + "grad_norm": 1.1353919601642681, "learning_rate": 4.688620954335766e-06, - "loss": 0.2017, + "loss": 0.2001, "step": 3800 }, { "epoch": 0.5198358413132695, - "eval_loss": 0.18603971600532532, - "eval_runtime": 5.9275, - "eval_samples_per_second": 5.061, - "eval_steps_per_second": 1.35, + "eval_loss": 0.18592852354049683, + "eval_runtime": 5.9432, + "eval_samples_per_second": 5.048, + "eval_steps_per_second": 1.346, "step": 3800 }, { "epoch": 0.5199726402188782, - "grad_norm": 1.3458314871390133, + "grad_norm": 1.2881476798074347, "learning_rate": 4.686476321903629e-06, - "loss": 0.2015, + "loss": 0.1986, "step": 3801 }, { "epoch": 0.520109439124487, - "grad_norm": 1.3129673920962228, + "grad_norm": 1.2770663297598763, "learning_rate": 4.684331747379058e-06, - "loss": 0.2034, + "loss": 0.1987, "step": 3802 }, { "epoch": 0.5202462380300957, - "grad_norm": 1.0905638058722371, + "grad_norm": 1.096836753173724, "learning_rate": 4.682187231158159e-06, - "loss": 0.1563, + "loss": 0.1574, "step": 3803 }, { "epoch": 0.5203830369357045, - "grad_norm": 1.3196714407644445, + "grad_norm": 1.3089522122556794, "learning_rate": 4.680042773637018e-06, - "loss": 0.1883, + "loss": 0.1868, "step": 3804 }, { "epoch": 0.5205198358413132, - "grad_norm": 1.2819623567085163, + "grad_norm": 1.254232411825048, "learning_rate": 4.677898375211718e-06, - "loss": 0.1834, + "loss": 0.1837, "step": 3805 }, { "epoch": 0.520656634746922, - "grad_norm": 1.4087296633269328, + "grad_norm": 1.3868811302303516, "learning_rate": 4.675754036278326e-06, - "loss": 0.198, + "loss": 0.1972, "step": 3806 }, { "epoch": 0.5207934336525308, - "grad_norm": 0.9567883849302762, + "grad_norm": 0.9433540489727005, "learning_rate": 4.6736097572328995e-06, - "loss": 0.1327, + "loss": 0.1318, "step": 3807 }, { "epoch": 0.5209302325581395, - "grad_norm": 1.2463303065812372, + "grad_norm": 1.223407232761153, "learning_rate": 4.671465538471487e-06, - "loss": 0.1936, + "loss": 0.1942, "step": 3808 }, { "epoch": 0.5210670314637483, - "grad_norm": 1.132145884788208, + "grad_norm": 1.1069560096914817, "learning_rate": 4.669321380390121e-06, - "loss": 0.1991, + "loss": 0.2011, "step": 3809 }, { "epoch": 0.521203830369357, - "grad_norm": 1.116328525035906, + "grad_norm": 1.1116733855524776, "learning_rate": 4.667177283384829e-06, - "loss": 0.1943, + "loss": 0.1934, "step": 3810 }, { "epoch": 0.5213406292749658, - "grad_norm": 1.229850197445949, + "grad_norm": 1.211332168160401, "learning_rate": 4.66503324785162e-06, - "loss": 0.1882, + "loss": 0.1908, "step": 3811 }, { "epoch": 0.5214774281805745, - "grad_norm": 1.4039123199641654, + "grad_norm": 1.3732576692423608, "learning_rate": 4.6628892741865e-06, - "loss": 0.212, + "loss": 0.2099, "step": 3812 }, { "epoch": 0.5216142270861833, - "grad_norm": 1.6883206116750353, + "grad_norm": 1.6885571307188825, "learning_rate": 4.660745362785456e-06, - "loss": 0.1972, + "loss": 0.1973, "step": 3813 }, { "epoch": 0.521751025991792, - "grad_norm": 1.375330886613214, + "grad_norm": 1.3667906898222335, "learning_rate": 4.658601514044469e-06, - "loss": 0.2006, + "loss": 0.2019, "step": 3814 }, { "epoch": 0.5218878248974008, - "grad_norm": 1.1622573253064894, + "grad_norm": 1.1662756928234825, "learning_rate": 4.656457728359504e-06, - "loss": 0.196, + "loss": 0.1968, "step": 3815 }, { "epoch": 0.5220246238030096, - "grad_norm": 1.346791339087593, + "grad_norm": 1.3318408109782311, "learning_rate": 4.654314006126516e-06, - "loss": 0.1985, + "loss": 0.1987, "step": 3816 }, { "epoch": 0.5221614227086183, - "grad_norm": 1.3051636678463057, + "grad_norm": 1.298259536024214, "learning_rate": 4.652170347741454e-06, "loss": 0.1735, "step": 3817 }, { "epoch": 0.5222982216142271, - "grad_norm": 1.2929225222619456, + "grad_norm": 1.2706281481790458, "learning_rate": 4.6500267536002434e-06, - "loss": 0.1872, + "loss": 0.188, "step": 3818 }, { "epoch": 0.5224350205198358, - "grad_norm": 1.2373141368608855, + "grad_norm": 1.2245568353195602, "learning_rate": 4.647883224098808e-06, - "loss": 0.1771, + "loss": 0.1761, "step": 3819 }, { "epoch": 0.5225718194254446, - "grad_norm": 1.2984147016177123, + "grad_norm": 1.2663183859067664, "learning_rate": 4.645739759633054e-06, - "loss": 0.1723, + "loss": 0.1709, "step": 3820 }, { "epoch": 0.5227086183310533, - "grad_norm": 1.2467589122696168, + "grad_norm": 1.2160829769598425, "learning_rate": 4.64359636059888e-06, - "loss": 0.1929, + "loss": 0.1922, "step": 3821 }, { "epoch": 0.5228454172366621, - "grad_norm": 1.2325252386471732, + "grad_norm": 1.2206766274057823, "learning_rate": 4.6414530273921664e-06, - "loss": 0.2085, + "loss": 0.2114, "step": 3822 }, { "epoch": 0.5229822161422709, - "grad_norm": 1.2232456910328857, + "grad_norm": 1.1744551996459371, "learning_rate": 4.639309760408788e-06, - "loss": 0.1975, + "loss": 0.1952, "step": 3823 }, { "epoch": 0.5231190150478796, - "grad_norm": 1.085472660198324, + "grad_norm": 1.0692966954168655, "learning_rate": 4.637166560044606e-06, - "loss": 0.1774, + "loss": 0.1753, "step": 3824 }, { "epoch": 0.5232558139534884, - "grad_norm": 1.2536657135976397, + "grad_norm": 1.2332795009826432, "learning_rate": 4.635023426695462e-06, - "loss": 0.1862, + "loss": 0.1866, "step": 3825 }, { "epoch": 0.5233926128590971, - "grad_norm": 1.289096437221674, + "grad_norm": 1.2674337675347678, "learning_rate": 4.632880360757198e-06, - "loss": 0.165, + "loss": 0.1642, "step": 3826 }, { "epoch": 0.5235294117647059, - "grad_norm": 1.2352878236200806, + "grad_norm": 1.2348535369586637, "learning_rate": 4.630737362625631e-06, - "loss": 0.204, + "loss": 0.2047, "step": 3827 }, { "epoch": 0.5236662106703146, - "grad_norm": 1.1965301541867976, + "grad_norm": 1.209456656578987, "learning_rate": 4.628594432696574e-06, - "loss": 0.1923, + "loss": 0.1922, "step": 3828 }, { "epoch": 0.5238030095759234, - "grad_norm": 1.6042306327314009, + "grad_norm": 1.5831265386921, "learning_rate": 4.626451571365822e-06, - "loss": 0.224, + "loss": 0.2247, "step": 3829 }, { "epoch": 0.5239398084815321, - "grad_norm": 1.0962052649684064, + "grad_norm": 1.088506183632758, "learning_rate": 4.624308779029164e-06, - "loss": 0.1787, + "loss": 0.1797, "step": 3830 }, { "epoch": 0.5240766073871409, - "grad_norm": 1.4800995338380227, + "grad_norm": 1.472858140536681, "learning_rate": 4.622166056082366e-06, - "loss": 0.168, + "loss": 0.1687, "step": 3831 }, { "epoch": 0.5242134062927497, - "grad_norm": 1.491825165321968, + "grad_norm": 1.4681691717094, "learning_rate": 4.620023402921191e-06, - "loss": 0.2286, + "loss": 0.228, "step": 3832 }, { "epoch": 0.5243502051983584, - "grad_norm": 1.07594128547538, + "grad_norm": 1.0676453441244982, "learning_rate": 4.617880819941387e-06, - "loss": 0.1665, + "loss": 0.1667, "step": 3833 }, { "epoch": 0.5244870041039672, - "grad_norm": 1.1626104162888646, + "grad_norm": 1.150721663763788, "learning_rate": 4.615738307538683e-06, "loss": 0.1845, "step": 3834 }, { "epoch": 0.5246238030095759, - "grad_norm": 1.3197418739941686, + "grad_norm": 1.2965949551467744, "learning_rate": 4.6135958661088024e-06, - "loss": 0.1929, + "loss": 0.1925, "step": 3835 }, { "epoch": 0.5247606019151847, - "grad_norm": 1.2287634313025662, + "grad_norm": 1.2132809527221984, "learning_rate": 4.61145349604745e-06, - "loss": 0.1881, + "loss": 0.1875, "step": 3836 }, { "epoch": 0.5248974008207934, - "grad_norm": 1.607010779933376, + "grad_norm": 1.594722592624889, "learning_rate": 4.609311197750324e-06, - "loss": 0.269, + "loss": 0.2719, "step": 3837 }, { "epoch": 0.5250341997264022, - "grad_norm": 1.2063005584359234, + "grad_norm": 1.2078140339551515, "learning_rate": 4.607168971613099e-06, - "loss": 0.1986, + "loss": 0.2007, "step": 3838 }, { "epoch": 0.525170998632011, - "grad_norm": 1.16128441488409, + "grad_norm": 1.1486152690467664, "learning_rate": 4.6050268180314466e-06, - "loss": 0.1876, + "loss": 0.1873, "step": 3839 }, { "epoch": 0.5253077975376197, - "grad_norm": 1.032900831772625, + "grad_norm": 1.0234382712693813, "learning_rate": 4.602884737401022e-06, - "loss": 0.1575, + "loss": 0.157, "step": 3840 }, { "epoch": 0.5254445964432285, - "grad_norm": 1.226733507048913, + "grad_norm": 1.21263641739131, "learning_rate": 4.600742730117461e-06, - "loss": 0.1769, + "loss": 0.1773, "step": 3841 }, { "epoch": 0.5255813953488372, - "grad_norm": 1.197475325275394, + "grad_norm": 1.1799009423517195, "learning_rate": 4.598600796576395e-06, - "loss": 0.1633, + "loss": 0.163, "step": 3842 }, { "epoch": 0.525718194254446, - "grad_norm": 1.3233837646175568, + "grad_norm": 1.300822247839317, "learning_rate": 4.596458937173435e-06, - "loss": 0.2121, + "loss": 0.2112, "step": 3843 }, { "epoch": 0.5258549931600547, - "grad_norm": 1.2162249602550872, + "grad_norm": 1.1888906980920382, "learning_rate": 4.594317152304183e-06, - "loss": 0.1792, + "loss": 0.1783, "step": 3844 }, { "epoch": 0.5259917920656635, - "grad_norm": 1.3877276307818205, + "grad_norm": 1.3647305888695425, "learning_rate": 4.59217544236422e-06, - "loss": 0.1905, + "loss": 0.1881, "step": 3845 }, { "epoch": 0.5261285909712722, - "grad_norm": 1.42167680523332, + "grad_norm": 1.3888780665396803, "learning_rate": 4.590033807749126e-06, - "loss": 0.1892, + "loss": 0.1888, "step": 3846 }, { "epoch": 0.526265389876881, - "grad_norm": 1.1989971777478339, + "grad_norm": 1.185531005218427, "learning_rate": 4.587892248854451e-06, - "loss": 0.1862, + "loss": 0.1868, "step": 3847 }, { "epoch": 0.5264021887824898, - "grad_norm": 1.1929921467953104, + "grad_norm": 1.180756014240003, "learning_rate": 4.585750766075743e-06, - "loss": 0.1663, + "loss": 0.1641, "step": 3848 }, { "epoch": 0.5265389876880985, - "grad_norm": 1.1559888493221722, + "grad_norm": 1.1177887722444269, "learning_rate": 4.583609359808534e-06, - "loss": 0.1508, + "loss": 0.1491, "step": 3849 }, { "epoch": 0.5266757865937073, - "grad_norm": 1.1755862534346286, + "grad_norm": 1.1734363771986633, "learning_rate": 4.581468030448336e-06, - "loss": 0.1753, + "loss": 0.177, "step": 3850 }, { "epoch": 0.526812585499316, - "grad_norm": 1.2445007539667006, + "grad_norm": 1.2321086454518277, "learning_rate": 4.579326778390653e-06, - "loss": 0.1929, + "loss": 0.1947, "step": 3851 }, { "epoch": 0.5269493844049248, - "grad_norm": 1.0401349605403132, + "grad_norm": 1.049545235182388, "learning_rate": 4.577185604030972e-06, - "loss": 0.1809, + "loss": 0.1807, "step": 3852 }, { "epoch": 0.5270861833105335, - "grad_norm": 1.1178302936454365, + "grad_norm": 1.1038384594764938, "learning_rate": 4.575044507764768e-06, - "loss": 0.1854, + "loss": 0.1848, "step": 3853 }, { "epoch": 0.5272229822161423, - "grad_norm": 1.1649368699601246, + "grad_norm": 1.1458847857360723, "learning_rate": 4.572903489987496e-06, - "loss": 0.1631, + "loss": 0.1633, "step": 3854 }, { "epoch": 0.5273597811217511, - "grad_norm": 1.252960322243594, + "grad_norm": 1.223413727342276, "learning_rate": 4.570762551094602e-06, - "loss": 0.1752, + "loss": 0.1765, "step": 3855 }, { "epoch": 0.5274965800273598, - "grad_norm": 1.1194411492549494, + "grad_norm": 1.1113477177664204, "learning_rate": 4.568621691481519e-06, - "loss": 0.2008, + "loss": 0.2006, "step": 3856 }, { "epoch": 0.5276333789329686, - "grad_norm": 1.2898525707790627, + "grad_norm": 1.269272711241423, "learning_rate": 4.566480911543657e-06, - "loss": 0.212, + "loss": 0.2129, "step": 3857 }, { "epoch": 0.5277701778385773, - "grad_norm": 1.1341028569106932, + "grad_norm": 1.1145813704611276, "learning_rate": 4.564340211676419e-06, - "loss": 0.1622, + "loss": 0.1608, "step": 3858 }, { "epoch": 0.5279069767441861, - "grad_norm": 1.5106038377076052, + "grad_norm": 1.521227958010964, "learning_rate": 4.562199592275189e-06, - "loss": 0.2209, + "loss": 0.2229, "step": 3859 }, { "epoch": 0.5280437756497948, - "grad_norm": 1.3910063751381285, + "grad_norm": 1.3783213967315904, "learning_rate": 4.56005905373534e-06, "loss": 0.1857, "step": 3860 }, { "epoch": 0.5281805745554036, - "grad_norm": 1.275891810724756, + "grad_norm": 1.2550861867841205, "learning_rate": 4.557918596452225e-06, - "loss": 0.2172, + "loss": 0.2195, "step": 3861 }, { "epoch": 0.5283173734610123, - "grad_norm": 1.1856169952306546, + "grad_norm": 1.1490137328253822, "learning_rate": 4.555778220821187e-06, - "loss": 0.166, + "loss": 0.1634, "step": 3862 }, { "epoch": 0.5284541723666211, - "grad_norm": 1.3711812509217935, + "grad_norm": 1.3397651777787978, "learning_rate": 4.5536379272375495e-06, - "loss": 0.2187, + "loss": 0.2188, "step": 3863 }, { "epoch": 0.5285909712722299, - "grad_norm": 0.9029085877301404, + "grad_norm": 0.8935567589243638, "learning_rate": 4.551497716096625e-06, - "loss": 0.1628, + "loss": 0.1623, "step": 3864 }, { "epoch": 0.5287277701778386, - "grad_norm": 1.1551479430439622, + "grad_norm": 1.1418752406039452, "learning_rate": 4.5493575877937076e-06, - "loss": 0.1698, + "loss": 0.1684, "step": 3865 }, { "epoch": 0.5288645690834474, - "grad_norm": 1.4571137600558255, + "grad_norm": 1.4287558631207704, "learning_rate": 4.5472175427240765e-06, - "loss": 0.2172, + "loss": 0.2164, "step": 3866 }, { "epoch": 0.5290013679890561, - "grad_norm": 1.1515330560325936, + "grad_norm": 1.1423655041647716, "learning_rate": 4.545077581283e-06, - "loss": 0.1656, + "loss": 0.1645, "step": 3867 }, { "epoch": 0.5291381668946649, - "grad_norm": 1.4017910247349845, + "grad_norm": 1.3918187808151252, "learning_rate": 4.542937703865722e-06, "loss": 0.2196, "step": 3868 }, { "epoch": 0.5292749658002736, - "grad_norm": 1.335502974869491, + "grad_norm": 1.3112694624542267, "learning_rate": 4.540797910867481e-06, - "loss": 0.2023, + "loss": 0.2012, "step": 3869 }, { "epoch": 0.5294117647058824, - "grad_norm": 1.4476441959007436, + "grad_norm": 1.4347957097678992, "learning_rate": 4.53865820268349e-06, - "loss": 0.2277, + "loss": 0.2274, "step": 3870 }, { "epoch": 0.5295485636114912, - "grad_norm": 1.266191392585534, + "grad_norm": 1.2462983546999695, "learning_rate": 4.536518579708956e-06, - "loss": 0.1744, + "loss": 0.1752, "step": 3871 }, { "epoch": 0.5296853625170999, - "grad_norm": 1.1596138585240812, + "grad_norm": 1.1232157355930337, "learning_rate": 4.534379042339063e-06, - "loss": 0.1717, + "loss": 0.1694, "step": 3872 }, { "epoch": 0.5298221614227087, - "grad_norm": 1.3913523212957588, + "grad_norm": 1.3755061650451001, "learning_rate": 4.532239590968982e-06, - "loss": 0.2637, + "loss": 0.2642, "step": 3873 }, { "epoch": 0.5299589603283174, - "grad_norm": 1.3052625037804015, + "grad_norm": 1.305768588103665, "learning_rate": 4.530100225993871e-06, - "loss": 0.205, + "loss": 0.2049, "step": 3874 }, { "epoch": 0.5300957592339262, - "grad_norm": 1.0271781312207549, + "grad_norm": 1.01740930159327, "learning_rate": 4.5279609478088635e-06, - "loss": 0.1758, + "loss": 0.1761, "step": 3875 }, { "epoch": 0.5302325581395348, - "grad_norm": 1.1711548712258444, + "grad_norm": 1.1522999748095322, "learning_rate": 4.525821756809088e-06, - "loss": 0.1839, + "loss": 0.1833, "step": 3876 }, { "epoch": 0.5303693570451437, - "grad_norm": 1.3710937716225369, + "grad_norm": 1.3587548746411464, "learning_rate": 4.523682653389646e-06, - "loss": 0.2077, + "loss": 0.2094, "step": 3877 }, { "epoch": 0.5305061559507523, - "grad_norm": 1.0396066408728681, + "grad_norm": 1.0179147889937583, "learning_rate": 4.521543637945633e-06, - "loss": 0.151, + "loss": 0.1493, "step": 3878 }, { "epoch": 0.5306429548563611, - "grad_norm": 1.234245298138106, + "grad_norm": 1.2088325923795138, "learning_rate": 4.519404710872119e-06, - "loss": 0.1993, + "loss": 0.1985, "step": 3879 }, { "epoch": 0.53077975376197, - "grad_norm": 1.4964367129072766, + "grad_norm": 1.4743680414311597, "learning_rate": 4.517265872564167e-06, - "loss": 0.2033, + "loss": 0.2043, "step": 3880 }, { "epoch": 0.5309165526675786, - "grad_norm": 1.2809882539404618, + "grad_norm": 1.2646458433214696, "learning_rate": 4.515127123416815e-06, - "loss": 0.1811, + "loss": 0.1821, "step": 3881 }, { "epoch": 0.5310533515731874, - "grad_norm": 1.1421523462324552, + "grad_norm": 1.137412440114781, "learning_rate": 4.512988463825088e-06, - "loss": 0.1696, + "loss": 0.1683, "step": 3882 }, { "epoch": 0.5311901504787961, - "grad_norm": 1.0233075370519638, + "grad_norm": 1.016893122055673, "learning_rate": 4.510849894183999e-06, - "loss": 0.1659, + "loss": 0.1651, "step": 3883 }, { "epoch": 0.531326949384405, - "grad_norm": 1.3254194890958468, + "grad_norm": 1.291890815849425, "learning_rate": 4.508711414888534e-06, - "loss": 0.1933, + "loss": 0.1903, "step": 3884 }, { "epoch": 0.5314637482900136, - "grad_norm": 1.0974088852872883, + "grad_norm": 1.0904360786626546, "learning_rate": 4.506573026333673e-06, - "loss": 0.1923, + "loss": 0.1936, "step": 3885 }, { "epoch": 0.5316005471956224, - "grad_norm": 1.039411498566426, + "grad_norm": 1.0230212737377147, "learning_rate": 4.504434728914374e-06, - "loss": 0.1455, + "loss": 0.1459, "step": 3886 }, { "epoch": 0.5317373461012312, - "grad_norm": 1.0810095369015182, + "grad_norm": 1.0716472988230705, "learning_rate": 4.502296523025576e-06, - "loss": 0.1536, + "loss": 0.1541, "step": 3887 }, { "epoch": 0.5318741450068399, - "grad_norm": 1.2611050988639159, + "grad_norm": 1.2794461828754358, "learning_rate": 4.500158409062206e-06, - "loss": 0.1956, + "loss": 0.1952, "step": 3888 }, { "epoch": 0.5320109439124487, - "grad_norm": 1.1164754837246158, + "grad_norm": 1.1080891965994601, "learning_rate": 4.4980203874191715e-06, - "loss": 0.1633, + "loss": 0.1627, "step": 3889 }, { "epoch": 0.5321477428180574, - "grad_norm": 1.3739391965650947, + "grad_norm": 1.3992672212410762, "learning_rate": 4.495882458491366e-06, - "loss": 0.2551, + "loss": 0.2557, "step": 3890 }, { "epoch": 0.5322845417236662, - "grad_norm": 1.2647813659258347, + "grad_norm": 1.2403692429513486, "learning_rate": 4.493744622673658e-06, - "loss": 0.1783, + "loss": 0.1781, "step": 3891 }, { "epoch": 0.5324213406292749, - "grad_norm": 1.1005455142253926, + "grad_norm": 1.0851420171810953, "learning_rate": 4.49160688036091e-06, - "loss": 0.1585, + "loss": 0.1582, "step": 3892 }, { "epoch": 0.5325581395348837, - "grad_norm": 1.1404692721616114, + "grad_norm": 1.1290629248589947, "learning_rate": 4.489469231947954e-06, - "loss": 0.1928, + "loss": 0.1932, "step": 3893 }, { "epoch": 0.5326949384404924, - "grad_norm": 1.2700698222992324, + "grad_norm": 1.2567759866951917, "learning_rate": 4.487331677829619e-06, - "loss": 0.2521, + "loss": 0.2522, "step": 3894 }, { "epoch": 0.5328317373461012, - "grad_norm": 1.4792354956684375, + "grad_norm": 1.419941176667613, "learning_rate": 4.485194218400704e-06, - "loss": 0.2239, + "loss": 0.2183, "step": 3895 }, { "epoch": 0.53296853625171, - "grad_norm": 0.8770259541578412, + "grad_norm": 0.8614932233046729, "learning_rate": 4.483056854055999e-06, - "loss": 0.1559, + "loss": 0.1542, "step": 3896 }, { "epoch": 0.5331053351573187, - "grad_norm": 1.4716122347640532, + "grad_norm": 1.4258363300311625, "learning_rate": 4.480919585190276e-06, - "loss": 0.2056, + "loss": 0.2072, "step": 3897 }, { "epoch": 0.5332421340629275, - "grad_norm": 1.4559759948205944, + "grad_norm": 1.6928836864351693, "learning_rate": 4.478782412198281e-06, - "loss": 0.1929, + "loss": 0.1911, "step": 3898 }, { "epoch": 0.5333789329685362, - "grad_norm": 1.4445141828457548, + "grad_norm": 1.4408249163999693, "learning_rate": 4.476645335474753e-06, - "loss": 0.2029, + "loss": 0.2016, "step": 3899 }, { "epoch": 0.533515731874145, - "grad_norm": 1.2586695230118266, + "grad_norm": 1.238707613531341, "learning_rate": 4.474508355414404e-06, - "loss": 0.1762, + "loss": 0.1751, "step": 3900 }, { "epoch": 0.533515731874145, - "eval_loss": 0.184299498796463, - "eval_runtime": 5.9169, - "eval_samples_per_second": 5.07, - "eval_steps_per_second": 1.352, + "eval_loss": 0.18414843082427979, + "eval_runtime": 5.9317, + "eval_samples_per_second": 5.058, + "eval_steps_per_second": 1.349, "step": 3900 }, { "epoch": 0.5336525307797537, - "grad_norm": 1.4090698671224082, + "grad_norm": 1.364035770419156, "learning_rate": 4.472371472411936e-06, - "loss": 0.2213, + "loss": 0.2199, "step": 3901 }, { "epoch": 0.5337893296853625, - "grad_norm": 1.2134500531901264, + "grad_norm": 1.2123028163394218, "learning_rate": 4.470234686862027e-06, - "loss": 0.1625, + "loss": 0.1641, "step": 3902 }, { "epoch": 0.5339261285909713, - "grad_norm": 1.6828298043401404, + "grad_norm": 1.6461906368783046, "learning_rate": 4.468097999159342e-06, - "loss": 0.2618, + "loss": 0.2545, "step": 3903 }, { "epoch": 0.53406292749658, - "grad_norm": 1.3938776251091358, + "grad_norm": 1.394893453065794, "learning_rate": 4.465961409698525e-06, - "loss": 0.1688, + "loss": 0.1692, "step": 3904 }, { "epoch": 0.5341997264021888, - "grad_norm": 1.265058191787809, + "grad_norm": 1.256486554631366, "learning_rate": 4.4638249188742e-06, - "loss": 0.192, + "loss": 0.1916, "step": 3905 }, { "epoch": 0.5343365253077975, - "grad_norm": 1.2481427666468061, + "grad_norm": 1.2315709059921784, "learning_rate": 4.461688527080979e-06, - "loss": 0.1746, + "loss": 0.1749, "step": 3906 }, { "epoch": 0.5344733242134063, - "grad_norm": 1.5141695706809057, + "grad_norm": 1.461983213437246, "learning_rate": 4.459552234713448e-06, - "loss": 0.193, + "loss": 0.1893, "step": 3907 }, { "epoch": 0.534610123119015, - "grad_norm": 1.2541276442638398, + "grad_norm": 1.224215651861629, "learning_rate": 4.457416042166181e-06, - "loss": 0.1827, + "loss": 0.1813, "step": 3908 }, { "epoch": 0.5347469220246238, - "grad_norm": 1.5926697373271408, + "grad_norm": 1.5839477229829888, "learning_rate": 4.455279949833728e-06, - "loss": 0.251, + "loss": 0.254, "step": 3909 }, { "epoch": 0.5348837209302325, - "grad_norm": 1.3876803418491257, + "grad_norm": 1.4158473960144395, "learning_rate": 4.4531439581106295e-06, - "loss": 0.1832, + "loss": 0.1872, "step": 3910 }, { "epoch": 0.5350205198358413, - "grad_norm": 1.2650775758639523, + "grad_norm": 1.2172465644936254, "learning_rate": 4.451008067391394e-06, - "loss": 0.1539, + "loss": 0.153, "step": 3911 }, { "epoch": 0.5351573187414501, - "grad_norm": 1.232560826948839, + "grad_norm": 1.2201020012998798, "learning_rate": 4.4488722780705234e-06, - "loss": 0.2033, + "loss": 0.2012, "step": 3912 }, { "epoch": 0.5352941176470588, - "grad_norm": 1.4299582535982855, + "grad_norm": 1.2845092603619597, "learning_rate": 4.446736590542497e-06, "loss": 0.1902, "step": 3913 }, { "epoch": 0.5354309165526676, - "grad_norm": 1.3146051468613553, + "grad_norm": 1.3239550740929924, "learning_rate": 4.44460100520177e-06, - "loss": 0.2136, + "loss": 0.2156, "step": 3914 }, { "epoch": 0.5355677154582763, - "grad_norm": 1.2146741952437534, + "grad_norm": 1.2154279885329988, "learning_rate": 4.4424655224427885e-06, - "loss": 0.1498, + "loss": 0.1505, "step": 3915 }, { "epoch": 0.5357045143638851, - "grad_norm": 1.3523453595250567, + "grad_norm": 1.3411277500073988, "learning_rate": 4.440330142659971e-06, - "loss": 0.2268, + "loss": 0.2254, "step": 3916 }, { "epoch": 0.5358413132694938, - "grad_norm": 1.111553834035572, + "grad_norm": 1.0831872437122323, "learning_rate": 4.438194866247721e-06, - "loss": 0.1677, + "loss": 0.1672, "step": 3917 }, { "epoch": 0.5359781121751026, - "grad_norm": 1.066894461448029, + "grad_norm": 1.0877473898601937, "learning_rate": 4.436059693600422e-06, - "loss": 0.1805, + "loss": 0.1832, "step": 3918 }, { "epoch": 0.5361149110807114, - "grad_norm": 1.2288572803651803, + "grad_norm": 1.2272625280051415, "learning_rate": 4.433924625112439e-06, - "loss": 0.2062, + "loss": 0.2082, "step": 3919 }, { "epoch": 0.5362517099863201, - "grad_norm": 1.3416817605894475, + "grad_norm": 1.3411645820667895, "learning_rate": 4.431789661178121e-06, - "loss": 0.2022, + "loss": 0.2038, "step": 3920 }, { "epoch": 0.5363885088919289, - "grad_norm": 1.344211839243376, + "grad_norm": 1.31448326625053, "learning_rate": 4.429654802191788e-06, - "loss": 0.1969, + "loss": 0.1974, "step": 3921 }, { "epoch": 0.5365253077975376, - "grad_norm": 1.201243885286118, + "grad_norm": 1.2158006136462005, "learning_rate": 4.427520048547752e-06, - "loss": 0.1935, + "loss": 0.1961, "step": 3922 }, { "epoch": 0.5366621067031464, - "grad_norm": 1.3839682583217625, + "grad_norm": 1.3792055000116845, "learning_rate": 4.4253854006402955e-06, - "loss": 0.2123, + "loss": 0.2114, "step": 3923 }, { "epoch": 0.5367989056087551, - "grad_norm": 1.318083923179962, + "grad_norm": 1.2890740600094672, "learning_rate": 4.423250858863689e-06, - "loss": 0.1818, + "loss": 0.1814, "step": 3924 }, { "epoch": 0.5369357045143639, - "grad_norm": 1.0400370864854445, + "grad_norm": 1.0486188832562857, "learning_rate": 4.42111642361218e-06, - "loss": 0.164, + "loss": 0.1647, "step": 3925 }, { "epoch": 0.5370725034199726, - "grad_norm": 1.3481709943301068, + "grad_norm": 1.347091763658604, "learning_rate": 4.418982095279999e-06, - "loss": 0.2197, + "loss": 0.2187, "step": 3926 }, { "epoch": 0.5372093023255814, - "grad_norm": 1.3186700946186798, + "grad_norm": 1.312965743112258, "learning_rate": 4.416847874261351e-06, - "loss": 0.2042, + "loss": 0.205, "step": 3927 }, { "epoch": 0.5373461012311902, - "grad_norm": 1.3164306096996525, + "grad_norm": 1.3040001891548663, "learning_rate": 4.414713760950426e-06, - "loss": 0.1717, + "loss": 0.1719, "step": 3928 }, { "epoch": 0.5374829001367989, - "grad_norm": 1.185762377828394, + "grad_norm": 1.1638883538364353, "learning_rate": 4.412579755741397e-06, - "loss": 0.1803, + "loss": 0.1797, "step": 3929 }, { "epoch": 0.5376196990424077, - "grad_norm": 0.9985827208555803, + "grad_norm": 0.9793397565164144, "learning_rate": 4.4104458590284075e-06, - "loss": 0.1706, + "loss": 0.1701, "step": 3930 }, { "epoch": 0.5377564979480164, - "grad_norm": 1.3326122510365934, + "grad_norm": 1.3250642819554672, "learning_rate": 4.408312071205589e-06, - "loss": 0.2017, + "loss": 0.2038, "step": 3931 }, { "epoch": 0.5378932968536252, - "grad_norm": 1.258933101380238, + "grad_norm": 1.2746078906369285, "learning_rate": 4.40617839266705e-06, - "loss": 0.2045, + "loss": 0.2072, "step": 3932 }, { "epoch": 0.5380300957592339, - "grad_norm": 1.5349970739331449, + "grad_norm": 1.5183477570544774, "learning_rate": 4.40404482380688e-06, - "loss": 0.2283, + "loss": 0.2298, "step": 3933 }, { "epoch": 0.5381668946648427, - "grad_norm": 1.3989756261273176, + "grad_norm": 1.367383200989468, "learning_rate": 4.401911365019146e-06, - "loss": 0.232, + "loss": 0.2334, "step": 3934 }, { "epoch": 0.5383036935704515, - "grad_norm": 1.3892027095474266, + "grad_norm": 1.3711085609603433, "learning_rate": 4.399778016697897e-06, - "loss": 0.2017, + "loss": 0.2034, "step": 3935 }, { "epoch": 0.5384404924760602, - "grad_norm": 1.3721783194243151, + "grad_norm": 1.3523362265203678, "learning_rate": 4.397644779237162e-06, - "loss": 0.2055, + "loss": 0.207, "step": 3936 }, { "epoch": 0.538577291381669, - "grad_norm": 1.191497593716763, + "grad_norm": 1.1771376446491726, "learning_rate": 4.395511653030946e-06, - "loss": 0.191, + "loss": 0.1886, "step": 3937 }, { "epoch": 0.5387140902872777, - "grad_norm": 1.2954151807307164, + "grad_norm": 1.2979952282349556, "learning_rate": 4.393378638473236e-06, - "loss": 0.2213, + "loss": 0.221, "step": 3938 }, { "epoch": 0.5388508891928865, - "grad_norm": 1.0541848547503485, + "grad_norm": 1.0548921823944186, "learning_rate": 4.391245735957998e-06, - "loss": 0.1799, + "loss": 0.1787, "step": 3939 }, { "epoch": 0.5389876880984952, - "grad_norm": 1.107501170559365, + "grad_norm": 1.0805553373201926, "learning_rate": 4.38911294587918e-06, - "loss": 0.1527, + "loss": 0.1518, "step": 3940 }, { "epoch": 0.539124487004104, - "grad_norm": 1.3622160160321877, + "grad_norm": 1.3409895054748686, "learning_rate": 4.386980268630703e-06, - "loss": 0.2112, + "loss": 0.2115, "step": 3941 }, { "epoch": 0.5392612859097127, - "grad_norm": 1.1701868680766068, + "grad_norm": 1.1702340676416798, "learning_rate": 4.384847704606473e-06, - "loss": 0.1661, + "loss": 0.165, "step": 3942 }, { "epoch": 0.5393980848153215, - "grad_norm": 1.0924715420602094, + "grad_norm": 1.084271141701759, "learning_rate": 4.382715254200372e-06, - "loss": 0.1555, + "loss": 0.156, "step": 3943 }, { "epoch": 0.5395348837209303, - "grad_norm": 1.1209231564055924, + "grad_norm": 1.120525850940953, "learning_rate": 4.38058291780626e-06, - "loss": 0.148, + "loss": 0.1485, "step": 3944 }, { "epoch": 0.539671682626539, - "grad_norm": 1.2160090698592203, + "grad_norm": 1.2247845022379324, "learning_rate": 4.3784506958179815e-06, - "loss": 0.1796, + "loss": 0.1808, "step": 3945 }, { "epoch": 0.5398084815321478, - "grad_norm": 1.4090885217982303, + "grad_norm": 1.4050974289729399, "learning_rate": 4.376318588629354e-06, - "loss": 0.2132, + "loss": 0.2148, "step": 3946 }, { "epoch": 0.5399452804377565, - "grad_norm": 1.2150406063880697, + "grad_norm": 1.1979240471491306, "learning_rate": 4.374186596634178e-06, - "loss": 0.1697, + "loss": 0.1706, "step": 3947 }, { "epoch": 0.5400820793433653, - "grad_norm": 1.2642215401685881, + "grad_norm": 1.2706448783470905, "learning_rate": 4.372054720226227e-06, - "loss": 0.2034, + "loss": 0.2038, "step": 3948 }, { "epoch": 0.540218878248974, - "grad_norm": 1.3848023109799146, + "grad_norm": 1.3757820693355678, "learning_rate": 4.3699229597992605e-06, - "loss": 0.2027, + "loss": 0.2051, "step": 3949 }, { "epoch": 0.5403556771545828, - "grad_norm": 1.3008645978144118, + "grad_norm": 1.2772588599892778, "learning_rate": 4.36779131574701e-06, - "loss": 0.2079, + "loss": 0.2068, "step": 3950 }, { "epoch": 0.5404924760601916, - "grad_norm": 1.4040863558292462, + "grad_norm": 1.381077570756958, "learning_rate": 4.36565978846319e-06, - "loss": 0.2336, + "loss": 0.2322, "step": 3951 }, { "epoch": 0.5406292749658003, - "grad_norm": 1.394725049429095, + "grad_norm": 1.381024485760428, "learning_rate": 4.363528378341492e-06, - "loss": 0.2087, + "loss": 0.2104, "step": 3952 }, { "epoch": 0.5407660738714091, - "grad_norm": 1.7157604697701554, + "grad_norm": 1.6877540643355555, "learning_rate": 4.3613970857755855e-06, - "loss": 0.2496, + "loss": 0.2487, "step": 3953 }, { "epoch": 0.5409028727770178, - "grad_norm": 1.3385859143005343, + "grad_norm": 1.3091042491015692, "learning_rate": 4.359265911159118e-06, - "loss": 0.1758, + "loss": 0.1752, "step": 3954 }, { "epoch": 0.5410396716826266, - "grad_norm": 1.2946578197901755, + "grad_norm": 1.3025583489500556, "learning_rate": 4.357134854885716e-06, - "loss": 0.1673, + "loss": 0.1706, "step": 3955 }, { "epoch": 0.5411764705882353, - "grad_norm": 1.4303068714406784, + "grad_norm": 1.3594437857679451, "learning_rate": 4.355003917348985e-06, - "loss": 0.2271, + "loss": 0.2253, "step": 3956 }, { "epoch": 0.5413132694938441, - "grad_norm": 1.3166759002461472, + "grad_norm": 1.2868976575060802, "learning_rate": 4.352873098942505e-06, - "loss": 0.1618, + "loss": 0.1621, "step": 3957 }, { "epoch": 0.5414500683994528, - "grad_norm": 1.2904744860024167, + "grad_norm": 1.3095493258146744, "learning_rate": 4.350742400059839e-06, - "loss": 0.1961, + "loss": 0.1997, "step": 3958 }, { "epoch": 0.5415868673050616, - "grad_norm": 1.2401030948033616, + "grad_norm": 1.2024690852056548, "learning_rate": 4.348611821094523e-06, - "loss": 0.1772, + "loss": 0.1765, "step": 3959 }, { "epoch": 0.5417236662106704, - "grad_norm": 1.5572177235164015, + "grad_norm": 1.5215961410679157, "learning_rate": 4.346481362440074e-06, - "loss": 0.2363, + "loss": 0.237, "step": 3960 }, { "epoch": 0.541860465116279, - "grad_norm": 1.171077003468375, + "grad_norm": 1.1785365095395524, "learning_rate": 4.344351024489987e-06, - "loss": 0.1757, + "loss": 0.1794, "step": 3961 }, { "epoch": 0.5419972640218879, - "grad_norm": 1.389968195327231, + "grad_norm": 1.3683283595086433, "learning_rate": 4.342220807637732e-06, - "loss": 0.1762, + "loss": 0.1773, "step": 3962 }, { "epoch": 0.5421340629274966, - "grad_norm": 1.0615980582753943, + "grad_norm": 1.0380170108668325, "learning_rate": 4.3400907122767624e-06, - "loss": 0.166, + "loss": 0.1639, "step": 3963 }, { "epoch": 0.5422708618331054, - "grad_norm": 1.1551433737921506, + "grad_norm": 1.1367446739153366, "learning_rate": 4.337960738800498e-06, - "loss": 0.2017, + "loss": 0.2015, "step": 3964 }, { "epoch": 0.542407660738714, - "grad_norm": 1.3374508345496723, + "grad_norm": 1.29843178004877, "learning_rate": 4.335830887602351e-06, - "loss": 0.1825, + "loss": 0.1821, "step": 3965 }, { "epoch": 0.5425444596443229, - "grad_norm": 1.47796658464868, + "grad_norm": 1.4440997227960901, "learning_rate": 4.333701159075697e-06, - "loss": 0.2361, + "loss": 0.235, "step": 3966 }, { "epoch": 0.5426812585499317, - "grad_norm": 1.3084924091303007, + "grad_norm": 1.2769769100758288, "learning_rate": 4.331571553613897e-06, - "loss": 0.1966, + "loss": 0.197, "step": 3967 }, { "epoch": 0.5428180574555403, - "grad_norm": 1.3034373775082047, + "grad_norm": 1.2873562987611802, "learning_rate": 4.32944207161029e-06, - "loss": 0.1775, + "loss": 0.1782, "step": 3968 }, { "epoch": 0.5429548563611492, - "grad_norm": 1.1060775423906075, + "grad_norm": 1.096117211303721, "learning_rate": 4.327312713458185e-06, - "loss": 0.1614, + "loss": 0.1617, "step": 3969 }, { "epoch": 0.5430916552667578, - "grad_norm": 1.008493893651826, + "grad_norm": 1.0015428161782722, "learning_rate": 4.3251834795508785e-06, - "loss": 0.1492, + "loss": 0.149, "step": 3970 }, { "epoch": 0.5432284541723666, - "grad_norm": 1.5492031633322256, + "grad_norm": 1.4140620971329116, "learning_rate": 4.323054370281632e-06, - "loss": 0.2249, + "loss": 0.2247, "step": 3971 }, { "epoch": 0.5433652530779753, - "grad_norm": 1.4098354135091626, + "grad_norm": 1.3779126432516964, "learning_rate": 4.320925386043696e-06, - "loss": 0.2352, + "loss": 0.2342, "step": 3972 }, { "epoch": 0.5435020519835841, - "grad_norm": 1.1411772483048035, + "grad_norm": 1.117264203029011, "learning_rate": 4.318796527230287e-06, "loss": 0.1709, "step": 3973 }, { "epoch": 0.5436388508891928, - "grad_norm": 1.1345490757176984, + "grad_norm": 1.1123165418085166, "learning_rate": 4.316667794234607e-06, - "loss": 0.1554, + "loss": 0.156, "step": 3974 }, { "epoch": 0.5437756497948016, - "grad_norm": 1.127501050710348, + "grad_norm": 1.1121266500775893, "learning_rate": 4.314539187449829e-06, - "loss": 0.1744, + "loss": 0.1746, "step": 3975 }, { "epoch": 0.5439124487004104, - "grad_norm": 1.3062134236528886, + "grad_norm": 1.3038957313225394, "learning_rate": 4.312410707269105e-06, - "loss": 0.2087, + "loss": 0.2098, "step": 3976 }, { "epoch": 0.5440492476060191, - "grad_norm": 1.0769111727330387, + "grad_norm": 1.073837518329216, "learning_rate": 4.310282354085568e-06, - "loss": 0.1697, + "loss": 0.1711, "step": 3977 }, { "epoch": 0.5441860465116279, - "grad_norm": 1.0379635567719963, + "grad_norm": 1.03918835212276, "learning_rate": 4.308154128292318e-06, - "loss": 0.1743, + "loss": 0.1749, "step": 3978 }, { "epoch": 0.5443228454172366, - "grad_norm": 1.6116348795543622, + "grad_norm": 1.5934536659027858, "learning_rate": 4.306026030282441e-06, - "loss": 0.1898, + "loss": 0.1908, "step": 3979 }, { "epoch": 0.5444596443228454, - "grad_norm": 1.0416901694079614, + "grad_norm": 1.039284088221837, "learning_rate": 4.303898060448989e-06, - "loss": 0.1764, + "loss": 0.1761, "step": 3980 }, { "epoch": 0.5445964432284541, - "grad_norm": 1.5689139507784757, + "grad_norm": 1.567233917575007, "learning_rate": 4.301770219185e-06, - "loss": 0.2304, + "loss": 0.234, "step": 3981 }, { "epoch": 0.5447332421340629, - "grad_norm": 1.3223041500871666, + "grad_norm": 1.313831520059274, "learning_rate": 4.299642506883484e-06, - "loss": 0.1925, + "loss": 0.1944, "step": 3982 }, { "epoch": 0.5448700410396717, - "grad_norm": 1.2543678171845078, + "grad_norm": 1.24577756892802, "learning_rate": 4.297514923937429e-06, - "loss": 0.2053, + "loss": 0.2043, "step": 3983 }, { "epoch": 0.5450068399452804, - "grad_norm": 0.7509500732835692, + "grad_norm": 0.7510083654305777, "learning_rate": 4.295387470739797e-06, - "loss": 0.129, + "loss": 0.1295, "step": 3984 }, { "epoch": 0.5451436388508892, - "grad_norm": 1.4488127906439803, + "grad_norm": 1.4411229564489343, "learning_rate": 4.293260147683525e-06, - "loss": 0.2282, + "loss": 0.2268, "step": 3985 }, { "epoch": 0.5452804377564979, - "grad_norm": 1.1806525560642587, + "grad_norm": 1.1771963405170727, "learning_rate": 4.291132955161531e-06, "loss": 0.21, "step": 3986 }, { "epoch": 0.5454172366621067, - "grad_norm": 1.1427829482479328, + "grad_norm": 1.1434336923148627, "learning_rate": 4.289005893566703e-06, - "loss": 0.1478, + "loss": 0.1475, "step": 3987 }, { "epoch": 0.5455540355677154, - "grad_norm": 1.1823326129310758, + "grad_norm": 1.1616738914341251, "learning_rate": 4.28687896329191e-06, - "loss": 0.1782, + "loss": 0.1777, "step": 3988 }, { "epoch": 0.5456908344733242, - "grad_norm": 1.2698267900221172, + "grad_norm": 1.2484408272370227, "learning_rate": 4.284752164729993e-06, - "loss": 0.1752, + "loss": 0.1739, "step": 3989 }, { "epoch": 0.5458276333789329, - "grad_norm": 1.028085052581018, + "grad_norm": 0.9988982421413948, "learning_rate": 4.282625498273769e-06, - "loss": 0.1574, + "loss": 0.1563, "step": 3990 }, { "epoch": 0.5459644322845417, - "grad_norm": 1.409627122663398, + "grad_norm": 1.3807866302971332, "learning_rate": 4.280498964316032e-06, - "loss": 0.2213, + "loss": 0.217, "step": 3991 }, { "epoch": 0.5461012311901505, - "grad_norm": 1.2121020959253141, + "grad_norm": 1.194419478929369, "learning_rate": 4.278372563249552e-06, - "loss": 0.1986, + "loss": 0.1972, "step": 3992 }, { "epoch": 0.5462380300957592, - "grad_norm": 1.327448182420552, + "grad_norm": 1.3102157590055015, "learning_rate": 4.276246295467075e-06, - "loss": 0.2249, + "loss": 0.2223, "step": 3993 }, { "epoch": 0.546374829001368, - "grad_norm": 1.2606617364085215, + "grad_norm": 1.2467971780040894, "learning_rate": 4.274120161361317e-06, - "loss": 0.1818, + "loss": 0.1809, "step": 3994 }, { "epoch": 0.5465116279069767, - "grad_norm": 1.1197920524782767, + "grad_norm": 1.1083949907666697, "learning_rate": 4.271994161324977e-06, - "loss": 0.1845, + "loss": 0.1852, "step": 3995 }, { "epoch": 0.5466484268125855, - "grad_norm": 1.2431951390030938, + "grad_norm": 1.2336330366205048, "learning_rate": 4.269868295750722e-06, - "loss": 0.1925, + "loss": 0.1916, "step": 3996 }, { "epoch": 0.5467852257181942, - "grad_norm": 1.3307973829699562, + "grad_norm": 1.3174963607894379, "learning_rate": 4.267742565031201e-06, - "loss": 0.1813, + "loss": 0.1808, "step": 3997 }, { "epoch": 0.546922024623803, - "grad_norm": 1.5204645695591363, + "grad_norm": 1.5075719076680745, "learning_rate": 4.265616969559032e-06, - "loss": 0.2339, + "loss": 0.2336, "step": 3998 }, { "epoch": 0.5470588235294118, - "grad_norm": 1.1559196676494887, + "grad_norm": 1.1331587488553314, "learning_rate": 4.263491509726812e-06, - "loss": 0.1723, + "loss": 0.1716, "step": 3999 }, { "epoch": 0.5471956224350205, - "grad_norm": 1.2694963013200002, + "grad_norm": 1.2528355690021002, "learning_rate": 4.261366185927114e-06, - "loss": 0.1871, + "loss": 0.1873, "step": 4000 }, { "epoch": 0.5471956224350205, - "eval_loss": 0.18357780575752258, - "eval_runtime": 5.9201, - "eval_samples_per_second": 5.067, + "eval_loss": 0.18331144750118256, + "eval_runtime": 5.9231, + "eval_samples_per_second": 5.065, "eval_steps_per_second": 1.351, "step": 4000 }, { "epoch": 0.5473324213406293, - "grad_norm": 1.1569903570433107, + "grad_norm": 1.139261219022772, "learning_rate": 4.259240998552479e-06, - "loss": 0.1779, + "loss": 0.1784, "step": 4001 }, { "epoch": 0.547469220246238, - "grad_norm": 1.3826880969554052, + "grad_norm": 1.3756328635956157, "learning_rate": 4.257115947995431e-06, - "loss": 0.2154, + "loss": 0.2162, "step": 4002 }, { "epoch": 0.5476060191518468, - "grad_norm": 1.1044989364376119, + "grad_norm": 1.101976021230098, "learning_rate": 4.254991034648462e-06, - "loss": 0.1815, + "loss": 0.1816, "step": 4003 }, { "epoch": 0.5477428180574555, - "grad_norm": 1.431675057703973, + "grad_norm": 1.3785767679799124, "learning_rate": 4.252866258904045e-06, - "loss": 0.2065, + "loss": 0.2023, "step": 4004 }, { "epoch": 0.5478796169630643, - "grad_norm": 1.0983026241214064, + "grad_norm": 1.0683834423739529, "learning_rate": 4.2507416211546215e-06, - "loss": 0.188, + "loss": 0.1881, "step": 4005 }, { "epoch": 0.548016415868673, - "grad_norm": 1.3401776631460676, + "grad_norm": 1.3705128715327979, "learning_rate": 4.2486171217926145e-06, - "loss": 0.1867, + "loss": 0.1944, "step": 4006 }, { "epoch": 0.5481532147742818, - "grad_norm": 1.1255489675813963, + "grad_norm": 1.1040434872263964, "learning_rate": 4.246492761210412e-06, - "loss": 0.1669, + "loss": 0.1665, "step": 4007 }, { "epoch": 0.5482900136798906, - "grad_norm": 1.258764759107227, + "grad_norm": 1.2263295329872876, "learning_rate": 4.244368539800383e-06, - "loss": 0.1834, + "loss": 0.1837, "step": 4008 }, { "epoch": 0.5484268125854993, - "grad_norm": 1.3027194002626223, + "grad_norm": 1.2767830636292068, "learning_rate": 4.242244457954875e-06, - "loss": 0.213, + "loss": 0.2116, "step": 4009 }, { "epoch": 0.5485636114911081, - "grad_norm": 1.4250004035250732, + "grad_norm": 1.4000032089098795, "learning_rate": 4.240120516066197e-06, - "loss": 0.2093, + "loss": 0.2086, "step": 4010 }, { "epoch": 0.5487004103967168, - "grad_norm": 1.3085810266110374, + "grad_norm": 1.3059311612086586, "learning_rate": 4.237996714526642e-06, - "loss": 0.1679, + "loss": 0.1685, "step": 4011 }, { "epoch": 0.5488372093023256, - "grad_norm": 1.2282663451829503, + "grad_norm": 1.2190304279962143, "learning_rate": 4.235873053728475e-06, - "loss": 0.1713, + "loss": 0.1723, "step": 4012 }, { "epoch": 0.5489740082079343, - "grad_norm": 1.355390563088122, + "grad_norm": 1.3165893250627192, "learning_rate": 4.233749534063936e-06, - "loss": 0.2148, + "loss": 0.2116, "step": 4013 }, { "epoch": 0.5491108071135431, - "grad_norm": 1.3708197150487216, + "grad_norm": 1.3409749898960952, "learning_rate": 4.231626155925233e-06, - "loss": 0.214, + "loss": 0.2137, "step": 4014 }, { "epoch": 0.5492476060191519, - "grad_norm": 1.1713200880296326, + "grad_norm": 1.1672717812602003, "learning_rate": 4.229502919704554e-06, - "loss": 0.1926, + "loss": 0.1932, "step": 4015 }, { "epoch": 0.5493844049247606, - "grad_norm": 1.4959165379718542, + "grad_norm": 1.4818969185395887, "learning_rate": 4.227379825794063e-06, - "loss": 0.2371, + "loss": 0.2389, "step": 4016 }, { "epoch": 0.5495212038303694, - "grad_norm": 1.2263384292964348, + "grad_norm": 1.2269604975333595, "learning_rate": 4.225256874585888e-06, - "loss": 0.1789, + "loss": 0.1778, "step": 4017 }, { "epoch": 0.5496580027359781, - "grad_norm": 1.4038254430738037, + "grad_norm": 1.3944723911271428, "learning_rate": 4.2231340664721395e-06, - "loss": 0.2032, + "loss": 0.2035, "step": 4018 }, { "epoch": 0.5497948016415869, - "grad_norm": 1.2333323146622097, + "grad_norm": 1.2574690271037, "learning_rate": 4.221011401844898e-06, - "loss": 0.1679, + "loss": 0.1708, "step": 4019 }, { "epoch": 0.5499316005471956, - "grad_norm": 1.4941062783930044, + "grad_norm": 1.4693260751255033, "learning_rate": 4.218888881096217e-06, - "loss": 0.2427, + "loss": 0.2441, "step": 4020 }, { "epoch": 0.5500683994528044, - "grad_norm": 1.3629277081285807, + "grad_norm": 1.3314738600254445, "learning_rate": 4.2167665046181245e-06, - "loss": 0.202, + "loss": 0.1993, "step": 4021 }, { "epoch": 0.5502051983584131, - "grad_norm": 1.2734273418016053, + "grad_norm": 1.252385296788403, "learning_rate": 4.214644272802624e-06, - "loss": 0.2098, + "loss": 0.2084, "step": 4022 }, { "epoch": 0.5503419972640219, - "grad_norm": 1.319822754938478, + "grad_norm": 1.3097350019956413, "learning_rate": 4.212522186041686e-06, - "loss": 0.2273, + "loss": 0.2271, "step": 4023 }, { "epoch": 0.5504787961696307, - "grad_norm": 1.3612038539080946, + "grad_norm": 1.3640986598648357, "learning_rate": 4.21040024472726e-06, - "loss": 0.2025, + "loss": 0.2035, "step": 4024 }, { "epoch": 0.5506155950752394, - "grad_norm": 1.1690994412827367, + "grad_norm": 1.165513729713374, "learning_rate": 4.20827844925127e-06, - "loss": 0.1999, + "loss": 0.2011, "step": 4025 }, { "epoch": 0.5507523939808482, - "grad_norm": 1.152665544267522, + "grad_norm": 1.1281539011314692, "learning_rate": 4.2061568000056045e-06, - "loss": 0.1554, + "loss": 0.1535, "step": 4026 }, { "epoch": 0.5508891928864569, - "grad_norm": 1.3794778669675232, + "grad_norm": 1.3645102692400806, "learning_rate": 4.204035297382134e-06, - "loss": 0.1975, + "loss": 0.1947, "step": 4027 }, { "epoch": 0.5510259917920657, - "grad_norm": 1.2741252337025837, + "grad_norm": 1.2744424812470987, "learning_rate": 4.2019139417726965e-06, - "loss": 0.171, + "loss": 0.1716, "step": 4028 }, { "epoch": 0.5511627906976744, - "grad_norm": 1.2582309021258349, + "grad_norm": 1.2541363532351928, "learning_rate": 4.199792733569108e-06, - "loss": 0.1691, + "loss": 0.1703, "step": 4029 }, { "epoch": 0.5512995896032832, - "grad_norm": 1.234369514179039, + "grad_norm": 1.217521415136089, "learning_rate": 4.1976716731631476e-06, - "loss": 0.1798, + "loss": 0.1801, "step": 4030 }, { "epoch": 0.551436388508892, - "grad_norm": 1.259516284932011, + "grad_norm": 1.2435964519582179, "learning_rate": 4.195550760946578e-06, - "loss": 0.2164, + "loss": 0.2167, "step": 4031 }, { "epoch": 0.5515731874145007, - "grad_norm": 1.2355350571070922, + "grad_norm": 1.2039503478747342, "learning_rate": 4.193429997311133e-06, - "loss": 0.1927, + "loss": 0.1917, "step": 4032 }, { "epoch": 0.5517099863201095, - "grad_norm": 1.5499319087343184, + "grad_norm": 1.4580403201479366, "learning_rate": 4.191309382648508e-06, - "loss": 0.2103, + "loss": 0.2093, "step": 4033 }, { "epoch": 0.5518467852257182, - "grad_norm": 1.2376539290053234, + "grad_norm": 1.1953547329765657, "learning_rate": 4.1891889173503845e-06, - "loss": 0.162, + "loss": 0.16, "step": 4034 }, { "epoch": 0.551983584131327, - "grad_norm": 1.2241905716808241, + "grad_norm": 1.2000993146180103, "learning_rate": 4.187068601808408e-06, - "loss": 0.1833, + "loss": 0.1831, "step": 4035 }, { "epoch": 0.5521203830369357, - "grad_norm": 1.2146703838975594, + "grad_norm": 1.1980909029725357, "learning_rate": 4.184948436414203e-06, - "loss": 0.193, + "loss": 0.1941, "step": 4036 }, { "epoch": 0.5522571819425445, - "grad_norm": 1.3002192242086916, + "grad_norm": 1.3013666761640743, "learning_rate": 4.1828284215593565e-06, - "loss": 0.2011, + "loss": 0.2022, "step": 4037 }, { "epoch": 0.5523939808481532, - "grad_norm": 1.2754746069718752, + "grad_norm": 1.248617246905716, "learning_rate": 4.180708557635439e-06, - "loss": 0.2073, + "loss": 0.2075, "step": 4038 }, { "epoch": 0.552530779753762, - "grad_norm": 1.2919088775723722, + "grad_norm": 1.2824627725844453, "learning_rate": 4.178588845033983e-06, - "loss": 0.1885, + "loss": 0.1895, "step": 4039 }, { "epoch": 0.5526675786593708, - "grad_norm": 1.1256687718742149, + "grad_norm": 1.1099849776741202, "learning_rate": 4.1764692841465e-06, - "loss": 0.178, + "loss": 0.177, "step": 4040 }, { "epoch": 0.5528043775649795, - "grad_norm": 1.181881625996629, + "grad_norm": 1.1644642826743656, "learning_rate": 4.174349875364472e-06, - "loss": 0.1804, + "loss": 0.1816, "step": 4041 }, { "epoch": 0.5529411764705883, - "grad_norm": 1.3749244486646928, + "grad_norm": 1.367561695509914, "learning_rate": 4.17223061907935e-06, - "loss": 0.2277, + "loss": 0.2304, "step": 4042 }, { "epoch": 0.553077975376197, - "grad_norm": 1.1583854096919923, + "grad_norm": 1.1429516039987657, "learning_rate": 4.1701115156825625e-06, - "loss": 0.1634, + "loss": 0.1633, "step": 4043 }, { "epoch": 0.5532147742818058, - "grad_norm": 1.1601308142747715, + "grad_norm": 1.1440061489375513, "learning_rate": 4.167992565565501e-06, - "loss": 0.1819, + "loss": 0.1812, "step": 4044 }, { "epoch": 0.5533515731874145, - "grad_norm": 1.1232318773940468, + "grad_norm": 1.1155435216873082, "learning_rate": 4.165873769119539e-06, - "loss": 0.1796, + "loss": 0.1795, "step": 4045 }, { "epoch": 0.5534883720930233, - "grad_norm": 1.3898628409406841, + "grad_norm": 1.3857379902859552, "learning_rate": 4.163755126736011e-06, - "loss": 0.2181, + "loss": 0.2187, "step": 4046 }, { "epoch": 0.5536251709986321, - "grad_norm": 1.5270097207159297, + "grad_norm": 1.5095150011708705, "learning_rate": 4.161636638806233e-06, - "loss": 0.2135, + "loss": 0.2138, "step": 4047 }, { "epoch": 0.5537619699042408, - "grad_norm": 1.2811336943334302, + "grad_norm": 1.2635445245729298, "learning_rate": 4.159518305721487e-06, - "loss": 0.2311, + "loss": 0.2306, "step": 4048 }, { "epoch": 0.5538987688098496, - "grad_norm": 1.2479758099588634, + "grad_norm": 1.233057020609232, "learning_rate": 4.157400127873026e-06, - "loss": 0.2145, + "loss": 0.2124, "step": 4049 }, { "epoch": 0.5540355677154583, - "grad_norm": 1.0712155386054039, + "grad_norm": 1.0643471821158126, "learning_rate": 4.1552821056520795e-06, - "loss": 0.1685, + "loss": 0.1689, "step": 4050 }, { "epoch": 0.5541723666210671, - "grad_norm": 1.2218605358805588, + "grad_norm": 1.225014481334767, "learning_rate": 4.153164239449839e-06, - "loss": 0.1871, + "loss": 0.1883, "step": 4051 }, { "epoch": 0.5543091655266758, - "grad_norm": 1.2033262218694152, + "grad_norm": 1.1800737347752426, "learning_rate": 4.151046529657477e-06, - "loss": 0.1652, + "loss": 0.1653, "step": 4052 }, { "epoch": 0.5544459644322846, - "grad_norm": 1.2814813593299157, + "grad_norm": 1.279323300406916, "learning_rate": 4.14892897666613e-06, - "loss": 0.1807, + "loss": 0.1827, "step": 4053 }, { "epoch": 0.5545827633378932, - "grad_norm": 1.1804377986600556, + "grad_norm": 1.157130255408328, "learning_rate": 4.146811580866909e-06, - "loss": 0.1553, + "loss": 0.1535, "step": 4054 }, { "epoch": 0.554719562243502, - "grad_norm": 1.0116110719710258, + "grad_norm": 0.9983116368363987, "learning_rate": 4.144694342650896e-06, - "loss": 0.139, + "loss": 0.1388, "step": 4055 }, { "epoch": 0.5548563611491109, - "grad_norm": 1.2294968957979648, + "grad_norm": 1.2372220224847867, "learning_rate": 4.142577262409144e-06, - "loss": 0.1507, + "loss": 0.1526, "step": 4056 }, { "epoch": 0.5549931600547195, - "grad_norm": 1.1388963499584897, + "grad_norm": 1.147115594935378, "learning_rate": 4.140460340532675e-06, - "loss": 0.1689, + "loss": 0.1708, "step": 4057 }, { "epoch": 0.5551299589603284, - "grad_norm": 1.3794040466852462, + "grad_norm": 1.360363701755019, "learning_rate": 4.13834357741248e-06, - "loss": 0.199, + "loss": 0.2003, "step": 4058 }, { "epoch": 0.555266757865937, - "grad_norm": 0.9131769802010723, + "grad_norm": 0.9067249437358285, "learning_rate": 4.13622697343953e-06, - "loss": 0.1445, + "loss": 0.1456, "step": 4059 }, { "epoch": 0.5554035567715458, - "grad_norm": 1.325340128279155, + "grad_norm": 1.3252592383768877, "learning_rate": 4.134110529004753e-06, - "loss": 0.1977, + "loss": 0.1969, "step": 4060 }, { "epoch": 0.5555403556771545, - "grad_norm": 1.5950493783898092, + "grad_norm": 1.5723390882601105, "learning_rate": 4.13199424449906e-06, - "loss": 0.1985, + "loss": 0.1963, "step": 4061 }, { "epoch": 0.5556771545827633, - "grad_norm": 1.4568774260727861, + "grad_norm": 1.4459085200524604, "learning_rate": 4.129878120313323e-06, - "loss": 0.2147, + "loss": 0.2155, "step": 4062 }, { "epoch": 0.5558139534883721, - "grad_norm": 1.502323477771088, + "grad_norm": 1.4844959938889133, "learning_rate": 4.12776215683839e-06, - "loss": 0.2279, + "loss": 0.2275, "step": 4063 }, { "epoch": 0.5559507523939808, - "grad_norm": 1.2289559340494476, + "grad_norm": 1.216783940431708, "learning_rate": 4.125646354465078e-06, - "loss": 0.1822, + "loss": 0.1807, "step": 4064 }, { "epoch": 0.5560875512995896, - "grad_norm": 1.3143987417396932, + "grad_norm": 1.2773817136918884, "learning_rate": 4.123530713584174e-06, - "loss": 0.2106, + "loss": 0.2101, "step": 4065 }, { "epoch": 0.5562243502051983, - "grad_norm": 1.2129530827723496, + "grad_norm": 1.2011594090905724, "learning_rate": 4.1214152345864365e-06, - "loss": 0.1662, + "loss": 0.1663, "step": 4066 }, { "epoch": 0.5563611491108071, - "grad_norm": 1.3966069134905799, + "grad_norm": 1.367851271059913, "learning_rate": 4.11929991786259e-06, - "loss": 0.2354, + "loss": 0.2332, "step": 4067 }, { "epoch": 0.5564979480164158, - "grad_norm": 1.1842175983749483, + "grad_norm": 1.1758779173477294, "learning_rate": 4.117184763803334e-06, - "loss": 0.1899, + "loss": 0.1897, "step": 4068 }, { "epoch": 0.5566347469220246, - "grad_norm": 1.3381493932370654, + "grad_norm": 1.3684113655578931, "learning_rate": 4.115069772799333e-06, - "loss": 0.211, + "loss": 0.2152, "step": 4069 }, { "epoch": 0.5567715458276333, - "grad_norm": 1.137002189973844, + "grad_norm": 1.1349019758232988, "learning_rate": 4.112954945241228e-06, - "loss": 0.1976, + "loss": 0.1983, "step": 4070 }, { "epoch": 0.5569083447332421, - "grad_norm": 1.2090447237608954, + "grad_norm": 1.207450546717187, "learning_rate": 4.110840281519621e-06, - "loss": 0.2031, + "loss": 0.2041, "step": 4071 }, { "epoch": 0.5570451436388509, - "grad_norm": 1.321762533731262, + "grad_norm": 1.2967162093588527, "learning_rate": 4.108725782025092e-06, - "loss": 0.1813, + "loss": 0.1808, "step": 4072 }, { "epoch": 0.5571819425444596, - "grad_norm": 1.231997587472303, + "grad_norm": 1.2072696937186533, "learning_rate": 4.10661144714819e-06, - "loss": 0.1845, + "loss": 0.1848, "step": 4073 }, { "epoch": 0.5573187414500684, - "grad_norm": 1.2772742833113067, + "grad_norm": 1.237367377110116, "learning_rate": 4.104497277279425e-06, - "loss": 0.1904, + "loss": 0.1887, "step": 4074 }, { "epoch": 0.5574555403556771, - "grad_norm": 1.459802221574218, + "grad_norm": 1.4295822039953303, "learning_rate": 4.102383272809287e-06, - "loss": 0.2186, + "loss": 0.2189, "step": 4075 }, { "epoch": 0.5575923392612859, - "grad_norm": 1.2852733194016792, + "grad_norm": 1.219359565194432, "learning_rate": 4.1002694341282275e-06, - "loss": 0.1968, + "loss": 0.1954, "step": 4076 }, { "epoch": 0.5577291381668946, - "grad_norm": 1.4283228968201542, + "grad_norm": 1.3924408202024932, "learning_rate": 4.098155761626674e-06, - "loss": 0.2034, + "loss": 0.202, "step": 4077 }, { "epoch": 0.5578659370725034, - "grad_norm": 1.1880261747999687, + "grad_norm": 1.1841207588714664, "learning_rate": 4.096042255695017e-06, - "loss": 0.1829, + "loss": 0.1845, "step": 4078 }, { "epoch": 0.5580027359781122, - "grad_norm": 1.0101796931759874, + "grad_norm": 0.9914293073141048, "learning_rate": 4.0939289167236196e-06, - "loss": 0.1733, + "loss": 0.172, "step": 4079 }, { "epoch": 0.5581395348837209, - "grad_norm": 1.185458188186279, + "grad_norm": 1.1792216005543101, "learning_rate": 4.091815745102818e-06, - "loss": 0.1537, + "loss": 0.1546, "step": 4080 }, { "epoch": 0.5582763337893297, - "grad_norm": 1.1964763507921763, + "grad_norm": 1.1960602173007484, "learning_rate": 4.089702741222909e-06, - "loss": 0.2026, + "loss": 0.2031, "step": 4081 }, { "epoch": 0.5584131326949384, - "grad_norm": 1.3256991524746145, + "grad_norm": 1.328086975662496, "learning_rate": 4.087589905474165e-06, - "loss": 0.1897, + "loss": 0.189, "step": 4082 }, { "epoch": 0.5585499316005472, - "grad_norm": 1.2269284808889553, + "grad_norm": 1.2359152849672674, "learning_rate": 4.085477238246823e-06, - "loss": 0.189, + "loss": 0.1937, "step": 4083 }, { "epoch": 0.5586867305061559, - "grad_norm": 1.1437370052649038, + "grad_norm": 1.1296554580172753, "learning_rate": 4.083364739931092e-06, - "loss": 0.1575, + "loss": 0.1576, "step": 4084 }, { "epoch": 0.5588235294117647, - "grad_norm": 1.2901104362017366, + "grad_norm": 1.2571676105239886, "learning_rate": 4.081252410917148e-06, - "loss": 0.1958, + "loss": 0.1949, "step": 4085 }, { "epoch": 0.5589603283173734, - "grad_norm": 1.3239425139421712, + "grad_norm": 1.307582360602467, "learning_rate": 4.07914025159514e-06, - "loss": 0.1919, + "loss": 0.1916, "step": 4086 }, { "epoch": 0.5590971272229822, - "grad_norm": 1.4473027696414935, + "grad_norm": 1.4435762252666569, "learning_rate": 4.077028262355176e-06, - "loss": 0.2076, + "loss": 0.2073, "step": 4087 }, { "epoch": 0.559233926128591, - "grad_norm": 1.267161204247247, + "grad_norm": 1.2609760020937006, "learning_rate": 4.074916443587343e-06, - "loss": 0.1912, + "loss": 0.1915, "step": 4088 }, { "epoch": 0.5593707250341997, - "grad_norm": 1.406385224982183, + "grad_norm": 1.3866011887051, "learning_rate": 4.072804795681692e-06, - "loss": 0.205, + "loss": 0.2048, "step": 4089 }, { "epoch": 0.5595075239398085, - "grad_norm": 1.344244004634498, + "grad_norm": 1.312357848157271, "learning_rate": 4.070693319028241e-06, - "loss": 0.1999, + "loss": 0.1993, "step": 4090 }, { "epoch": 0.5596443228454172, - "grad_norm": 1.1500848578956497, + "grad_norm": 1.1413877882574834, "learning_rate": 4.068582014016979e-06, - "loss": 0.1879, + "loss": 0.1897, "step": 4091 }, { "epoch": 0.559781121751026, - "grad_norm": 1.1531247717720392, + "grad_norm": 1.13834960534452, "learning_rate": 4.066470881037863e-06, - "loss": 0.1779, + "loss": 0.18, "step": 4092 }, { "epoch": 0.5599179206566347, - "grad_norm": 1.3848207369843044, + "grad_norm": 1.3294174800465066, "learning_rate": 4.064359920480816e-06, - "loss": 0.1917, + "loss": 0.1901, "step": 4093 }, { "epoch": 0.5600547195622435, - "grad_norm": 1.2921328002577384, + "grad_norm": 1.2790679037809185, "learning_rate": 4.062249132735731e-06, - "loss": 0.175, + "loss": 0.1766, "step": 4094 }, { "epoch": 0.5601915184678523, - "grad_norm": 1.4005221703806163, + "grad_norm": 1.4140777443493184, "learning_rate": 4.060138518192468e-06, - "loss": 0.2136, + "loss": 0.2149, "step": 4095 }, { "epoch": 0.560328317373461, - "grad_norm": 1.194819566898446, + "grad_norm": 1.1944039967929532, "learning_rate": 4.058028077240861e-06, - "loss": 0.1713, + "loss": 0.172, "step": 4096 }, { "epoch": 0.5604651162790698, - "grad_norm": 1.2185899276233079, + "grad_norm": 1.2141339480043492, "learning_rate": 4.055917810270699e-06, - "loss": 0.1813, + "loss": 0.1811, "step": 4097 }, { "epoch": 0.5606019151846785, - "grad_norm": 1.218496172541667, + "grad_norm": 1.2026035605859342, "learning_rate": 4.053807717671753e-06, - "loss": 0.1797, + "loss": 0.1788, "step": 4098 }, { "epoch": 0.5607387140902873, - "grad_norm": 1.2684159811936937, + "grad_norm": 1.2490849321383115, "learning_rate": 4.0516977998337505e-06, - "loss": 0.2085, + "loss": 0.2082, "step": 4099 }, { "epoch": 0.560875512995896, - "grad_norm": 1.1620723070653352, + "grad_norm": 1.1471152510427243, "learning_rate": 4.049588057146395e-06, - "loss": 0.1568, + "loss": 0.1578, "step": 4100 }, { "epoch": 0.560875512995896, - "eval_loss": 0.1833495795726776, - "eval_runtime": 5.9252, - "eval_samples_per_second": 5.063, - "eval_steps_per_second": 1.35, + "eval_loss": 0.18334245681762695, + "eval_runtime": 5.9173, + "eval_samples_per_second": 5.07, + "eval_steps_per_second": 1.352, "step": 4100 }, { "epoch": 0.5610123119015048, - "grad_norm": 1.5029021385971888, + "grad_norm": 1.5692034734698774, "learning_rate": 4.0474784899993515e-06, - "loss": 0.2027, + "loss": 0.2063, "step": 4101 }, { "epoch": 0.5611491108071135, - "grad_norm": 1.1278232123866494, + "grad_norm": 1.1153653404455592, "learning_rate": 4.045369098782259e-06, - "loss": 0.1971, + "loss": 0.1977, "step": 4102 }, { "epoch": 0.5612859097127223, - "grad_norm": 1.2198466134524244, + "grad_norm": 1.2066146494537766, "learning_rate": 4.043259883884717e-06, "loss": 0.1637, "step": 4103 }, { "epoch": 0.5614227086183311, - "grad_norm": 1.1151108544689592, + "grad_norm": 1.1136046254398368, "learning_rate": 4.0411508456962965e-06, - "loss": 0.181, + "loss": 0.1813, "step": 4104 }, { "epoch": 0.5615595075239398, - "grad_norm": 1.1139171525916953, + "grad_norm": 1.1120372442909383, "learning_rate": 4.039041984606538e-06, - "loss": 0.1536, + "loss": 0.154, "step": 4105 }, { "epoch": 0.5616963064295486, - "grad_norm": 1.4066467883638556, + "grad_norm": 1.3779042614900037, "learning_rate": 4.036933301004941e-06, - "loss": 0.2069, + "loss": 0.2064, "step": 4106 }, { "epoch": 0.5618331053351573, - "grad_norm": 1.409067285655184, + "grad_norm": 1.3824551174002089, "learning_rate": 4.034824795280983e-06, "loss": 0.2063, "step": 4107 }, { "epoch": 0.5619699042407661, - "grad_norm": 1.4023073007843245, + "grad_norm": 1.3893527582377707, "learning_rate": 4.0327164678241e-06, - "loss": 0.2099, + "loss": 0.2096, "step": 4108 }, { "epoch": 0.5621067031463748, - "grad_norm": 1.2744508254857283, + "grad_norm": 1.2869658079344537, "learning_rate": 4.0306083190237e-06, - "loss": 0.1981, + "loss": 0.1977, "step": 4109 }, { "epoch": 0.5622435020519836, - "grad_norm": 1.310787925506137, + "grad_norm": 1.2676473783733464, "learning_rate": 4.028500349269156e-06, - "loss": 0.1972, + "loss": 0.1935, "step": 4110 }, { "epoch": 0.5623803009575924, - "grad_norm": 1.292475645144297, + "grad_norm": 1.2702804847034859, "learning_rate": 4.026392558949806e-06, - "loss": 0.2184, + "loss": 0.2182, "step": 4111 }, { "epoch": 0.5625170998632011, - "grad_norm": 1.0747169174524802, + "grad_norm": 1.0773957765926143, "learning_rate": 4.024284948454962e-06, - "loss": 0.1448, + "loss": 0.1449, "step": 4112 }, { "epoch": 0.5626538987688099, - "grad_norm": 0.8863288397039221, + "grad_norm": 0.8902258197947128, "learning_rate": 4.0221775181738935e-06, - "loss": 0.1572, + "loss": 0.1563, "step": 4113 }, { "epoch": 0.5627906976744186, - "grad_norm": 1.1517959856386772, + "grad_norm": 1.1001717948409653, "learning_rate": 4.020070268495844e-06, - "loss": 0.1477, + "loss": 0.1456, "step": 4114 }, { "epoch": 0.5629274965800274, - "grad_norm": 1.1201344568979885, + "grad_norm": 1.110025858342508, "learning_rate": 4.017963199810018e-06, - "loss": 0.1988, + "loss": 0.2005, "step": 4115 }, { "epoch": 0.5630642954856361, - "grad_norm": 1.322822537737897, + "grad_norm": 1.3054465786869056, "learning_rate": 4.015856312505593e-06, - "loss": 0.1824, + "loss": 0.1839, "step": 4116 }, { "epoch": 0.5632010943912449, - "grad_norm": 0.9512908837515228, + "grad_norm": 0.9484301585417385, "learning_rate": 4.013749606971706e-06, - "loss": 0.1569, + "loss": 0.158, "step": 4117 }, { "epoch": 0.5633378932968536, - "grad_norm": 1.1437601580701375, + "grad_norm": 1.1385623351589371, "learning_rate": 4.011643083597467e-06, - "loss": 0.1628, + "loss": 0.1641, "step": 4118 }, { "epoch": 0.5634746922024624, - "grad_norm": 1.046103011400623, + "grad_norm": 1.0463404596598747, "learning_rate": 4.009536742771944e-06, - "loss": 0.1628, + "loss": 0.1624, "step": 4119 }, { "epoch": 0.5636114911080712, - "grad_norm": 1.3555153406227918, + "grad_norm": 1.3530597784093876, "learning_rate": 4.007430584884181e-06, - "loss": 0.1992, + "loss": 0.1971, "step": 4120 }, { "epoch": 0.5637482900136799, - "grad_norm": 1.2688661357106081, + "grad_norm": 1.2473513826637588, "learning_rate": 4.005324610323184e-06, - "loss": 0.197, + "loss": 0.1984, "step": 4121 }, { "epoch": 0.5638850889192887, - "grad_norm": 1.1478310498415316, + "grad_norm": 1.124572662764147, "learning_rate": 4.003218819477923e-06, - "loss": 0.1937, + "loss": 0.1927, "step": 4122 }, { "epoch": 0.5640218878248974, - "grad_norm": 1.2705597340179338, + "grad_norm": 1.2653934375956883, "learning_rate": 4.001113212737335e-06, - "loss": 0.2243, + "loss": 0.223, "step": 4123 }, { "epoch": 0.5641586867305062, - "grad_norm": 1.281658637823631, + "grad_norm": 1.2673118466617774, "learning_rate": 3.999007790490325e-06, - "loss": 0.1811, + "loss": 0.1799, "step": 4124 }, { "epoch": 0.5642954856361149, - "grad_norm": 1.2848630146466309, + "grad_norm": 1.2977048051692828, "learning_rate": 3.996902553125764e-06, - "loss": 0.2122, + "loss": 0.2153, "step": 4125 }, { "epoch": 0.5644322845417237, - "grad_norm": 1.252161062683459, + "grad_norm": 1.253882712366532, "learning_rate": 3.994797501032485e-06, - "loss": 0.1701, + "loss": 0.1691, "step": 4126 }, { "epoch": 0.5645690834473325, - "grad_norm": 1.2589218288791733, + "grad_norm": 1.2079271115848178, "learning_rate": 3.99269263459929e-06, - "loss": 0.183, + "loss": 0.179, "step": 4127 }, { "epoch": 0.5647058823529412, - "grad_norm": 0.961316485684201, + "grad_norm": 0.9668351329463669, "learning_rate": 3.99058795421495e-06, - "loss": 0.1498, + "loss": 0.1501, "step": 4128 }, { "epoch": 0.56484268125855, - "grad_norm": 1.3557358584419243, + "grad_norm": 1.3690817946039444, "learning_rate": 3.988483460268192e-06, - "loss": 0.1936, + "loss": 0.1932, "step": 4129 }, { "epoch": 0.5649794801641587, - "grad_norm": 1.2303311662688077, + "grad_norm": 1.2075701249549557, "learning_rate": 3.986379153147718e-06, - "loss": 0.1704, + "loss": 0.1711, "step": 4130 }, { "epoch": 0.5651162790697675, - "grad_norm": 1.2375094971257803, + "grad_norm": 1.2184398786773434, "learning_rate": 3.98427503324219e-06, - "loss": 0.1932, + "loss": 0.1906, "step": 4131 }, { "epoch": 0.5652530779753762, - "grad_norm": 1.2759180098042793, + "grad_norm": 1.27279463833232, "learning_rate": 3.982171100940239e-06, - "loss": 0.1966, + "loss": 0.1957, "step": 4132 }, { "epoch": 0.565389876880985, - "grad_norm": 1.1914392324047045, + "grad_norm": 1.1786247984620166, "learning_rate": 3.980067356630458e-06, - "loss": 0.1906, + "loss": 0.1909, "step": 4133 }, { "epoch": 0.5655266757865937, - "grad_norm": 1.6363242802431932, + "grad_norm": 1.6278229154353816, "learning_rate": 3.977963800701408e-06, - "loss": 0.2192, + "loss": 0.2193, "step": 4134 }, { "epoch": 0.5656634746922025, - "grad_norm": 1.2038818963985678, + "grad_norm": 1.2115312178106956, "learning_rate": 3.975860433541613e-06, - "loss": 0.1872, + "loss": 0.1878, "step": 4135 }, { "epoch": 0.5658002735978113, - "grad_norm": 1.3647914181138294, + "grad_norm": 1.3503072389497204, "learning_rate": 3.973757255539563e-06, - "loss": 0.1775, + "loss": 0.1766, "step": 4136 }, { "epoch": 0.56593707250342, - "grad_norm": 1.1934855255331158, + "grad_norm": 1.1905884648279716, "learning_rate": 3.971654267083715e-06, "loss": 0.1878, "step": 4137 }, { "epoch": 0.5660738714090288, - "grad_norm": 1.3114095424044627, + "grad_norm": 1.3121850614143151, "learning_rate": 3.969551468562487e-06, - "loss": 0.1699, + "loss": 0.1712, "step": 4138 }, { "epoch": 0.5662106703146375, - "grad_norm": 1.3342108963258728, + "grad_norm": 1.3198366532191028, "learning_rate": 3.967448860364268e-06, - "loss": 0.1789, + "loss": 0.1777, "step": 4139 }, { "epoch": 0.5663474692202463, - "grad_norm": 1.4973636373976278, + "grad_norm": 1.4921529158502291, "learning_rate": 3.965346442877403e-06, - "loss": 0.2186, + "loss": 0.22, "step": 4140 }, { "epoch": 0.566484268125855, - "grad_norm": 1.562283091263524, + "grad_norm": 1.5375376604528612, "learning_rate": 3.963244216490212e-06, - "loss": 0.2304, + "loss": 0.2312, "step": 4141 }, { "epoch": 0.5666210670314638, - "grad_norm": 1.3976901984333001, + "grad_norm": 1.3834512632993625, "learning_rate": 3.961142181590969e-06, - "loss": 0.2252, + "loss": 0.223, "step": 4142 }, { "epoch": 0.5667578659370726, - "grad_norm": 1.1361282054467765, + "grad_norm": 1.1479208599242092, "learning_rate": 3.959040338567921e-06, - "loss": 0.1786, + "loss": 0.1796, "step": 4143 }, { "epoch": 0.5668946648426813, - "grad_norm": 1.1728930287959536, + "grad_norm": 1.168057078209089, "learning_rate": 3.956938687809278e-06, - "loss": 0.1847, + "loss": 0.1841, "step": 4144 }, { "epoch": 0.56703146374829, - "grad_norm": 1.3979837182100923, + "grad_norm": 1.3952938865118336, "learning_rate": 3.95483722970321e-06, - "loss": 0.2218, + "loss": 0.2221, "step": 4145 }, { "epoch": 0.5671682626538987, - "grad_norm": 1.25956674185487, + "grad_norm": 1.248248837260259, "learning_rate": 3.95273596463786e-06, - "loss": 0.1847, + "loss": 0.1835, "step": 4146 }, { "epoch": 0.5673050615595076, - "grad_norm": 1.321370874965711, + "grad_norm": 1.3158750567281665, "learning_rate": 3.950634893001322e-06, - "loss": 0.2046, + "loss": 0.2055, "step": 4147 }, { "epoch": 0.5674418604651162, - "grad_norm": 1.4567160984445442, + "grad_norm": 1.4590170941132077, "learning_rate": 3.948534015181671e-06, - "loss": 0.2425, + "loss": 0.2456, "step": 4148 }, { "epoch": 0.567578659370725, - "grad_norm": 1.2622002714333427, + "grad_norm": 1.2712957244485636, "learning_rate": 3.946433331566929e-06, - "loss": 0.1989, + "loss": 0.199, "step": 4149 }, { "epoch": 0.5677154582763337, - "grad_norm": 1.520324305736975, + "grad_norm": 1.5314602194449731, "learning_rate": 3.944332842545097e-06, - "loss": 0.2085, + "loss": 0.2073, "step": 4150 }, { "epoch": 0.5678522571819425, - "grad_norm": 1.5279333885597925, + "grad_norm": 1.4989696831099122, "learning_rate": 3.942232548504129e-06, - "loss": 0.2206, + "loss": 0.2186, "step": 4151 }, { "epoch": 0.5679890560875513, - "grad_norm": 1.1868954431122416, + "grad_norm": 1.1742129747702221, "learning_rate": 3.940132449831951e-06, - "loss": 0.1729, + "loss": 0.1715, "step": 4152 }, { "epoch": 0.56812585499316, - "grad_norm": 1.1720149844441592, + "grad_norm": 1.168948539022083, "learning_rate": 3.938032546916449e-06, - "loss": 0.1892, + "loss": 0.1884, "step": 4153 }, { "epoch": 0.5682626538987688, - "grad_norm": 1.2506346460847517, + "grad_norm": 1.249530698150688, "learning_rate": 3.9359328401454715e-06, - "loss": 0.2155, + "loss": 0.2169, "step": 4154 }, { "epoch": 0.5683994528043775, - "grad_norm": 1.1558239383308775, + "grad_norm": 1.1451453419864284, "learning_rate": 3.933833329906836e-06, - "loss": 0.1787, + "loss": 0.178, "step": 4155 }, { "epoch": 0.5685362517099863, - "grad_norm": 1.310792633307934, + "grad_norm": 1.2873874862587484, "learning_rate": 3.931734016588316e-06, - "loss": 0.1807, + "loss": 0.1828, "step": 4156 }, { "epoch": 0.568673050615595, - "grad_norm": 1.3418710927222732, + "grad_norm": 1.3280305455203225, "learning_rate": 3.929634900577656e-06, - "loss": 0.2223, + "loss": 0.2218, "step": 4157 }, { "epoch": 0.5688098495212038, - "grad_norm": 1.3441623231478124, + "grad_norm": 1.3187760818841887, "learning_rate": 3.927535982262558e-06, - "loss": 0.2289, + "loss": 0.2255, "step": 4158 }, { "epoch": 0.5689466484268126, - "grad_norm": 1.3503661337556918, + "grad_norm": 1.3622257390180557, "learning_rate": 3.925437262030694e-06, - "loss": 0.2275, + "loss": 0.2284, "step": 4159 }, { "epoch": 0.5690834473324213, - "grad_norm": 1.0876468822221097, + "grad_norm": 1.1060784768424141, "learning_rate": 3.923338740269696e-06, - "loss": 0.1525, + "loss": 0.1526, "step": 4160 }, { "epoch": 0.5692202462380301, - "grad_norm": 1.1250598746055298, + "grad_norm": 1.109116749353596, "learning_rate": 3.921240417367156e-06, "loss": 0.1668, "step": 4161 }, { "epoch": 0.5693570451436388, - "grad_norm": 1.2756626698131786, + "grad_norm": 1.2706634403857808, "learning_rate": 3.919142293710636e-06, - "loss": 0.1963, + "loss": 0.195, "step": 4162 }, { "epoch": 0.5694938440492476, - "grad_norm": 1.1633460484095774, + "grad_norm": 1.1748247264274538, "learning_rate": 3.917044369687655e-06, - "loss": 0.19, + "loss": 0.1933, "step": 4163 }, { "epoch": 0.5696306429548563, - "grad_norm": 1.1290709609136174, + "grad_norm": 1.1336320423402217, "learning_rate": 3.9149466456856995e-06, - "loss": 0.1737, + "loss": 0.1749, "step": 4164 }, { "epoch": 0.5697674418604651, - "grad_norm": 1.3892588464142583, + "grad_norm": 1.3552618020040068, "learning_rate": 3.912849122092216e-06, - "loss": 0.2026, + "loss": 0.2011, "step": 4165 }, { "epoch": 0.5699042407660738, - "grad_norm": 1.2649442570982599, + "grad_norm": 1.2723016064153954, "learning_rate": 3.910751799294617e-06, - "loss": 0.1886, + "loss": 0.1887, "step": 4166 }, { "epoch": 0.5700410396716826, - "grad_norm": 1.4887967455521258, + "grad_norm": 1.4784225876367871, "learning_rate": 3.908654677680274e-06, - "loss": 0.2327, + "loss": 0.2289, "step": 4167 }, { "epoch": 0.5701778385772914, - "grad_norm": 1.2346383862628836, + "grad_norm": 1.2180665616424198, "learning_rate": 3.906557757636526e-06, - "loss": 0.1484, + "loss": 0.1501, "step": 4168 }, { "epoch": 0.5703146374829001, - "grad_norm": 0.983200656435625, + "grad_norm": 0.9722596664825142, "learning_rate": 3.904461039550673e-06, - "loss": 0.1524, + "loss": 0.1529, "step": 4169 }, { "epoch": 0.5704514363885089, - "grad_norm": 1.1112747544950554, + "grad_norm": 1.1043809236600186, "learning_rate": 3.902364523809975e-06, - "loss": 0.1639, + "loss": 0.165, "step": 4170 }, { "epoch": 0.5705882352941176, - "grad_norm": 1.288336291042992, + "grad_norm": 1.2956717793960597, "learning_rate": 3.9002682108016585e-06, - "loss": 0.2116, + "loss": 0.2145, "step": 4171 }, { "epoch": 0.5707250341997264, - "grad_norm": 1.3634502584352655, + "grad_norm": 1.390957047269795, "learning_rate": 3.898172100912908e-06, - "loss": 0.1953, + "loss": 0.2014, "step": 4172 }, { "epoch": 0.5708618331053351, - "grad_norm": 1.1363658631871743, + "grad_norm": 1.135811799670707, "learning_rate": 3.896076194530876e-06, - "loss": 0.1821, + "loss": 0.1822, "step": 4173 }, { "epoch": 0.5709986320109439, - "grad_norm": 1.3321345957185018, + "grad_norm": 1.3270783858618522, "learning_rate": 3.893980492042673e-06, - "loss": 0.2041, + "loss": 0.2063, "step": 4174 }, { "epoch": 0.5711354309165527, - "grad_norm": 1.279843182194464, + "grad_norm": 1.2862260597598116, "learning_rate": 3.8918849938353744e-06, - "loss": 0.2003, + "loss": 0.2022, "step": 4175 }, { "epoch": 0.5712722298221614, - "grad_norm": 1.20163043627635, + "grad_norm": 1.215913848746828, "learning_rate": 3.889789700296019e-06, - "loss": 0.1723, + "loss": 0.1734, "step": 4176 }, { "epoch": 0.5714090287277702, - "grad_norm": 1.2047542298986724, + "grad_norm": 1.189402901837422, "learning_rate": 3.8876946118116025e-06, - "loss": 0.1879, + "loss": 0.1877, "step": 4177 }, { "epoch": 0.5715458276333789, - "grad_norm": 1.2852065220648257, + "grad_norm": 1.26598988420283, "learning_rate": 3.8855997287690885e-06, - "loss": 0.1799, + "loss": 0.1785, "step": 4178 }, { "epoch": 0.5716826265389877, - "grad_norm": 1.2847823784709742, + "grad_norm": 1.2782569922159759, "learning_rate": 3.883505051555398e-06, - "loss": 0.1917, + "loss": 0.1928, "step": 4179 }, { "epoch": 0.5718194254445964, - "grad_norm": 1.2251222451238981, + "grad_norm": 1.2412449937791847, "learning_rate": 3.881410580557417e-06, - "loss": 0.1885, + "loss": 0.1884, "step": 4180 }, { "epoch": 0.5719562243502052, - "grad_norm": 1.2681521203147585, + "grad_norm": 1.2670665006729738, "learning_rate": 3.879316316161991e-06, - "loss": 0.1803, + "loss": 0.1788, "step": 4181 }, { "epoch": 0.5720930232558139, - "grad_norm": 1.3219337012888734, + "grad_norm": 1.3098617376300168, "learning_rate": 3.8772222587559345e-06, - "loss": 0.1975, + "loss": 0.1987, "step": 4182 }, { "epoch": 0.5722298221614227, - "grad_norm": 1.208089798039982, + "grad_norm": 1.2090156831843577, "learning_rate": 3.875128408726012e-06, - "loss": 0.1845, + "loss": 0.1861, "step": 4183 }, { "epoch": 0.5723666210670315, - "grad_norm": 1.0646320794071902, + "grad_norm": 1.0498450942824442, "learning_rate": 3.873034766458959e-06, - "loss": 0.1631, + "loss": 0.1627, "step": 4184 }, { "epoch": 0.5725034199726402, - "grad_norm": 1.2928293989149189, + "grad_norm": 1.294369315129151, "learning_rate": 3.87094133234147e-06, - "loss": 0.2058, + "loss": 0.2072, "step": 4185 }, { "epoch": 0.572640218878249, - "grad_norm": 1.2189247743008644, + "grad_norm": 1.2068313132049038, "learning_rate": 3.8688481067601984e-06, - "loss": 0.1948, + "loss": 0.1949, "step": 4186 }, { "epoch": 0.5727770177838577, - "grad_norm": 1.4662207577196686, + "grad_norm": 1.4672775518587706, "learning_rate": 3.866755090101763e-06, - "loss": 0.1742, + "loss": 0.1749, "step": 4187 }, { "epoch": 0.5729138166894665, - "grad_norm": 1.4747384340718113, + "grad_norm": 1.4544514513405276, "learning_rate": 3.86466228275274e-06, - "loss": 0.1812, + "loss": 0.18, "step": 4188 }, { "epoch": 0.5730506155950752, - "grad_norm": 1.3044400390517727, + "grad_norm": 1.2907819162322032, "learning_rate": 3.862569685099674e-06, - "loss": 0.1985, + "loss": 0.1978, "step": 4189 }, { "epoch": 0.573187414500684, - "grad_norm": 1.3277414163076995, + "grad_norm": 1.2832552149485403, "learning_rate": 3.860477297529059e-06, - "loss": 0.1879, + "loss": 0.193, "step": 4190 }, { "epoch": 0.5733242134062928, - "grad_norm": 1.1121877736047565, + "grad_norm": 1.0974050576315402, "learning_rate": 3.8583851204273624e-06, - "loss": 0.1895, + "loss": 0.1891, "step": 4191 }, { "epoch": 0.5734610123119015, - "grad_norm": 1.060568406395234, + "grad_norm": 1.0623454021858665, "learning_rate": 3.856293154181009e-06, - "loss": 0.1675, + "loss": 0.1688, "step": 4192 }, { "epoch": 0.5735978112175103, - "grad_norm": 1.1515401065158215, + "grad_norm": 1.1357555119520248, "learning_rate": 3.854201399176378e-06, - "loss": 0.1602, + "loss": 0.157, "step": 4193 }, { "epoch": 0.573734610123119, - "grad_norm": 1.5336048354428078, + "grad_norm": 1.5351665858689507, "learning_rate": 3.852109855799818e-06, - "loss": 0.2152, + "loss": 0.2136, "step": 4194 }, { "epoch": 0.5738714090287278, - "grad_norm": 1.4699552605973683, + "grad_norm": 1.4270935039805421, "learning_rate": 3.850018524437636e-06, - "loss": 0.2326, + "loss": 0.2325, "step": 4195 }, { "epoch": 0.5740082079343365, - "grad_norm": 1.0992894524897074, + "grad_norm": 1.1178152375725317, "learning_rate": 3.847927405476097e-06, - "loss": 0.1573, + "loss": 0.1587, "step": 4196 }, { "epoch": 0.5741450068399453, - "grad_norm": 1.2096999261511348, + "grad_norm": 1.203360213072736, "learning_rate": 3.845836499301429e-06, - "loss": 0.1929, + "loss": 0.1908, "step": 4197 }, { "epoch": 0.574281805745554, - "grad_norm": 1.204547319436575, + "grad_norm": 1.1397113335520475, "learning_rate": 3.843745806299826e-06, - "loss": 0.1827, + "loss": 0.1764, "step": 4198 }, { "epoch": 0.5744186046511628, - "grad_norm": 1.2284450198343317, + "grad_norm": 1.2333483095538682, "learning_rate": 3.8416553268574285e-06, - "loss": 0.1591, + "loss": 0.16, "step": 4199 }, { "epoch": 0.5745554035567716, - "grad_norm": 1.4315105391656509, + "grad_norm": 1.4379026158244632, "learning_rate": 3.839565061360352e-06, - "loss": 0.1769, + "loss": 0.1765, "step": 4200 }, { "epoch": 0.5745554035567716, - "eval_loss": 0.18155881762504578, - "eval_runtime": 5.9147, - "eval_samples_per_second": 5.072, - "eval_steps_per_second": 1.353, + "eval_loss": 0.18139298260211945, + "eval_runtime": 5.922, + "eval_samples_per_second": 5.066, + "eval_steps_per_second": 1.351, "step": 4200 }, { "epoch": 0.5746922024623803, - "grad_norm": 1.4741960867093713, + "grad_norm": 1.4657817121255121, "learning_rate": 3.8374750101946675e-06, - "loss": 0.2147, + "loss": 0.2165, "step": 4201 }, { "epoch": 0.5748290013679891, - "grad_norm": 1.2402003723440471, + "grad_norm": 1.2382851828209942, "learning_rate": 3.835385173746402e-06, - "loss": 0.1572, + "loss": 0.1581, "step": 4202 }, { "epoch": 0.5749658002735978, - "grad_norm": 1.1954288429098137, + "grad_norm": 1.1984500359648238, "learning_rate": 3.833295552401549e-06, - "loss": 0.1884, + "loss": 0.1891, "step": 4203 }, { "epoch": 0.5751025991792066, - "grad_norm": 1.2336852531181697, + "grad_norm": 1.2130077358161362, "learning_rate": 3.831206146546059e-06, - "loss": 0.1987, + "loss": 0.1971, "step": 4204 }, { "epoch": 0.5752393980848153, - "grad_norm": 1.584251231098103, + "grad_norm": 1.5784003146044416, "learning_rate": 3.8291169565658455e-06, - "loss": 0.2119, + "loss": 0.2126, "step": 4205 }, { "epoch": 0.5753761969904241, - "grad_norm": 1.3312775800045273, + "grad_norm": 1.2840348193879096, "learning_rate": 3.827027982846778e-06, - "loss": 0.2064, + "loss": 0.2084, "step": 4206 }, { "epoch": 0.5755129958960329, - "grad_norm": 1.091765286844022, + "grad_norm": 1.0721807143485973, "learning_rate": 3.824939225774688e-06, - "loss": 0.1541, + "loss": 0.1533, "step": 4207 }, { "epoch": 0.5756497948016416, - "grad_norm": 1.3163754347676815, + "grad_norm": 1.3031565714071889, "learning_rate": 3.82285068573537e-06, - "loss": 0.2166, + "loss": 0.217, "step": 4208 }, { "epoch": 0.5757865937072504, - "grad_norm": 1.2394965649555507, + "grad_norm": 1.2170671677484834, "learning_rate": 3.820762363114572e-06, - "loss": 0.178, + "loss": 0.1807, "step": 4209 }, { "epoch": 0.5759233926128591, - "grad_norm": 1.4411564803513512, + "grad_norm": 1.4457478164074649, "learning_rate": 3.818674258298008e-06, - "loss": 0.1947, + "loss": 0.1954, "step": 4210 }, { "epoch": 0.5760601915184679, - "grad_norm": 1.1023809497318071, + "grad_norm": 1.0754370363812964, "learning_rate": 3.816586371671349e-06, - "loss": 0.1768, + "loss": 0.1753, "step": 4211 }, { "epoch": 0.5761969904240766, - "grad_norm": 1.2079467140867393, + "grad_norm": 1.178584667982025, "learning_rate": 3.8144987036202265e-06, - "loss": 0.1941, + "loss": 0.1919, "step": 4212 }, { "epoch": 0.5763337893296854, - "grad_norm": 1.2319107470589026, + "grad_norm": 1.222390827180888, "learning_rate": 3.812411254530228e-06, - "loss": 0.1767, + "loss": 0.1777, "step": 4213 }, { "epoch": 0.5764705882352941, - "grad_norm": 1.2272387166165835, + "grad_norm": 1.2102004909382564, "learning_rate": 3.8103240247869077e-06, - "loss": 0.1559, + "loss": 0.1555, "step": 4214 }, { "epoch": 0.5766073871409029, - "grad_norm": 1.0482491763455921, + "grad_norm": 1.0271845184996515, "learning_rate": 3.808237014775772e-06, - "loss": 0.1842, + "loss": 0.1837, "step": 4215 }, { "epoch": 0.5767441860465117, - "grad_norm": 1.0319870833620863, + "grad_norm": 0.9890766014363759, "learning_rate": 3.80615022488229e-06, - "loss": 0.1674, + "loss": 0.1664, "step": 4216 }, { "epoch": 0.5768809849521204, - "grad_norm": 1.1530516359781924, + "grad_norm": 1.141162341854418, "learning_rate": 3.8040636554918937e-06, - "loss": 0.1747, + "loss": 0.1764, "step": 4217 }, { "epoch": 0.5770177838577292, - "grad_norm": 1.3188667688482685, + "grad_norm": 1.332968084440811, "learning_rate": 3.8019773069899666e-06, - "loss": 0.1931, + "loss": 0.196, "step": 4218 }, { "epoch": 0.5771545827633379, - "grad_norm": 1.3724529921722715, + "grad_norm": 1.362872759621371, "learning_rate": 3.7998911797618597e-06, - "loss": 0.2344, + "loss": 0.2369, "step": 4219 }, { "epoch": 0.5772913816689467, - "grad_norm": 1.3170383460123711, + "grad_norm": 1.3194397314978092, "learning_rate": 3.797805274192875e-06, - "loss": 0.1744, + "loss": 0.1764, "step": 4220 }, { "epoch": 0.5774281805745554, - "grad_norm": 1.1291311259261403, + "grad_norm": 1.1210425428133226, "learning_rate": 3.7957195906682815e-06, - "loss": 0.1784, + "loss": 0.1787, "step": 4221 }, { "epoch": 0.5775649794801642, - "grad_norm": 1.2023500241578085, + "grad_norm": 1.1718834373941274, "learning_rate": 3.793634129573298e-06, - "loss": 0.1819, + "loss": 0.181, "step": 4222 }, { "epoch": 0.577701778385773, - "grad_norm": 1.266505670415187, + "grad_norm": 1.262180428719071, "learning_rate": 3.7915488912931116e-06, - "loss": 0.1987, + "loss": 0.1996, "step": 4223 }, { "epoch": 0.5778385772913817, - "grad_norm": 1.1648095505265106, + "grad_norm": 1.1520971801060362, "learning_rate": 3.7894638762128633e-06, - "loss": 0.1796, + "loss": 0.1799, "step": 4224 }, { "epoch": 0.5779753761969905, - "grad_norm": 1.2954696941883503, + "grad_norm": 1.2730585450946588, "learning_rate": 3.7873790847176528e-06, - "loss": 0.2083, + "loss": 0.2117, "step": 4225 }, { "epoch": 0.5781121751025992, - "grad_norm": 1.4790117505804625, + "grad_norm": 1.40689343129627, "learning_rate": 3.7852945171925415e-06, - "loss": 0.206, + "loss": 0.2062, "step": 4226 }, { "epoch": 0.578248974008208, - "grad_norm": 1.1997992903923493, + "grad_norm": 1.1899860062734828, "learning_rate": 3.7832101740225445e-06, - "loss": 0.16, + "loss": 0.1603, "step": 4227 }, { "epoch": 0.5783857729138167, - "grad_norm": 1.140146898200323, + "grad_norm": 1.129884948226344, "learning_rate": 3.781126055592641e-06, - "loss": 0.1652, + "loss": 0.1653, "step": 4228 }, { "epoch": 0.5785225718194255, - "grad_norm": 1.1494524826935746, + "grad_norm": 1.1438171836195716, "learning_rate": 3.7790421622877633e-06, - "loss": 0.1867, + "loss": 0.1871, "step": 4229 }, { "epoch": 0.5786593707250342, - "grad_norm": 1.4131666550140174, + "grad_norm": 1.4002241315407167, "learning_rate": 3.7769584944928067e-06, - "loss": 0.1949, + "loss": 0.1955, "step": 4230 }, { "epoch": 0.578796169630643, - "grad_norm": 1.1283483904580534, + "grad_norm": 1.1130297479847935, "learning_rate": 3.7748750525926226e-06, - "loss": 0.1448, + "loss": 0.1451, "step": 4231 }, { "epoch": 0.5789329685362518, - "grad_norm": 1.25326197665693, + "grad_norm": 1.2529463752704844, "learning_rate": 3.772791836972019e-06, - "loss": 0.2201, + "loss": 0.2189, "step": 4232 }, { "epoch": 0.5790697674418605, - "grad_norm": 1.2372263447728087, + "grad_norm": 1.2377939502355597, "learning_rate": 3.7707088480157676e-06, - "loss": 0.1635, + "loss": 0.1646, "step": 4233 }, { "epoch": 0.5792065663474693, - "grad_norm": 1.385910658601336, + "grad_norm": 1.368909479064393, "learning_rate": 3.7686260861085924e-06, "loss": 0.2161, "step": 4234 }, { "epoch": 0.579343365253078, - "grad_norm": 1.2701790356602627, + "grad_norm": 1.2638860611023364, "learning_rate": 3.7665435516351802e-06, - "loss": 0.1952, + "loss": 0.1972, "step": 4235 }, { "epoch": 0.5794801641586868, - "grad_norm": 1.3571381783348109, + "grad_norm": 1.3475006898060622, "learning_rate": 3.7644612449801693e-06, - "loss": 0.2173, + "loss": 0.2198, "step": 4236 }, { "epoch": 0.5796169630642954, - "grad_norm": 1.3016414258866935, + "grad_norm": 1.313590368687608, "learning_rate": 3.7623791665281646e-06, - "loss": 0.1976, + "loss": 0.1999, "step": 4237 }, { "epoch": 0.5797537619699042, - "grad_norm": 1.226117686412175, + "grad_norm": 1.226034078444593, "learning_rate": 3.76029731666372e-06, - "loss": 0.1797, + "loss": 0.1804, "step": 4238 }, { "epoch": 0.579890560875513, - "grad_norm": 1.3099840222502475, + "grad_norm": 1.2709890994011421, "learning_rate": 3.7582156957713545e-06, - "loss": 0.1887, + "loss": 0.1871, "step": 4239 }, { "epoch": 0.5800273597811217, - "grad_norm": 1.2949334135887607, + "grad_norm": 1.2433054013397797, "learning_rate": 3.7561343042355415e-06, - "loss": 0.1623, + "loss": 0.1596, "step": 4240 }, { "epoch": 0.5801641586867305, - "grad_norm": 1.0137371517762392, + "grad_norm": 0.9983771705855377, "learning_rate": 3.7540531424407107e-06, - "loss": 0.1659, + "loss": 0.1647, "step": 4241 }, { "epoch": 0.5803009575923392, - "grad_norm": 1.331874618877039, + "grad_norm": 1.3269113278998703, "learning_rate": 3.751972210771254e-06, - "loss": 0.2002, + "loss": 0.2046, "step": 4242 }, { "epoch": 0.580437756497948, - "grad_norm": 1.3900867001389319, + "grad_norm": 1.383767130519837, "learning_rate": 3.749891509611514e-06, - "loss": 0.1837, + "loss": 0.1831, "step": 4243 }, { "epoch": 0.5805745554035567, - "grad_norm": 1.2099313286647644, + "grad_norm": 1.2085014068259463, "learning_rate": 3.7478110393457985e-06, - "loss": 0.1823, + "loss": 0.1833, "step": 4244 }, { "epoch": 0.5807113543091655, - "grad_norm": 1.6458167181258807, + "grad_norm": 1.6013714360062288, "learning_rate": 3.7457308003583643e-06, - "loss": 0.2442, + "loss": 0.2397, "step": 4245 }, { "epoch": 0.5808481532147742, - "grad_norm": 1.226115847320479, + "grad_norm": 1.2116276650352125, "learning_rate": 3.743650793033433e-06, - "loss": 0.1839, + "loss": 0.1822, "step": 4246 }, { "epoch": 0.580984952120383, - "grad_norm": 1.3718898658866616, + "grad_norm": 1.3652761478088047, "learning_rate": 3.741571017755179e-06, "loss": 0.205, "step": 4247 }, { "epoch": 0.5811217510259918, - "grad_norm": 1.3633607127773604, + "grad_norm": 1.364815205940349, "learning_rate": 3.739491474907735e-06, - "loss": 0.1793, + "loss": 0.18, "step": 4248 }, { "epoch": 0.5812585499316005, - "grad_norm": 1.039691812367506, + "grad_norm": 1.029848469266396, "learning_rate": 3.7374121648751926e-06, - "loss": 0.1527, + "loss": 0.1539, "step": 4249 }, { "epoch": 0.5813953488372093, - "grad_norm": 1.2712383803034508, + "grad_norm": 1.3083896566264406, "learning_rate": 3.7353330880415963e-06, - "loss": 0.2178, + "loss": 0.2207, "step": 4250 }, { "epoch": 0.581532147742818, - "grad_norm": 1.0147995624721406, + "grad_norm": 1.0180532956747455, "learning_rate": 3.7332542447909525e-06, - "loss": 0.1804, + "loss": 0.181, "step": 4251 }, { "epoch": 0.5816689466484268, - "grad_norm": 1.2484597840948035, + "grad_norm": 1.2513078785482532, "learning_rate": 3.731175635507219e-06, - "loss": 0.1741, + "loss": 0.1744, "step": 4252 }, { "epoch": 0.5818057455540355, - "grad_norm": 1.2107497248556958, + "grad_norm": 1.2226020157403972, "learning_rate": 3.7290972605743153e-06, - "loss": 0.1905, + "loss": 0.1923, "step": 4253 }, { "epoch": 0.5819425444596443, - "grad_norm": 1.3163045565497526, + "grad_norm": 1.308284583133585, "learning_rate": 3.727019120376114e-06, - "loss": 0.194, + "loss": 0.1962, "step": 4254 }, { "epoch": 0.5820793433652531, - "grad_norm": 1.1021678575437688, + "grad_norm": 1.0953769122208892, "learning_rate": 3.7249412152964465e-06, - "loss": 0.1701, + "loss": 0.1691, "step": 4255 }, { "epoch": 0.5822161422708618, - "grad_norm": 1.1939618015339297, + "grad_norm": 1.2011942320807605, "learning_rate": 3.7228635457191028e-06, - "loss": 0.1712, + "loss": 0.1716, "step": 4256 }, { "epoch": 0.5823529411764706, - "grad_norm": 1.3019067211708897, + "grad_norm": 1.319448961687473, "learning_rate": 3.720786112027822e-06, - "loss": 0.1743, + "loss": 0.1752, "step": 4257 }, { "epoch": 0.5824897400820793, - "grad_norm": 1.468711751256746, + "grad_norm": 1.457644999067365, "learning_rate": 3.718708914606309e-06, - "loss": 0.2441, + "loss": 0.2452, "step": 4258 }, { "epoch": 0.5826265389876881, - "grad_norm": 1.1834007759165175, + "grad_norm": 1.1699863614791655, "learning_rate": 3.716631953838217e-06, - "loss": 0.163, + "loss": 0.1605, "step": 4259 }, { "epoch": 0.5827633378932968, - "grad_norm": 0.7744878921604113, + "grad_norm": 0.7674686370560783, "learning_rate": 3.71455523010716e-06, - "loss": 0.1256, + "loss": 0.1248, "step": 4260 }, { "epoch": 0.5829001367989056, - "grad_norm": 1.3571756232124208, + "grad_norm": 1.3602527011018022, "learning_rate": 3.712478743796707e-06, - "loss": 0.2017, + "loss": 0.2035, "step": 4261 }, { "epoch": 0.5830369357045143, - "grad_norm": 1.3719274241156398, + "grad_norm": 1.3659558320313818, "learning_rate": 3.710402495290386e-06, - "loss": 0.2213, + "loss": 0.2195, "step": 4262 }, { "epoch": 0.5831737346101231, - "grad_norm": 1.3841571737896377, + "grad_norm": 1.369013321360291, "learning_rate": 3.7083264849716743e-06, - "loss": 0.1609, + "loss": 0.1608, "step": 4263 }, { "epoch": 0.5833105335157319, - "grad_norm": 1.1843501680149149, + "grad_norm": 1.188747941461714, "learning_rate": 3.7062507132240112e-06, - "loss": 0.1583, + "loss": 0.1584, "step": 4264 }, { "epoch": 0.5834473324213406, - "grad_norm": 1.3594552927859713, + "grad_norm": 1.3824588213381581, "learning_rate": 3.7041751804307923e-06, - "loss": 0.2028, + "loss": 0.2041, "step": 4265 }, { "epoch": 0.5835841313269494, - "grad_norm": 1.0206390982395657, + "grad_norm": 1.012001492135155, "learning_rate": 3.702099886975362e-06, - "loss": 0.1786, + "loss": 0.1792, "step": 4266 }, { "epoch": 0.5837209302325581, - "grad_norm": 1.313334303580546, + "grad_norm": 1.3081990052174637, "learning_rate": 3.7000248332410303e-06, - "loss": 0.1849, + "loss": 0.1835, "step": 4267 }, { "epoch": 0.5838577291381669, - "grad_norm": 1.2281140008127507, + "grad_norm": 1.2157595097935532, "learning_rate": 3.697950019611054e-06, - "loss": 0.186, + "loss": 0.1871, "step": 4268 }, { "epoch": 0.5839945280437756, - "grad_norm": 1.1091343527132367, + "grad_norm": 1.1081174719165878, "learning_rate": 3.695875446468651e-06, - "loss": 0.1712, + "loss": 0.1724, "step": 4269 }, { "epoch": 0.5841313269493844, - "grad_norm": 1.11381939087296, + "grad_norm": 1.0842338006043744, "learning_rate": 3.6938011141969933e-06, - "loss": 0.177, + "loss": 0.1759, "step": 4270 }, { "epoch": 0.5842681258549932, - "grad_norm": 1.2130786788533756, + "grad_norm": 1.1925511453240047, "learning_rate": 3.691727023179208e-06, - "loss": 0.1656, + "loss": 0.1637, "step": 4271 }, { "epoch": 0.5844049247606019, - "grad_norm": 1.0418127576608291, + "grad_norm": 1.0243812460683563, "learning_rate": 3.689653173798381e-06, - "loss": 0.145, + "loss": 0.144, "step": 4272 }, { "epoch": 0.5845417236662107, - "grad_norm": 1.277106208273908, + "grad_norm": 1.2844199540362675, "learning_rate": 3.6875795664375453e-06, - "loss": 0.1918, + "loss": 0.1928, "step": 4273 }, { "epoch": 0.5846785225718194, - "grad_norm": 1.4708039991040618, + "grad_norm": 1.3503829154539047, "learning_rate": 3.6855062014796994e-06, - "loss": 0.199, + "loss": 0.1978, "step": 4274 }, { "epoch": 0.5848153214774282, - "grad_norm": 1.2782180878278637, + "grad_norm": 1.2457603714335508, "learning_rate": 3.6834330793077876e-06, - "loss": 0.2059, + "loss": 0.2035, "step": 4275 }, { "epoch": 0.5849521203830369, - "grad_norm": 1.3181063715925538, + "grad_norm": 1.326795344040912, "learning_rate": 3.6813602003047185e-06, "loss": 0.1989, "step": 4276 }, { "epoch": 0.5850889192886457, - "grad_norm": 1.3238621293944635, + "grad_norm": 1.3430381413111225, "learning_rate": 3.6792875648533466e-06, - "loss": 0.1899, + "loss": 0.1926, "step": 4277 }, { "epoch": 0.5852257181942544, - "grad_norm": 1.229072172208604, + "grad_norm": 1.2238458005509871, "learning_rate": 3.6772151733364915e-06, - "loss": 0.2005, + "loss": 0.1997, "step": 4278 }, { "epoch": 0.5853625170998632, - "grad_norm": 1.097223809157109, + "grad_norm": 1.0975514576013492, "learning_rate": 3.675143026136916e-06, - "loss": 0.1467, + "loss": 0.1474, "step": 4279 }, { "epoch": 0.585499316005472, - "grad_norm": 1.1085237421735468, + "grad_norm": 1.0968194815017598, "learning_rate": 3.6730711236373474e-06, - "loss": 0.1754, + "loss": 0.1759, "step": 4280 }, { "epoch": 0.5856361149110807, - "grad_norm": 1.2996960327235856, + "grad_norm": 1.277953802793092, "learning_rate": 3.6709994662204662e-06, - "loss": 0.2332, + "loss": 0.2297, "step": 4281 }, { "epoch": 0.5857729138166895, - "grad_norm": 1.1591871578041122, + "grad_norm": 1.151984844692615, "learning_rate": 3.668928054268901e-06, - "loss": 0.2156, + "loss": 0.2144, "step": 4282 }, { "epoch": 0.5859097127222982, - "grad_norm": 1.8164650944386453, + "grad_norm": 1.7489964784658454, "learning_rate": 3.6668568881652446e-06, - "loss": 0.2374, + "loss": 0.2312, "step": 4283 }, { "epoch": 0.586046511627907, - "grad_norm": 1.241374004147804, + "grad_norm": 1.2392452262165872, "learning_rate": 3.664785968292036e-06, - "loss": 0.1941, + "loss": 0.1969, "step": 4284 }, { "epoch": 0.5861833105335157, - "grad_norm": 1.4003897058928227, + "grad_norm": 1.409372799220218, "learning_rate": 3.6627152950317762e-06, - "loss": 0.2143, + "loss": 0.2146, "step": 4285 }, { "epoch": 0.5863201094391245, - "grad_norm": 1.2052587606545628, + "grad_norm": 1.193383678426858, "learning_rate": 3.660644868766913e-06, - "loss": 0.1832, + "loss": 0.1816, "step": 4286 }, { "epoch": 0.5864569083447333, - "grad_norm": 1.4836738635501598, + "grad_norm": 1.4876843226860825, "learning_rate": 3.6585746898798533e-06, - "loss": 0.2314, + "loss": 0.2296, "step": 4287 }, { "epoch": 0.586593707250342, - "grad_norm": 1.0287131324328114, + "grad_norm": 1.02548460864398, "learning_rate": 3.6565047587529616e-06, - "loss": 0.1829, + "loss": 0.1852, "step": 4288 }, { "epoch": 0.5867305061559508, - "grad_norm": 1.2708222172441448, + "grad_norm": 1.2744183900692208, "learning_rate": 3.6544350757685464e-06, - "loss": 0.2119, + "loss": 0.2131, "step": 4289 }, { "epoch": 0.5868673050615595, - "grad_norm": 1.264863701695704, + "grad_norm": 1.257964850216594, "learning_rate": 3.652365641308881e-06, - "loss": 0.1844, + "loss": 0.1832, "step": 4290 }, { "epoch": 0.5870041039671683, - "grad_norm": 1.1423931408285308, + "grad_norm": 1.1281802962834933, "learning_rate": 3.6502964557561852e-06, - "loss": 0.1574, + "loss": 0.158, "step": 4291 }, { "epoch": 0.587140902872777, - "grad_norm": 1.3456545873420844, + "grad_norm": 1.3377090825086548, "learning_rate": 3.64822751949264e-06, - "loss": 0.1806, + "loss": 0.18, "step": 4292 }, { "epoch": 0.5872777017783858, - "grad_norm": 1.352377091941991, + "grad_norm": 1.3642367224732481, "learning_rate": 3.6461588329003707e-06, - "loss": 0.2138, + "loss": 0.2143, "step": 4293 }, { "epoch": 0.5874145006839945, - "grad_norm": 1.1478176162882265, + "grad_norm": 1.1325627310684516, "learning_rate": 3.644090396361467e-06, - "loss": 0.1812, + "loss": 0.1809, "step": 4294 }, { "epoch": 0.5875512995896033, - "grad_norm": 1.450674427121925, + "grad_norm": 1.4251964236168864, "learning_rate": 3.6420222102579634e-06, - "loss": 0.195, + "loss": 0.1933, "step": 4295 }, { "epoch": 0.5876880984952121, - "grad_norm": 1.4925343179415123, + "grad_norm": 1.3993363142794697, "learning_rate": 3.639954274971854e-06, - "loss": 0.2073, + "loss": 0.2066, "step": 4296 }, { "epoch": 0.5878248974008208, - "grad_norm": 1.2893416247250156, + "grad_norm": 1.2864403443704489, "learning_rate": 3.6378865908850857e-06, - "loss": 0.1834, + "loss": 0.1846, "step": 4297 }, { "epoch": 0.5879616963064296, - "grad_norm": 1.203255381757867, + "grad_norm": 1.1741220443404186, "learning_rate": 3.635819158379558e-06, - "loss": 0.1535, + "loss": 0.1529, "step": 4298 }, { "epoch": 0.5880984952120383, - "grad_norm": 1.2448433386396585, + "grad_norm": 1.2254336938554014, "learning_rate": 3.6337519778371223e-06, - "loss": 0.1942, + "loss": 0.1948, "step": 4299 }, { "epoch": 0.5882352941176471, - "grad_norm": 1.2561354733393137, + "grad_norm": 1.2473490262567362, "learning_rate": 3.6316850496395863e-06, - "loss": 0.2005, + "loss": 0.2, "step": 4300 }, { "epoch": 0.5882352941176471, - "eval_loss": 0.18216826021671295, - "eval_runtime": 5.9051, - "eval_samples_per_second": 5.08, - "eval_steps_per_second": 1.355, + "eval_loss": 0.1818574070930481, + "eval_runtime": 5.9319, + "eval_samples_per_second": 5.057, + "eval_steps_per_second": 1.349, "step": 4300 }, { "epoch": 0.5883720930232558, - "grad_norm": 1.1595359249562196, + "grad_norm": 1.1661421945341772, "learning_rate": 3.6296183741687106e-06, - "loss": 0.1696, + "loss": 0.1708, "step": 4301 }, { "epoch": 0.5885088919288646, - "grad_norm": 1.1717967119848116, + "grad_norm": 1.158473364146616, "learning_rate": 3.6275519518062065e-06, - "loss": 0.1605, + "loss": 0.1595, "step": 4302 }, { "epoch": 0.5886456908344734, - "grad_norm": 1.2805871926071346, + "grad_norm": 1.265553434464519, "learning_rate": 3.625485782933741e-06, - "loss": 0.1642, + "loss": 0.1639, "step": 4303 }, { "epoch": 0.5887824897400821, - "grad_norm": 1.3755046381477052, + "grad_norm": 1.3683914592813502, "learning_rate": 3.623419867932937e-06, - "loss": 0.1676, + "loss": 0.1692, "step": 4304 }, { "epoch": 0.5889192886456909, - "grad_norm": 1.3814032637153717, + "grad_norm": 1.3470834739587698, "learning_rate": 3.6213542071853623e-06, - "loss": 0.2044, + "loss": 0.2016, "step": 4305 }, { "epoch": 0.5890560875512996, - "grad_norm": 1.326942675630151, + "grad_norm": 1.3061398518401175, "learning_rate": 3.619288801072547e-06, - "loss": 0.1775, + "loss": 0.1762, "step": 4306 }, { "epoch": 0.5891928864569084, - "grad_norm": 1.189596166584337, + "grad_norm": 1.117688350241408, "learning_rate": 3.617223649975966e-06, - "loss": 0.1804, + "loss": 0.1792, "step": 4307 }, { "epoch": 0.5893296853625171, - "grad_norm": 1.2806353983302832, + "grad_norm": 1.273888410325555, "learning_rate": 3.6151587542770567e-06, - "loss": 0.1991, + "loss": 0.1995, "step": 4308 }, { "epoch": 0.5894664842681259, - "grad_norm": 1.1771564898988789, + "grad_norm": 1.1583564577684502, "learning_rate": 3.6130941143571974e-06, - "loss": 0.2006, + "loss": 0.1995, "step": 4309 }, { "epoch": 0.5896032831737346, - "grad_norm": 1.378465708499296, + "grad_norm": 1.3774110186040596, "learning_rate": 3.6110297305977303e-06, - "loss": 0.1897, + "loss": 0.1908, "step": 4310 }, { "epoch": 0.5897400820793434, - "grad_norm": 1.1565516716777953, + "grad_norm": 1.1246921088177764, "learning_rate": 3.6089656033799412e-06, - "loss": 0.1827, + "loss": 0.1813, "step": 4311 }, { "epoch": 0.5898768809849522, - "grad_norm": 1.3755727020754989, + "grad_norm": 1.3706579954510576, "learning_rate": 3.606901733085075e-06, - "loss": 0.1961, + "loss": 0.196, "step": 4312 }, { "epoch": 0.5900136798905609, - "grad_norm": 1.121475829009891, + "grad_norm": 1.1176570879467553, "learning_rate": 3.604838120094328e-06, - "loss": 0.1781, + "loss": 0.1789, "step": 4313 }, { "epoch": 0.5901504787961697, - "grad_norm": 1.2199649659673095, + "grad_norm": 1.1931267085380661, "learning_rate": 3.602774764788845e-06, - "loss": 0.2102, + "loss": 0.2064, "step": 4314 }, { "epoch": 0.5902872777017784, - "grad_norm": 1.2123696957315, + "grad_norm": 1.1984332022093094, "learning_rate": 3.60071166754973e-06, - "loss": 0.1787, + "loss": 0.1789, "step": 4315 }, { "epoch": 0.5904240766073872, - "grad_norm": 1.1931549453128407, + "grad_norm": 1.1911692674540937, "learning_rate": 3.598648828758031e-06, - "loss": 0.1645, + "loss": 0.1646, "step": 4316 }, { "epoch": 0.5905608755129959, - "grad_norm": 0.9584537243571745, + "grad_norm": 0.9765162546166467, "learning_rate": 3.5965862487947567e-06, - "loss": 0.1436, + "loss": 0.1452, "step": 4317 }, { "epoch": 0.5906976744186047, - "grad_norm": 1.3045375230464036, + "grad_norm": 1.3370494854358772, "learning_rate": 3.5945239280408596e-06, - "loss": 0.2108, + "loss": 0.2127, "step": 4318 }, { "epoch": 0.5908344733242135, - "grad_norm": 1.0915909203196346, + "grad_norm": 1.083508974764094, "learning_rate": 3.5924618668772503e-06, - "loss": 0.1548, + "loss": 0.1573, "step": 4319 }, { "epoch": 0.5909712722298222, - "grad_norm": 1.4752314299384939, + "grad_norm": 1.4694131990460773, "learning_rate": 3.5904000656847928e-06, - "loss": 0.219, + "loss": 0.216, "step": 4320 }, { "epoch": 0.591108071135431, - "grad_norm": 1.2676944054863264, + "grad_norm": 1.2767235444915226, "learning_rate": 3.5883385248442958e-06, - "loss": 0.194, + "loss": 0.1925, "step": 4321 }, { "epoch": 0.5912448700410396, - "grad_norm": 1.3351583200197243, + "grad_norm": 1.3259261400387263, "learning_rate": 3.5862772447365285e-06, - "loss": 0.1923, + "loss": 0.1924, "step": 4322 }, { "epoch": 0.5913816689466485, - "grad_norm": 1.173485950926141, + "grad_norm": 1.1529549436239428, "learning_rate": 3.584216225742203e-06, - "loss": 0.1792, + "loss": 0.1785, "step": 4323 }, { "epoch": 0.5915184678522571, - "grad_norm": 1.1719156907335844, + "grad_norm": 1.1448270716909232, "learning_rate": 3.582155468241993e-06, - "loss": 0.1953, + "loss": 0.1944, "step": 4324 }, { "epoch": 0.591655266757866, - "grad_norm": 1.1951244064063822, + "grad_norm": 1.2075004111727075, "learning_rate": 3.5800949726165136e-06, - "loss": 0.1814, + "loss": 0.1839, "step": 4325 }, { "epoch": 0.5917920656634746, - "grad_norm": 1.5637521760099764, + "grad_norm": 1.5472344514983731, "learning_rate": 3.5780347392463394e-06, - "loss": 0.2548, + "loss": 0.2523, "step": 4326 }, { "epoch": 0.5919288645690834, - "grad_norm": 1.1929022199616852, + "grad_norm": 1.1854164548939825, "learning_rate": 3.5759747685119926e-06, - "loss": 0.1803, + "loss": 0.1804, "step": 4327 }, { "epoch": 0.5920656634746922, - "grad_norm": 1.186682488443684, + "grad_norm": 1.1780103441141974, "learning_rate": 3.5739150607939487e-06, - "loss": 0.2029, + "loss": 0.2035, "step": 4328 }, { "epoch": 0.5922024623803009, - "grad_norm": 1.3299157455722366, + "grad_norm": 1.3453381066686345, "learning_rate": 3.571855616472637e-06, - "loss": 0.1928, + "loss": 0.1975, "step": 4329 }, { "epoch": 0.5923392612859097, - "grad_norm": 1.212423453864614, + "grad_norm": 1.206348684363832, "learning_rate": 3.5697964359284303e-06, - "loss": 0.1844, + "loss": 0.1835, "step": 4330 }, { "epoch": 0.5924760601915184, - "grad_norm": 1.1317292726648527, + "grad_norm": 1.1318340084232992, "learning_rate": 3.567737519541662e-06, - "loss": 0.199, + "loss": 0.2018, "step": 4331 }, { "epoch": 0.5926128590971272, - "grad_norm": 1.3746222712750027, + "grad_norm": 1.3701005012784038, "learning_rate": 3.5656788676926067e-06, "loss": 0.195, "step": 4332 }, { "epoch": 0.5927496580027359, - "grad_norm": 1.0648400874079607, + "grad_norm": 1.0545135963380958, "learning_rate": 3.5636204807615015e-06, - "loss": 0.1516, + "loss": 0.1523, "step": 4333 }, { "epoch": 0.5928864569083447, - "grad_norm": 1.2963123210882515, + "grad_norm": 1.2989874981457001, "learning_rate": 3.561562359128526e-06, - "loss": 0.1816, + "loss": 0.1825, "step": 4334 }, { "epoch": 0.5930232558139535, - "grad_norm": 1.1855844620000753, + "grad_norm": 1.183069315214231, "learning_rate": 3.5595045031738123e-06, - "loss": 0.1934, + "loss": 0.1931, "step": 4335 }, { "epoch": 0.5931600547195622, - "grad_norm": 1.256777917492871, + "grad_norm": 1.2740343517480257, "learning_rate": 3.557446913277448e-06, - "loss": 0.1907, + "loss": 0.193, "step": 4336 }, { "epoch": 0.593296853625171, - "grad_norm": 1.4604156248264737, + "grad_norm": 1.4562730828223691, "learning_rate": 3.5553895898194655e-06, - "loss": 0.213, + "loss": 0.2132, "step": 4337 }, { "epoch": 0.5934336525307797, - "grad_norm": 1.330197170001677, + "grad_norm": 1.3208020556509417, "learning_rate": 3.553332533179854e-06, - "loss": 0.22, + "loss": 0.2183, "step": 4338 }, { "epoch": 0.5935704514363885, - "grad_norm": 1.236967406361948, + "grad_norm": 1.25253363401978, "learning_rate": 3.551275743738546e-06, - "loss": 0.1901, + "loss": 0.1927, "step": 4339 }, { "epoch": 0.5937072503419972, - "grad_norm": 1.0859544910498917, + "grad_norm": 1.0793739776087465, "learning_rate": 3.5492192218754327e-06, - "loss": 0.1754, + "loss": 0.1753, "step": 4340 }, { "epoch": 0.593844049247606, - "grad_norm": 1.5500320280758773, + "grad_norm": 1.5322792704179438, "learning_rate": 3.547162967970348e-06, - "loss": 0.2329, + "loss": 0.2306, "step": 4341 }, { "epoch": 0.5939808481532147, - "grad_norm": 1.420864436015918, + "grad_norm": 1.4198957151154858, "learning_rate": 3.5451069824030838e-06, - "loss": 0.2195, + "loss": 0.2192, "step": 4342 }, { "epoch": 0.5941176470588235, - "grad_norm": 1.2554550711152717, + "grad_norm": 1.2698396873800326, "learning_rate": 3.5430512655533774e-06, - "loss": 0.1952, + "loss": 0.1939, "step": 4343 }, { "epoch": 0.5942544459644323, - "grad_norm": 1.1810738900489417, + "grad_norm": 1.1587812145690455, "learning_rate": 3.5409958178009175e-06, - "loss": 0.1853, + "loss": 0.184, "step": 4344 }, { "epoch": 0.594391244870041, - "grad_norm": 1.4042399072060952, + "grad_norm": 1.3949495868758721, "learning_rate": 3.5389406395253466e-06, - "loss": 0.1856, + "loss": 0.1867, "step": 4345 }, { "epoch": 0.5945280437756498, - "grad_norm": 1.1877467559998736, + "grad_norm": 1.202503908426977, "learning_rate": 3.536885731106251e-06, - "loss": 0.2066, + "loss": 0.2081, "step": 4346 }, { "epoch": 0.5946648426812585, - "grad_norm": 1.1996787567752387, + "grad_norm": 1.203857256352914, "learning_rate": 3.5348310929231737e-06, - "loss": 0.1884, + "loss": 0.1902, "step": 4347 }, { "epoch": 0.5948016415868673, - "grad_norm": 1.2523172554159199, + "grad_norm": 1.2331964186225715, "learning_rate": 3.532776725355601e-06, - "loss": 0.1965, + "loss": 0.1956, "step": 4348 }, { "epoch": 0.594938440492476, - "grad_norm": 1.1515920946857683, + "grad_norm": 1.1481358256522156, "learning_rate": 3.5307226287829755e-06, - "loss": 0.1647, + "loss": 0.165, "step": 4349 }, { "epoch": 0.5950752393980848, - "grad_norm": 1.128990141614416, + "grad_norm": 1.1450674465436712, "learning_rate": 3.5286688035846863e-06, - "loss": 0.1898, + "loss": 0.1911, "step": 4350 }, { "epoch": 0.5952120383036936, - "grad_norm": 1.1722230674021812, + "grad_norm": 1.1600497270750676, "learning_rate": 3.526615250140073e-06, - "loss": 0.1762, + "loss": 0.1757, "step": 4351 }, { "epoch": 0.5953488372093023, - "grad_norm": 1.055053373356056, + "grad_norm": 1.0172576007324157, "learning_rate": 3.5245619688284277e-06, - "loss": 0.1767, + "loss": 0.1755, "step": 4352 }, { "epoch": 0.5954856361149111, - "grad_norm": 1.128811518269174, + "grad_norm": 1.1279923328359207, "learning_rate": 3.5225089600289864e-06, - "loss": 0.1559, + "loss": 0.1553, "step": 4353 }, { "epoch": 0.5956224350205198, - "grad_norm": 1.2529190404304795, + "grad_norm": 1.2274504981329337, "learning_rate": 3.5204562241209407e-06, - "loss": 0.1846, + "loss": 0.1856, "step": 4354 }, { "epoch": 0.5957592339261286, - "grad_norm": 1.009751065155822, + "grad_norm": 1.0020854504031953, "learning_rate": 3.5184037614834267e-06, - "loss": 0.1493, + "loss": 0.1486, "step": 4355 }, { "epoch": 0.5958960328317373, - "grad_norm": 1.355671875020765, + "grad_norm": 1.3546682008591362, "learning_rate": 3.5163515724955354e-06, - "loss": 0.1536, + "loss": 0.154, "step": 4356 }, { "epoch": 0.5960328317373461, - "grad_norm": 1.0698294145949292, + "grad_norm": 1.0612648346748927, "learning_rate": 3.5142996575363013e-06, - "loss": 0.1616, + "loss": 0.1635, "step": 4357 }, { "epoch": 0.5961696306429548, - "grad_norm": 1.3073612540289863, + "grad_norm": 1.2898615257018555, "learning_rate": 3.512248016984716e-06, - "loss": 0.1795, + "loss": 0.1772, "step": 4358 }, { "epoch": 0.5963064295485636, - "grad_norm": 1.3891892474406826, + "grad_norm": 1.3766284482790474, "learning_rate": 3.5101966512197107e-06, - "loss": 0.1883, + "loss": 0.188, "step": 4359 }, { "epoch": 0.5964432284541724, - "grad_norm": 1.2023711536810207, + "grad_norm": 1.1763034888757697, "learning_rate": 3.508145560620173e-06, - "loss": 0.1897, + "loss": 0.1856, "step": 4360 }, { "epoch": 0.5965800273597811, - "grad_norm": 1.373106669274894, + "grad_norm": 1.358235616229322, "learning_rate": 3.506094745564941e-06, - "loss": 0.1912, + "loss": 0.1889, "step": 4361 }, { "epoch": 0.5967168262653899, - "grad_norm": 1.0321082941139457, + "grad_norm": 1.026147802932694, "learning_rate": 3.5040442064327922e-06, - "loss": 0.1659, + "loss": 0.1641, "step": 4362 }, { "epoch": 0.5968536251709986, - "grad_norm": 1.348481044752726, + "grad_norm": 1.3491798225138538, "learning_rate": 3.5019939436024647e-06, - "loss": 0.1741, + "loss": 0.173, "step": 4363 }, { "epoch": 0.5969904240766074, - "grad_norm": 1.2408024028415, + "grad_norm": 1.2464927331710682, "learning_rate": 3.4999439574526373e-06, - "loss": 0.1839, + "loss": 0.1851, "step": 4364 }, { "epoch": 0.5971272229822161, - "grad_norm": 1.1387844548953379, + "grad_norm": 1.118195173937635, "learning_rate": 3.4978942483619443e-06, - "loss": 0.1761, + "loss": 0.1756, "step": 4365 }, { "epoch": 0.5972640218878249, - "grad_norm": 1.2331626356659637, + "grad_norm": 1.226751865357267, "learning_rate": 3.495844816708961e-06, - "loss": 0.1835, + "loss": 0.1856, "step": 4366 }, { "epoch": 0.5974008207934337, - "grad_norm": 1.1223201381189678, + "grad_norm": 1.1153230145357895, "learning_rate": 3.4937956628722183e-06, - "loss": 0.155, + "loss": 0.1564, "step": 4367 }, { "epoch": 0.5975376196990424, - "grad_norm": 1.1589205370397146, + "grad_norm": 1.1206437516816092, "learning_rate": 3.4917467872301934e-06, - "loss": 0.1805, + "loss": 0.1754, "step": 4368 }, { "epoch": 0.5976744186046512, - "grad_norm": 1.2993676278411261, + "grad_norm": 1.3004680378642, "learning_rate": 3.4896981901613105e-06, - "loss": 0.1847, + "loss": 0.1834, "step": 4369 }, { "epoch": 0.5978112175102599, - "grad_norm": 1.3528611687271817, + "grad_norm": 1.3504614222525764, "learning_rate": 3.4876498720439457e-06, - "loss": 0.1985, + "loss": 0.1992, "step": 4370 }, { "epoch": 0.5979480164158687, - "grad_norm": 1.0404801007344546, + "grad_norm": 1.0364759384160278, "learning_rate": 3.48560183325642e-06, - "loss": 0.1751, + "loss": 0.1744, "step": 4371 }, { "epoch": 0.5980848153214774, - "grad_norm": 1.1604392302268187, + "grad_norm": 1.1245216111630059, "learning_rate": 3.483554074177006e-06, - "loss": 0.1652, + "loss": 0.162, "step": 4372 }, { "epoch": 0.5982216142270862, - "grad_norm": 1.21316708389669, + "grad_norm": 1.2058195626085888, "learning_rate": 3.48150659518392e-06, - "loss": 0.2005, + "loss": 0.1977, "step": 4373 }, { "epoch": 0.5983584131326949, - "grad_norm": 1.4473150464838043, + "grad_norm": 1.4227426889322854, "learning_rate": 3.4794593966553346e-06, - "loss": 0.1813, + "loss": 0.1807, "step": 4374 }, { "epoch": 0.5984952120383037, - "grad_norm": 1.2103025137250882, + "grad_norm": 1.1799262881574162, "learning_rate": 3.4774124789693615e-06, - "loss": 0.1556, + "loss": 0.1523, "step": 4375 }, { "epoch": 0.5986320109439125, - "grad_norm": 1.3287829221086493, + "grad_norm": 1.3108253376474004, "learning_rate": 3.475365842504066e-06, - "loss": 0.2162, + "loss": 0.2175, "step": 4376 }, { "epoch": 0.5987688098495212, - "grad_norm": 1.2776489445351311, + "grad_norm": 1.271173263134038, "learning_rate": 3.473319487637462e-06, - "loss": 0.217, + "loss": 0.2163, "step": 4377 }, { "epoch": 0.59890560875513, - "grad_norm": 1.117218146802873, + "grad_norm": 1.120271705244754, "learning_rate": 3.471273414747507e-06, - "loss": 0.1438, + "loss": 0.1446, "step": 4378 }, { "epoch": 0.5990424076607387, - "grad_norm": 1.2765167390950198, + "grad_norm": 1.2657410540575114, "learning_rate": 3.46922762421211e-06, - "loss": 0.1814, + "loss": 0.1811, "step": 4379 }, { "epoch": 0.5991792065663475, - "grad_norm": 1.4013361904387764, + "grad_norm": 1.4120844675865236, "learning_rate": 3.467182116409127e-06, - "loss": 0.1499, + "loss": 0.1514, "step": 4380 }, { "epoch": 0.5993160054719562, - "grad_norm": 1.2327368632959856, + "grad_norm": 1.2164105123109055, "learning_rate": 3.465136891716363e-06, - "loss": 0.1771, + "loss": 0.1779, "step": 4381 }, { "epoch": 0.599452804377565, - "grad_norm": 1.1702931769934486, + "grad_norm": 1.1579276994871164, "learning_rate": 3.463091950511567e-06, - "loss": 0.1702, + "loss": 0.1693, "step": 4382 }, { "epoch": 0.5995896032831738, - "grad_norm": 1.2674848435556816, + "grad_norm": 1.2623192484180232, "learning_rate": 3.461047293172437e-06, - "loss": 0.1694, + "loss": 0.1648, "step": 4383 }, { "epoch": 0.5997264021887825, - "grad_norm": 1.3724036823967594, + "grad_norm": 1.3168065027250062, "learning_rate": 3.459002920076625e-06, - "loss": 0.236, + "loss": 0.2329, "step": 4384 }, { "epoch": 0.5998632010943913, - "grad_norm": 1.4628315843565174, + "grad_norm": 1.4382455218276176, "learning_rate": 3.456958831601719e-06, - "loss": 0.2065, + "loss": 0.2066, "step": 4385 }, { "epoch": 0.6, - "grad_norm": 1.1869229111031532, + "grad_norm": 1.1818434912020965, "learning_rate": 3.4549150281252635e-06, - "loss": 0.1828, + "loss": 0.1843, "step": 4386 }, { "epoch": 0.6001367989056088, - "grad_norm": 1.168419514937601, + "grad_norm": 1.1665326201267312, "learning_rate": 3.4528715100247463e-06, - "loss": 0.1911, + "loss": 0.191, "step": 4387 }, { "epoch": 0.6002735978112175, - "grad_norm": 1.1606890579457096, + "grad_norm": 1.158507958796843, "learning_rate": 3.450828277677606e-06, - "loss": 0.2024, + "loss": 0.202, "step": 4388 }, { "epoch": 0.6004103967168263, - "grad_norm": 1.1500422797269347, + "grad_norm": 1.14851347637295, "learning_rate": 3.448785331461222e-06, - "loss": 0.1647, + "loss": 0.1646, "step": 4389 }, { "epoch": 0.600547195622435, - "grad_norm": 1.122364163319269, + "grad_norm": 1.1146581035387206, "learning_rate": 3.4467426717529283e-06, - "loss": 0.1626, + "loss": 0.1633, "step": 4390 }, { "epoch": 0.6006839945280438, - "grad_norm": 1.35533580767613, + "grad_norm": 1.355117114454412, "learning_rate": 3.4447002989299992e-06, - "loss": 0.222, + "loss": 0.2245, "step": 4391 }, { "epoch": 0.6008207934336526, - "grad_norm": 1.5177485671748165, + "grad_norm": 1.4573813440834833, "learning_rate": 3.4426582133696617e-06, - "loss": 0.2297, + "loss": 0.2279, "step": 4392 }, { "epoch": 0.6009575923392613, - "grad_norm": 0.9452385646453261, + "grad_norm": 0.9154904883154588, "learning_rate": 3.440616415449087e-06, - "loss": 0.1365, + "loss": 0.1354, "step": 4393 }, { "epoch": 0.6010943912448701, - "grad_norm": 1.3637014681519959, + "grad_norm": 1.382192119528603, "learning_rate": 3.4385749055453926e-06, - "loss": 0.2135, + "loss": 0.2144, "step": 4394 }, { "epoch": 0.6012311901504788, - "grad_norm": 1.2711736030404461, + "grad_norm": 1.2793999428298957, "learning_rate": 3.436533684035647e-06, - "loss": 0.1716, + "loss": 0.1732, "step": 4395 }, { "epoch": 0.6013679890560876, - "grad_norm": 1.2291527826501398, + "grad_norm": 1.2076264120364995, "learning_rate": 3.4344927512968562e-06, "loss": 0.1863, "step": 4396 }, { "epoch": 0.6015047879616963, - "grad_norm": 1.2819173845203622, + "grad_norm": 1.284716412566227, "learning_rate": 3.4324521077059848e-06, - "loss": 0.1816, + "loss": 0.1822, "step": 4397 }, { "epoch": 0.6016415868673051, - "grad_norm": 1.2239460099992399, + "grad_norm": 1.2197056103894812, "learning_rate": 3.430411753639934e-06, - "loss": 0.2195, + "loss": 0.2192, "step": 4398 }, { "epoch": 0.6017783857729139, - "grad_norm": 1.1427878756800136, + "grad_norm": 1.1245030633308277, "learning_rate": 3.428371689475556e-06, - "loss": 0.1828, + "loss": 0.1806, "step": 4399 }, { "epoch": 0.6019151846785226, - "grad_norm": 1.3774864425493172, + "grad_norm": 1.3641206023642456, "learning_rate": 3.4263319155896514e-06, - "loss": 0.1805, + "loss": 0.1796, "step": 4400 }, { "epoch": 0.6019151846785226, - "eval_loss": 0.18234367668628693, - "eval_runtime": 5.9363, - "eval_samples_per_second": 5.054, - "eval_steps_per_second": 1.348, + "eval_loss": 0.1819891333580017, + "eval_runtime": 5.9228, + "eval_samples_per_second": 5.065, + "eval_steps_per_second": 1.351, "step": 4400 }, { "epoch": 0.6020519835841314, - "grad_norm": 0.9305911688208777, + "grad_norm": 0.9371503750945447, "learning_rate": 3.424292432358963e-06, - "loss": 0.1518, + "loss": 0.1528, "step": 4401 }, { "epoch": 0.6021887824897401, - "grad_norm": 1.257070476235944, + "grad_norm": 1.2293190301775625, "learning_rate": 3.4222532401601816e-06, - "loss": 0.1968, + "loss": 0.1951, "step": 4402 }, { "epoch": 0.6023255813953489, - "grad_norm": 1.1702483711616583, + "grad_norm": 1.1611647222343016, "learning_rate": 3.4202143393699437e-06, - "loss": 0.1661, + "loss": 0.1656, "step": 4403 }, { "epoch": 0.6024623803009576, - "grad_norm": 1.293454732155946, + "grad_norm": 1.2975305232163592, "learning_rate": 3.4181757303648366e-06, - "loss": 0.1969, + "loss": 0.1952, "step": 4404 }, { "epoch": 0.6025991792065664, - "grad_norm": 1.1725437173633098, + "grad_norm": 1.1560304399253016, "learning_rate": 3.4161374135213842e-06, - "loss": 0.1601, + "loss": 0.1614, "step": 4405 }, { "epoch": 0.602735978112175, - "grad_norm": 1.1016617495110645, + "grad_norm": 1.10245614189965, "learning_rate": 3.4140993892160657e-06, "loss": 0.1688, "step": 4406 }, { "epoch": 0.6028727770177839, - "grad_norm": 1.256750386471504, + "grad_norm": 1.2556392516618295, "learning_rate": 3.4120616578253007e-06, - "loss": 0.2007, + "loss": 0.2021, "step": 4407 }, { "epoch": 0.6030095759233927, - "grad_norm": 1.2323423473573938, + "grad_norm": 1.217693151257713, "learning_rate": 3.4100242197254564e-06, - "loss": 0.189, + "loss": 0.1888, "step": 4408 }, { "epoch": 0.6031463748290014, - "grad_norm": 1.1243202048909449, + "grad_norm": 1.107191066489568, "learning_rate": 3.4079870752928477e-06, - "loss": 0.1643, + "loss": 0.1645, "step": 4409 }, { "epoch": 0.6032831737346102, - "grad_norm": 1.3144133733606589, + "grad_norm": 1.3074544945623268, "learning_rate": 3.4059502249037324e-06, - "loss": 0.1695, + "loss": 0.1706, "step": 4410 }, { "epoch": 0.6034199726402188, - "grad_norm": 1.1783201115159934, + "grad_norm": 1.1513945504555607, "learning_rate": 3.4039136689343165e-06, - "loss": 0.1493, + "loss": 0.1497, "step": 4411 }, { "epoch": 0.6035567715458277, - "grad_norm": 1.2687949051499647, + "grad_norm": 1.2543574612181372, "learning_rate": 3.4018774077607477e-06, - "loss": 0.1789, + "loss": 0.1779, "step": 4412 }, { "epoch": 0.6036935704514363, - "grad_norm": 1.2934759666749482, + "grad_norm": 1.2866585015645986, "learning_rate": 3.399841441759124e-06, - "loss": 0.1799, + "loss": 0.179, "step": 4413 }, { "epoch": 0.6038303693570451, - "grad_norm": 1.2825660926134783, + "grad_norm": 1.2607040120515984, "learning_rate": 3.3978057713054847e-06, - "loss": 0.1766, + "loss": 0.1761, "step": 4414 }, { "epoch": 0.603967168262654, - "grad_norm": 1.188546631416445, + "grad_norm": 1.1738003722218913, "learning_rate": 3.3957703967758173e-06, "loss": 0.1988, "step": 4415 }, { "epoch": 0.6041039671682626, - "grad_norm": 1.0176092939882786, + "grad_norm": 1.019281678637119, "learning_rate": 3.393735318546055e-06, - "loss": 0.1831, + "loss": 0.1843, "step": 4416 }, { "epoch": 0.6042407660738714, - "grad_norm": 1.4105902136922583, + "grad_norm": 1.400827541973673, "learning_rate": 3.391700536992073e-06, - "loss": 0.2212, + "loss": 0.2215, "step": 4417 }, { "epoch": 0.6043775649794801, - "grad_norm": 1.7540866732350784, + "grad_norm": 1.7797550411523142, "learning_rate": 3.389666052489697e-06, - "loss": 0.2434, + "loss": 0.2462, "step": 4418 }, { "epoch": 0.604514363885089, - "grad_norm": 1.2367590222484655, + "grad_norm": 1.2270747421900803, "learning_rate": 3.3876318654146907e-06, - "loss": 0.1858, + "loss": 0.1852, "step": 4419 }, { "epoch": 0.6046511627906976, - "grad_norm": 1.2704899377617815, + "grad_norm": 1.255128920178736, "learning_rate": 3.3855979761427705e-06, - "loss": 0.1799, + "loss": 0.1807, "step": 4420 }, { "epoch": 0.6047879616963064, - "grad_norm": 1.1783988019647718, + "grad_norm": 1.134309843678942, "learning_rate": 3.3835643850495904e-06, - "loss": 0.1867, + "loss": 0.1839, "step": 4421 }, { "epoch": 0.6049247606019151, - "grad_norm": 1.2681479112757195, + "grad_norm": 1.280014978934906, "learning_rate": 3.3815310925107556e-06, - "loss": 0.1827, + "loss": 0.1837, "step": 4422 }, { "epoch": 0.6050615595075239, - "grad_norm": 1.522461803792257, + "grad_norm": 1.5098738557365539, "learning_rate": 3.3794980989018124e-06, - "loss": 0.2515, + "loss": 0.2535, "step": 4423 }, { "epoch": 0.6051983584131327, - "grad_norm": 1.4434214141470356, + "grad_norm": 1.4439862761732998, "learning_rate": 3.3774654045982536e-06, - "loss": 0.2205, + "loss": 0.2217, "step": 4424 }, { "epoch": 0.6053351573187414, - "grad_norm": 1.322904376765344, + "grad_norm": 1.298769758225511, "learning_rate": 3.3754330099755178e-06, - "loss": 0.1982, + "loss": 0.1988, "step": 4425 }, { "epoch": 0.6054719562243502, - "grad_norm": 1.618877462085783, + "grad_norm": 1.6169273267714304, "learning_rate": 3.373400915408983e-06, - "loss": 0.205, + "loss": 0.2049, "step": 4426 }, { "epoch": 0.6056087551299589, - "grad_norm": 1.2787208829850558, + "grad_norm": 1.2563589723737798, "learning_rate": 3.3713691212739797e-06, - "loss": 0.1993, + "loss": 0.2002, "step": 4427 }, { "epoch": 0.6057455540355677, - "grad_norm": 1.1466190017580262, + "grad_norm": 1.1393467723263082, "learning_rate": 3.369337627945774e-06, - "loss": 0.1709, + "loss": 0.1703, "step": 4428 }, { "epoch": 0.6058823529411764, - "grad_norm": 1.2677899449315135, + "grad_norm": 1.265472543718632, "learning_rate": 3.3673064357995844e-06, - "loss": 0.1875, + "loss": 0.1869, "step": 4429 }, { "epoch": 0.6060191518467852, - "grad_norm": 1.0288595733033261, + "grad_norm": 1.02864197170042, "learning_rate": 3.3652755452105685e-06, - "loss": 0.1805, + "loss": 0.1813, "step": 4430 }, { "epoch": 0.606155950752394, - "grad_norm": 1.3857974961468864, + "grad_norm": 1.351593014205259, "learning_rate": 3.363244956553831e-06, - "loss": 0.2323, + "loss": 0.2309, "step": 4431 }, { "epoch": 0.6062927496580027, - "grad_norm": 0.944274608046234, + "grad_norm": 0.9453142112150466, "learning_rate": 3.3612146702044223e-06, - "loss": 0.1285, + "loss": 0.1289, "step": 4432 }, { "epoch": 0.6064295485636115, - "grad_norm": 1.3927041395762447, + "grad_norm": 1.3625900046095065, "learning_rate": 3.3591846865373303e-06, - "loss": 0.2319, + "loss": 0.2318, "step": 4433 }, { "epoch": 0.6065663474692202, - "grad_norm": 1.1657125150498104, + "grad_norm": 1.1601643708531646, "learning_rate": 3.3571550059274947e-06, - "loss": 0.2012, + "loss": 0.2013, "step": 4434 }, { "epoch": 0.606703146374829, - "grad_norm": 1.1405686122177787, + "grad_norm": 1.1479345038447302, "learning_rate": 3.3551256287497936e-06, - "loss": 0.1555, + "loss": 0.1582, "step": 4435 }, { "epoch": 0.6068399452804377, - "grad_norm": 1.172410512809746, + "grad_norm": 1.1639482435383672, "learning_rate": 3.3530965553790528e-06, - "loss": 0.1835, + "loss": 0.183, "step": 4436 }, { "epoch": 0.6069767441860465, - "grad_norm": 1.0812124141699706, + "grad_norm": 1.0887376524151922, "learning_rate": 3.351067786190039e-06, - "loss": 0.1423, + "loss": 0.1438, "step": 4437 }, { "epoch": 0.6071135430916552, - "grad_norm": 1.4977476219860806, + "grad_norm": 1.5340074235957417, "learning_rate": 3.3490393215574674e-06, - "loss": 0.1941, + "loss": 0.197, "step": 4438 }, { "epoch": 0.607250341997264, - "grad_norm": 1.3782518793918692, + "grad_norm": 1.3785506189047345, "learning_rate": 3.347011161855989e-06, - "loss": 0.2002, + "loss": 0.2018, "step": 4439 }, { "epoch": 0.6073871409028728, - "grad_norm": 1.4333238831655004, + "grad_norm": 1.4208590801486805, "learning_rate": 3.344983307460207e-06, - "loss": 0.19, + "loss": 0.1906, "step": 4440 }, { "epoch": 0.6075239398084815, - "grad_norm": 1.1144866110202936, + "grad_norm": 1.1096165672013274, "learning_rate": 3.3429557587446638e-06, - "loss": 0.1695, + "loss": 0.1704, "step": 4441 }, { "epoch": 0.6076607387140903, - "grad_norm": 1.4062551304808082, + "grad_norm": 1.3909766765590452, "learning_rate": 3.3409285160838446e-06, - "loss": 0.1931, + "loss": 0.1944, "step": 4442 }, { "epoch": 0.607797537619699, - "grad_norm": 1.4342387723427263, + "grad_norm": 1.4084538074799273, "learning_rate": 3.338901579852182e-06, - "loss": 0.2141, + "loss": 0.2147, "step": 4443 }, { "epoch": 0.6079343365253078, - "grad_norm": 1.2547951153438444, + "grad_norm": 1.2492572386356153, "learning_rate": 3.336874950424046e-06, - "loss": 0.1687, + "loss": 0.1673, "step": 4444 }, { "epoch": 0.6080711354309165, - "grad_norm": 1.4205171562910583, + "grad_norm": 1.4047340582188175, "learning_rate": 3.3348486281737557e-06, - "loss": 0.1634, + "loss": 0.1644, "step": 4445 }, { "epoch": 0.6082079343365253, - "grad_norm": 1.3799160579881014, + "grad_norm": 1.3636994581807442, "learning_rate": 3.3328226134755703e-06, - "loss": 0.2269, + "loss": 0.2263, "step": 4446 }, { "epoch": 0.6083447332421341, - "grad_norm": 1.2547439423171673, + "grad_norm": 1.19167334117971, "learning_rate": 3.330796906703693e-06, - "loss": 0.1832, + "loss": 0.1796, "step": 4447 }, { "epoch": 0.6084815321477428, - "grad_norm": 1.2271003372907339, + "grad_norm": 1.2152120314977144, "learning_rate": 3.328771508232273e-06, - "loss": 0.1753, + "loss": 0.175, "step": 4448 }, { "epoch": 0.6086183310533516, - "grad_norm": 1.1914121415635879, + "grad_norm": 1.1942812943934866, "learning_rate": 3.3267464184353956e-06, - "loss": 0.1895, + "loss": 0.1891, "step": 4449 }, { "epoch": 0.6087551299589603, - "grad_norm": 1.2559953140185862, + "grad_norm": 1.2533257381016034, "learning_rate": 3.3247216376870968e-06, - "loss": 0.204, + "loss": 0.2044, "step": 4450 }, { "epoch": 0.6088919288645691, - "grad_norm": 1.0747867280561458, + "grad_norm": 1.0623932365484465, "learning_rate": 3.322697166361348e-06, - "loss": 0.168, + "loss": 0.1691, "step": 4451 }, { "epoch": 0.6090287277701778, - "grad_norm": 1.2618406677428453, + "grad_norm": 1.244609310642294, "learning_rate": 3.320673004832071e-06, - "loss": 0.1983, + "loss": 0.1976, "step": 4452 }, { "epoch": 0.6091655266757866, - "grad_norm": 1.3592584408386816, + "grad_norm": 1.3710304877857498, "learning_rate": 3.318649153473125e-06, - "loss": 0.2367, + "loss": 0.2374, "step": 4453 }, { "epoch": 0.6093023255813953, - "grad_norm": 1.1346242737969439, + "grad_norm": 1.118219435584332, "learning_rate": 3.316625612658315e-06, - "loss": 0.1618, + "loss": 0.1601, "step": 4454 }, { "epoch": 0.6094391244870041, - "grad_norm": 1.333644022338903, + "grad_norm": 1.3238671954109353, "learning_rate": 3.3146023827613847e-06, - "loss": 0.2098, + "loss": 0.2125, "step": 4455 }, { "epoch": 0.6095759233926129, - "grad_norm": 1.2118333086031932, + "grad_norm": 1.2040418652545162, "learning_rate": 3.312579464156025e-06, - "loss": 0.1739, + "loss": 0.1731, "step": 4456 }, { "epoch": 0.6097127222982216, - "grad_norm": 1.5792510822532304, + "grad_norm": 1.5633841815966358, "learning_rate": 3.3105568572158695e-06, - "loss": 0.2121, + "loss": 0.2098, "step": 4457 }, { "epoch": 0.6098495212038304, - "grad_norm": 1.1338596295438375, + "grad_norm": 1.1321916380507706, "learning_rate": 3.3085345623144872e-06, - "loss": 0.1941, + "loss": 0.1932, "step": 4458 }, { "epoch": 0.6099863201094391, - "grad_norm": 1.249951146784092, + "grad_norm": 1.2525480215379396, "learning_rate": 3.3065125798253985e-06, - "loss": 0.2012, + "loss": 0.2019, "step": 4459 }, { "epoch": 0.6101231190150479, - "grad_norm": 1.2966857941892944, + "grad_norm": 1.3709668787577, "learning_rate": 3.304490910122058e-06, - "loss": 0.2018, + "loss": 0.203, "step": 4460 }, { "epoch": 0.6102599179206566, - "grad_norm": 1.105029111163702, + "grad_norm": 1.1080680664292393, "learning_rate": 3.302469553577872e-06, - "loss": 0.175, + "loss": 0.1763, "step": 4461 }, { "epoch": 0.6103967168262654, - "grad_norm": 1.2336397622238016, + "grad_norm": 1.2302922167225472, "learning_rate": 3.3004485105661765e-06, - "loss": 0.2102, + "loss": 0.2109, "step": 4462 }, { "epoch": 0.6105335157318742, - "grad_norm": 1.138213501161154, + "grad_norm": 1.1451125212613813, "learning_rate": 3.2984277814602615e-06, - "loss": 0.1896, + "loss": 0.1918, "step": 4463 }, { "epoch": 0.6106703146374829, - "grad_norm": 1.1780242832315082, + "grad_norm": 1.1611342210091284, "learning_rate": 3.296407366633354e-06, - "loss": 0.164, + "loss": 0.1623, "step": 4464 }, { "epoch": 0.6108071135430917, - "grad_norm": 1.3639990235677755, + "grad_norm": 1.3444011760557517, "learning_rate": 3.2943872664586196e-06, - "loss": 0.1773, + "loss": 0.1722, "step": 4465 }, { "epoch": 0.6109439124487004, - "grad_norm": 1.6780202180013293, + "grad_norm": 1.6563875052282708, "learning_rate": 3.2923674813091722e-06, - "loss": 0.2404, + "loss": 0.2398, "step": 4466 }, { "epoch": 0.6110807113543092, - "grad_norm": 1.129646918956515, + "grad_norm": 1.1308540565848693, "learning_rate": 3.290348011558063e-06, - "loss": 0.1795, + "loss": 0.18, "step": 4467 }, { "epoch": 0.6112175102599179, - "grad_norm": 1.4122172039821896, + "grad_norm": 1.4360381922636534, "learning_rate": 3.288328857578288e-06, - "loss": 0.2017, + "loss": 0.2063, "step": 4468 }, { "epoch": 0.6113543091655267, - "grad_norm": 1.2152033284217476, + "grad_norm": 1.2225558437187147, "learning_rate": 3.2863100197427804e-06, - "loss": 0.1754, + "loss": 0.1744, "step": 4469 }, { "epoch": 0.6114911080711354, - "grad_norm": 1.3340131621585265, + "grad_norm": 1.3269769573834662, "learning_rate": 3.284291498424422e-06, - "loss": 0.2068, + "loss": 0.2074, "step": 4470 }, { "epoch": 0.6116279069767442, - "grad_norm": 1.2910733486918369, + "grad_norm": 1.2915431255063343, "learning_rate": 3.282273293996027e-06, - "loss": 0.1664, + "loss": 0.1661, "step": 4471 }, { "epoch": 0.611764705882353, - "grad_norm": 1.1512341289237735, + "grad_norm": 1.208440490234117, "learning_rate": 3.2802554068303595e-06, - "loss": 0.1526, + "loss": 0.1528, "step": 4472 }, { "epoch": 0.6119015047879617, - "grad_norm": 1.0142896995642852, + "grad_norm": 1.011101633788701, "learning_rate": 3.2782378373001217e-06, - "loss": 0.1479, + "loss": 0.1493, "step": 4473 }, { "epoch": 0.6120383036935705, - "grad_norm": 1.125142918830682, + "grad_norm": 1.1004394600059435, "learning_rate": 3.2762205857779562e-06, - "loss": 0.2109, + "loss": 0.2113, "step": 4474 }, { "epoch": 0.6121751025991792, - "grad_norm": 1.1478222314878652, + "grad_norm": 1.1641688566140205, "learning_rate": 3.274203652636448e-06, - "loss": 0.1629, + "loss": 0.1637, "step": 4475 }, { "epoch": 0.612311901504788, - "grad_norm": 1.292065265774064, + "grad_norm": 1.2802804176532332, "learning_rate": 3.2721870382481213e-06, - "loss": 0.2088, + "loss": 0.2074, "step": 4476 }, { "epoch": 0.6124487004103967, - "grad_norm": 1.1938272094800755, + "grad_norm": 1.1774710566190292, "learning_rate": 3.2701707429854475e-06, - "loss": 0.1664, + "loss": 0.1671, "step": 4477 }, { "epoch": 0.6125854993160055, - "grad_norm": 1.1411168522947586, + "grad_norm": 1.1337285239984278, "learning_rate": 3.268154767220829e-06, - "loss": 0.1709, + "loss": 0.1707, "step": 4478 }, { "epoch": 0.6127222982216143, - "grad_norm": 1.201361636970191, + "grad_norm": 1.1462722415052922, "learning_rate": 3.2661391113266185e-06, - "loss": 0.1793, + "loss": 0.176, "step": 4479 }, { "epoch": 0.612859097127223, - "grad_norm": 1.1954462491701479, + "grad_norm": 1.1921608596863258, "learning_rate": 3.2641237756751066e-06, - "loss": 0.1713, + "loss": 0.1714, "step": 4480 }, { "epoch": 0.6129958960328318, - "grad_norm": 1.3474048884703531, + "grad_norm": 1.3336623645031356, "learning_rate": 3.262108760638521e-06, - "loss": 0.1951, + "loss": 0.1956, "step": 4481 }, { "epoch": 0.6131326949384405, - "grad_norm": 1.2983628288631772, + "grad_norm": 1.285281863100525, "learning_rate": 3.2600940665890362e-06, - "loss": 0.1931, + "loss": 0.1934, "step": 4482 }, { "epoch": 0.6132694938440493, - "grad_norm": 1.2821250341771042, + "grad_norm": 1.2599316060803953, "learning_rate": 3.2580796938987626e-06, - "loss": 0.2244, + "loss": 0.2261, "step": 4483 }, { "epoch": 0.613406292749658, - "grad_norm": 1.1365754454157648, + "grad_norm": 1.1169161672222248, "learning_rate": 3.2560656429397565e-06, - "loss": 0.1715, + "loss": 0.1699, "step": 4484 }, { "epoch": 0.6135430916552668, - "grad_norm": 1.2103789059408623, + "grad_norm": 1.1857985140011145, "learning_rate": 3.2540519140840064e-06, - "loss": 0.1831, + "loss": 0.1833, "step": 4485 }, { "epoch": 0.6136798905608755, - "grad_norm": 1.198023612887233, + "grad_norm": 1.1962975660216437, "learning_rate": 3.252038507703451e-06, - "loss": 0.1793, + "loss": 0.1801, "step": 4486 }, { "epoch": 0.6138166894664843, - "grad_norm": 1.2565341202009028, + "grad_norm": 1.2290079299494385, "learning_rate": 3.2500254241699615e-06, - "loss": 0.1633, + "loss": 0.1613, "step": 4487 }, { "epoch": 0.6139534883720931, - "grad_norm": 1.6936720311108, + "grad_norm": 1.662610920465677, "learning_rate": 3.2480126638553533e-06, - "loss": 0.2641, + "loss": 0.2569, "step": 4488 }, { "epoch": 0.6140902872777018, - "grad_norm": 1.290896268050563, + "grad_norm": 1.272645310749474, "learning_rate": 3.246000227131384e-06, - "loss": 0.1847, + "loss": 0.1839, "step": 4489 }, { "epoch": 0.6142270861833106, - "grad_norm": 1.261482804925887, + "grad_norm": 1.2575273325441536, "learning_rate": 3.2439881143697465e-06, - "loss": 0.1655, + "loss": 0.1653, "step": 4490 }, { "epoch": 0.6143638850889193, - "grad_norm": 1.5595142406749167, + "grad_norm": 1.4275973523954462, "learning_rate": 3.2419763259420794e-06, - "loss": 0.2065, + "loss": 0.2039, "step": 4491 }, { "epoch": 0.6145006839945281, - "grad_norm": 1.4611254007272187, + "grad_norm": 1.4356185346521535, "learning_rate": 3.2399648622199543e-06, - "loss": 0.2054, + "loss": 0.2032, "step": 4492 }, { "epoch": 0.6146374829001368, - "grad_norm": 1.2226199033326521, + "grad_norm": 1.216265051963094, "learning_rate": 3.2379537235748915e-06, - "loss": 0.1808, + "loss": 0.1828, "step": 4493 }, { "epoch": 0.6147742818057456, - "grad_norm": 1.156589420869323, + "grad_norm": 1.125726165640486, "learning_rate": 3.2359429103783415e-06, - "loss": 0.1628, + "loss": 0.1596, "step": 4494 }, { "epoch": 0.6149110807113544, - "grad_norm": 1.28696862224344, + "grad_norm": 1.2830686629449952, "learning_rate": 3.233932423001702e-06, - "loss": 0.203, + "loss": 0.204, "step": 4495 }, { "epoch": 0.6150478796169631, - "grad_norm": 1.1345902769436318, + "grad_norm": 1.1263288598333012, "learning_rate": 3.231922261816311e-06, - "loss": 0.1975, + "loss": 0.1995, "step": 4496 }, { "epoch": 0.6151846785225719, - "grad_norm": 1.408194789071063, + "grad_norm": 1.3967550143290706, "learning_rate": 3.22991242719344e-06, - "loss": 0.1981, + "loss": 0.1986, "step": 4497 }, { "epoch": 0.6153214774281806, - "grad_norm": 1.4980687166089373, + "grad_norm": 1.465852087710738, "learning_rate": 3.2279029195043075e-06, - "loss": 0.2647, + "loss": 0.2602, "step": 4498 }, { "epoch": 0.6154582763337894, - "grad_norm": 1.1353909762389467, + "grad_norm": 1.126755659017061, "learning_rate": 3.225893739120064e-06, - "loss": 0.1527, + "loss": 0.1528, "step": 4499 }, { "epoch": 0.615595075239398, - "grad_norm": 1.2666415858043463, + "grad_norm": 1.2652427956735388, "learning_rate": 3.2238848864118073e-06, - "loss": 0.1904, + "loss": 0.1925, "step": 4500 }, { "epoch": 0.615595075239398, - "eval_loss": 0.18099480867385864, - "eval_runtime": 5.9067, - "eval_samples_per_second": 5.079, - "eval_steps_per_second": 1.354, + "eval_loss": 0.1807713806629181, + "eval_runtime": 5.9293, + "eval_samples_per_second": 5.06, + "eval_steps_per_second": 1.349, "step": 4500 }, { "epoch": 0.6157318741450069, - "grad_norm": 0.9494578926653328, + "grad_norm": 0.9373894330884135, "learning_rate": 3.2218763617505666e-06, - "loss": 0.1682, + "loss": 0.1691, "step": 4501 }, { "epoch": 0.6158686730506155, - "grad_norm": 1.1606572963746866, + "grad_norm": 1.1448286140457191, "learning_rate": 3.2198681655073184e-06, - "loss": 0.1622, + "loss": 0.1621, "step": 4502 }, { "epoch": 0.6160054719562243, - "grad_norm": 1.3491834736170312, + "grad_norm": 1.3233915424111102, "learning_rate": 3.2178602980529717e-06, - "loss": 0.2298, + "loss": 0.2297, "step": 4503 }, { "epoch": 0.6161422708618332, - "grad_norm": 1.137521509065, + "grad_norm": 1.125538866038847, "learning_rate": 3.2158527597583815e-06, - "loss": 0.1555, + "loss": 0.1549, "step": 4504 }, { "epoch": 0.6162790697674418, - "grad_norm": 1.2384001862175364, + "grad_norm": 1.2449859713233495, "learning_rate": 3.2138455509943365e-06, - "loss": 0.1926, + "loss": 0.1961, "step": 4505 }, { "epoch": 0.6164158686730506, - "grad_norm": 1.248422177306264, + "grad_norm": 1.2256733054055797, "learning_rate": 3.2118386721315662e-06, - "loss": 0.2156, + "loss": 0.2174, "step": 4506 }, { "epoch": 0.6165526675786593, - "grad_norm": 1.2252876208116057, + "grad_norm": 1.2248921103098795, "learning_rate": 3.209832123540742e-06, - "loss": 0.1786, + "loss": 0.1807, "step": 4507 }, { "epoch": 0.6166894664842681, - "grad_norm": 1.327196727788607, + "grad_norm": 1.3360426648412325, "learning_rate": 3.2078259055924676e-06, - "loss": 0.2048, + "loss": 0.2082, "step": 4508 }, { "epoch": 0.6168262653898768, - "grad_norm": 1.3123797851920274, + "grad_norm": 1.3002304845911665, "learning_rate": 3.205820018657294e-06, - "loss": 0.1686, + "loss": 0.1684, "step": 4509 }, { "epoch": 0.6169630642954856, - "grad_norm": 1.1987357956518334, + "grad_norm": 1.1889739121876763, "learning_rate": 3.2038144631057044e-06, - "loss": 0.1959, + "loss": 0.1967, "step": 4510 }, { "epoch": 0.6170998632010944, - "grad_norm": 1.3413205274773436, + "grad_norm": 1.3250586698392226, "learning_rate": 3.2018092393081236e-06, - "loss": 0.225, + "loss": 0.2232, "step": 4511 }, { "epoch": 0.6172366621067031, - "grad_norm": 1.1427599364856835, + "grad_norm": 1.135259841352454, "learning_rate": 3.199804347634915e-06, - "loss": 0.1725, + "loss": 0.1727, "step": 4512 }, { "epoch": 0.6173734610123119, - "grad_norm": 1.190846929987842, + "grad_norm": 1.146612461580772, "learning_rate": 3.1977997884563807e-06, - "loss": 0.1869, + "loss": 0.1862, "step": 4513 }, { "epoch": 0.6175102599179206, - "grad_norm": 1.1524685335530869, + "grad_norm": 1.1557400835042686, "learning_rate": 3.195795562142762e-06, - "loss": 0.1638, + "loss": 0.1648, "step": 4514 }, { "epoch": 0.6176470588235294, - "grad_norm": 1.238179547006115, + "grad_norm": 1.2233051610121408, "learning_rate": 3.1937916690642356e-06, - "loss": 0.2131, + "loss": 0.213, "step": 4515 }, { "epoch": 0.6177838577291381, - "grad_norm": 1.1253604539842867, + "grad_norm": 1.1198089826978372, "learning_rate": 3.191788109590922e-06, - "loss": 0.163, + "loss": 0.1622, "step": 4516 }, { "epoch": 0.6179206566347469, - "grad_norm": 1.2799427742750205, + "grad_norm": 1.2763644082830112, "learning_rate": 3.1897848840928735e-06, - "loss": 0.194, + "loss": 0.1942, "step": 4517 }, { "epoch": 0.6180574555403556, - "grad_norm": 1.2035343659589535, + "grad_norm": 1.1918716540534293, "learning_rate": 3.187781992940087e-06, - "loss": 0.1704, + "loss": 0.1718, "step": 4518 }, { "epoch": 0.6181942544459644, - "grad_norm": 1.1861128435174588, + "grad_norm": 1.1668079761990056, "learning_rate": 3.1857794365024926e-06, - "loss": 0.1649, + "loss": 0.1648, "step": 4519 }, { "epoch": 0.6183310533515732, - "grad_norm": 1.2267594058219626, + "grad_norm": 1.218600798723385, "learning_rate": 3.183777215149962e-06, - "loss": 0.1928, + "loss": 0.1913, "step": 4520 }, { "epoch": 0.6184678522571819, - "grad_norm": 1.0677396546166622, + "grad_norm": 1.0567243582940136, "learning_rate": 3.181775329252306e-06, - "loss": 0.1636, + "loss": 0.1637, "step": 4521 }, { "epoch": 0.6186046511627907, - "grad_norm": 1.2334739959404128, + "grad_norm": 1.238047240787748, "learning_rate": 3.1797737791792672e-06, - "loss": 0.1965, + "loss": 0.1968, "step": 4522 }, { "epoch": 0.6187414500683994, - "grad_norm": 1.3880812203742698, + "grad_norm": 1.3781044838927183, "learning_rate": 3.1777725653005347e-06, - "loss": 0.2252, + "loss": 0.2251, "step": 4523 }, { "epoch": 0.6188782489740082, - "grad_norm": 1.3856517600551215, + "grad_norm": 1.371943940051074, "learning_rate": 3.1757716879857266e-06, - "loss": 0.2191, + "loss": 0.2201, "step": 4524 }, { "epoch": 0.6190150478796169, - "grad_norm": 1.5359244705070292, + "grad_norm": 1.5258561933476171, "learning_rate": 3.1737711476044063e-06, - "loss": 0.2273, + "loss": 0.2251, "step": 4525 }, { "epoch": 0.6191518467852257, - "grad_norm": 1.2232347183650916, + "grad_norm": 1.2152356049099406, "learning_rate": 3.17177094452607e-06, - "loss": 0.1978, + "loss": 0.1992, "step": 4526 }, { "epoch": 0.6192886456908345, - "grad_norm": 1.0009159764518516, + "grad_norm": 0.9803917384268451, "learning_rate": 3.1697710791201545e-06, - "loss": 0.1689, + "loss": 0.1675, "step": 4527 }, { "epoch": 0.6194254445964432, - "grad_norm": 1.3701297837954474, + "grad_norm": 1.349785285503275, "learning_rate": 3.167771551756036e-06, - "loss": 0.2245, + "loss": 0.2223, "step": 4528 }, { "epoch": 0.619562243502052, - "grad_norm": 1.50682266583297, + "grad_norm": 1.4946218503009665, "learning_rate": 3.165772362803021e-06, - "loss": 0.2003, + "loss": 0.2029, "step": 4529 }, { "epoch": 0.6196990424076607, - "grad_norm": 1.0715247295842518, + "grad_norm": 1.0530844616583088, "learning_rate": 3.1637735126303616e-06, - "loss": 0.1756, + "loss": 0.1744, "step": 4530 }, { "epoch": 0.6198358413132695, - "grad_norm": 1.0480850877171843, + "grad_norm": 1.0329083270417347, "learning_rate": 3.1617750016072407e-06, - "loss": 0.1568, + "loss": 0.1574, "step": 4531 }, { "epoch": 0.6199726402188782, - "grad_norm": 1.391511225991808, + "grad_norm": 1.362023923364628, "learning_rate": 3.1597768301027843e-06, - "loss": 0.1856, + "loss": 0.1861, "step": 4532 }, { "epoch": 0.620109439124487, - "grad_norm": 1.4384133445294467, + "grad_norm": 1.405188492751111, "learning_rate": 3.1577789984860507e-06, - "loss": 0.2087, + "loss": 0.208, "step": 4533 }, { "epoch": 0.6202462380300957, - "grad_norm": 1.3756960652495265, + "grad_norm": 1.3588231360360294, "learning_rate": 3.1557815071260418e-06, - "loss": 0.2022, + "loss": 0.2018, "step": 4534 }, { "epoch": 0.6203830369357045, - "grad_norm": 1.456978239974743, + "grad_norm": 1.4438384867337364, "learning_rate": 3.1537843563916877e-06, - "loss": 0.1763, + "loss": 0.1765, "step": 4535 }, { "epoch": 0.6205198358413133, - "grad_norm": 1.2858591588357529, + "grad_norm": 1.2605495480422484, "learning_rate": 3.1517875466518626e-06, - "loss": 0.163, + "loss": 0.1616, "step": 4536 }, { "epoch": 0.620656634746922, - "grad_norm": 1.5715111817504543, + "grad_norm": 1.574958829866131, "learning_rate": 3.1497910782753784e-06, - "loss": 0.2382, + "loss": 0.2396, "step": 4537 }, { "epoch": 0.6207934336525308, - "grad_norm": 1.1780210153499808, + "grad_norm": 1.1856211374033794, "learning_rate": 3.1477949516309758e-06, - "loss": 0.1752, + "loss": 0.1772, "step": 4538 }, { "epoch": 0.6209302325581395, - "grad_norm": 1.158926898940046, + "grad_norm": 1.1664315430651384, "learning_rate": 3.1457991670873423e-06, - "loss": 0.1621, + "loss": 0.1615, "step": 4539 }, { "epoch": 0.6210670314637483, - "grad_norm": 1.130715927462296, + "grad_norm": 1.130134558699985, "learning_rate": 3.1438037250130944e-06, - "loss": 0.1636, + "loss": 0.1626, "step": 4540 }, { "epoch": 0.621203830369357, - "grad_norm": 1.2362949200714732, + "grad_norm": 1.2330074488807106, "learning_rate": 3.1418086257767923e-06, - "loss": 0.1908, + "loss": 0.1899, "step": 4541 }, { "epoch": 0.6213406292749658, - "grad_norm": 1.1960749382944378, + "grad_norm": 1.1879471090726215, "learning_rate": 3.139813869746925e-06, - "loss": 0.1591, + "loss": 0.1593, "step": 4542 }, { "epoch": 0.6214774281805746, - "grad_norm": 1.2893204503117728, + "grad_norm": 1.2865053919694784, "learning_rate": 3.1378194572919253e-06, - "loss": 0.1647, + "loss": 0.1674, "step": 4543 }, { "epoch": 0.6216142270861833, - "grad_norm": 1.3386758454317007, + "grad_norm": 1.3316167963287768, "learning_rate": 3.135825388780159e-06, - "loss": 0.1997, + "loss": 0.201, "step": 4544 }, { "epoch": 0.6217510259917921, - "grad_norm": 1.426552702232484, + "grad_norm": 1.435340519965341, "learning_rate": 3.1338316645799284e-06, - "loss": 0.2097, + "loss": 0.2115, "step": 4545 }, { "epoch": 0.6218878248974008, - "grad_norm": 1.3755904126615968, + "grad_norm": 1.3584655016154377, "learning_rate": 3.131838285059474e-06, - "loss": 0.2171, + "loss": 0.2149, "step": 4546 }, { "epoch": 0.6220246238030096, - "grad_norm": 1.2457950876280401, + "grad_norm": 1.229845595154304, "learning_rate": 3.129845250586968e-06, - "loss": 0.19, + "loss": 0.1903, "step": 4547 }, { "epoch": 0.6221614227086183, - "grad_norm": 1.1389548607289173, + "grad_norm": 1.1286566934079494, "learning_rate": 3.1278525615305267e-06, - "loss": 0.167, + "loss": 0.1677, "step": 4548 }, { "epoch": 0.6222982216142271, - "grad_norm": 1.2832371229884454, + "grad_norm": 1.2597972142329565, "learning_rate": 3.125860218258194e-06, - "loss": 0.1891, + "loss": 0.1887, "step": 4549 }, { "epoch": 0.6224350205198358, - "grad_norm": 1.5090019955843217, + "grad_norm": 1.4921354521795231, "learning_rate": 3.123868221137959e-06, - "loss": 0.2194, + "loss": 0.2181, "step": 4550 }, { "epoch": 0.6225718194254446, - "grad_norm": 1.2251674657768015, + "grad_norm": 1.249562430120731, "learning_rate": 3.1218765705377364e-06, - "loss": 0.1647, + "loss": 0.1638, "step": 4551 }, { "epoch": 0.6227086183310534, - "grad_norm": 1.3868727886199546, + "grad_norm": 1.3644982839793358, "learning_rate": 3.1198852668253854e-06, - "loss": 0.2094, + "loss": 0.2099, "step": 4552 }, { "epoch": 0.6228454172366621, - "grad_norm": 1.3037051958518486, + "grad_norm": 1.2901675037928308, "learning_rate": 3.117894310368699e-06, - "loss": 0.207, + "loss": 0.2044, "step": 4553 }, { "epoch": 0.6229822161422709, - "grad_norm": 1.2311487874727234, + "grad_norm": 1.2101235497399436, "learning_rate": 3.115903701535403e-06, - "loss": 0.1697, + "loss": 0.1691, "step": 4554 }, { "epoch": 0.6231190150478796, - "grad_norm": 1.2121111689426491, + "grad_norm": 1.196107002578476, "learning_rate": 3.113913440693164e-06, - "loss": 0.1695, + "loss": 0.1701, "step": 4555 }, { "epoch": 0.6232558139534884, - "grad_norm": 1.3737675843709458, + "grad_norm": 1.3530719159835494, "learning_rate": 3.111923528209577e-06, - "loss": 0.1991, + "loss": 0.1985, "step": 4556 }, { "epoch": 0.6233926128590971, - "grad_norm": 1.434775739011739, + "grad_norm": 1.4305020741907906, "learning_rate": 3.1099339644521843e-06, - "loss": 0.2484, + "loss": 0.2504, "step": 4557 }, { "epoch": 0.6235294117647059, - "grad_norm": 1.3809525376075509, + "grad_norm": 1.3465269646961062, "learning_rate": 3.107944749788449e-06, - "loss": 0.1926, + "loss": 0.1912, "step": 4558 }, { "epoch": 0.6236662106703147, - "grad_norm": 1.4845409386590764, + "grad_norm": 1.4916521280474886, "learning_rate": 3.1059558845857818e-06, - "loss": 0.2394, + "loss": 0.2433, "step": 4559 }, { "epoch": 0.6238030095759234, - "grad_norm": 1.3521091160532088, + "grad_norm": 1.3381397231981358, "learning_rate": 3.1039673692115253e-06, - "loss": 0.2123, + "loss": 0.2117, "step": 4560 }, { "epoch": 0.6239398084815322, - "grad_norm": 1.483273196012537, + "grad_norm": 1.4656967817219342, "learning_rate": 3.1019792040329535e-06, - "loss": 0.2053, + "loss": 0.2051, "step": 4561 }, { "epoch": 0.6240766073871409, - "grad_norm": 1.236963626023882, + "grad_norm": 1.220394049623176, "learning_rate": 3.0999913894172806e-06, - "loss": 0.1716, + "loss": 0.1689, "step": 4562 }, { "epoch": 0.6242134062927497, - "grad_norm": 1.172458050457913, + "grad_norm": 1.1510129265060849, "learning_rate": 3.0980039257316535e-06, - "loss": 0.1723, + "loss": 0.1729, "step": 4563 }, { "epoch": 0.6243502051983584, - "grad_norm": 1.135117433741681, + "grad_norm": 1.1263296161613927, "learning_rate": 3.096016813343158e-06, - "loss": 0.1678, + "loss": 0.1666, "step": 4564 }, { "epoch": 0.6244870041039672, - "grad_norm": 1.0820215811964473, + "grad_norm": 1.0577655846771945, "learning_rate": 3.0940300526188085e-06, - "loss": 0.1624, + "loss": 0.1604, "step": 4565 }, { "epoch": 0.6246238030095759, - "grad_norm": 1.2587351758830112, + "grad_norm": 1.2450937624089489, "learning_rate": 3.0920436439255608e-06, - "loss": 0.1797, + "loss": 0.1782, "step": 4566 }, { "epoch": 0.6247606019151847, - "grad_norm": 1.3268853119573696, + "grad_norm": 1.3086618342102996, "learning_rate": 3.0900575876302995e-06, - "loss": 0.1956, + "loss": 0.1969, "step": 4567 }, { "epoch": 0.6248974008207935, - "grad_norm": 1.0676817764592659, + "grad_norm": 1.0728208188678114, "learning_rate": 3.08807188409985e-06, - "loss": 0.1595, + "loss": 0.1593, "step": 4568 }, { "epoch": 0.6250341997264022, - "grad_norm": 1.176185669611275, + "grad_norm": 1.1947428072484874, "learning_rate": 3.086086533700971e-06, - "loss": 0.1895, + "loss": 0.1929, "step": 4569 }, { "epoch": 0.625170998632011, - "grad_norm": 1.1828130209533037, + "grad_norm": 1.167279848714866, "learning_rate": 3.084101536800353e-06, - "loss": 0.1906, + "loss": 0.1884, "step": 4570 }, { "epoch": 0.6253077975376197, - "grad_norm": 1.3255808891319423, + "grad_norm": 1.314271317862246, "learning_rate": 3.0821168937646262e-06, - "loss": 0.1895, + "loss": 0.193, "step": 4571 }, { "epoch": 0.6254445964432285, - "grad_norm": 1.246664528997544, + "grad_norm": 1.2436946255551118, "learning_rate": 3.080132604960349e-06, - "loss": 0.2042, + "loss": 0.2047, "step": 4572 }, { "epoch": 0.6255813953488372, - "grad_norm": 1.3793953217616206, + "grad_norm": 1.347525771941791, "learning_rate": 3.078148670754022e-06, - "loss": 0.2324, + "loss": 0.2323, "step": 4573 }, { "epoch": 0.625718194254446, - "grad_norm": 1.1948507843990679, + "grad_norm": 1.1694220964384978, "learning_rate": 3.076165091512071e-06, - "loss": 0.149, + "loss": 0.1493, "step": 4574 }, { "epoch": 0.6258549931600548, - "grad_norm": 1.2178479214801419, + "grad_norm": 1.2034497828274784, "learning_rate": 3.074181867600865e-06, - "loss": 0.1894, + "loss": 0.1887, "step": 4575 }, { "epoch": 0.6259917920656635, - "grad_norm": 1.154019944234776, + "grad_norm": 1.1397584083022174, "learning_rate": 3.0721989993867043e-06, - "loss": 0.2038, + "loss": 0.1997, "step": 4576 }, { "epoch": 0.6261285909712723, - "grad_norm": 1.3689577406902567, + "grad_norm": 1.3666438153817513, "learning_rate": 3.070216487235821e-06, - "loss": 0.1914, + "loss": 0.1895, "step": 4577 }, { "epoch": 0.626265389876881, - "grad_norm": 1.2966022463738511, + "grad_norm": 1.2872498898736848, "learning_rate": 3.0682343315143847e-06, - "loss": 0.2084, + "loss": 0.2098, "step": 4578 }, { "epoch": 0.6264021887824898, - "grad_norm": 1.257989171251903, + "grad_norm": 1.2432741467458628, "learning_rate": 3.0662525325884973e-06, - "loss": 0.1523, + "loss": 0.1521, "step": 4579 }, { "epoch": 0.6265389876880985, - "grad_norm": 1.2433512694527329, + "grad_norm": 1.2249190855137604, "learning_rate": 3.0642710908241973e-06, - "loss": 0.1661, + "loss": 0.1645, "step": 4580 }, { "epoch": 0.6266757865937073, - "grad_norm": 1.4549273502396847, + "grad_norm": 1.451216101598259, "learning_rate": 3.0622900065874523e-06, - "loss": 0.1817, + "loss": 0.1818, "step": 4581 }, { "epoch": 0.626812585499316, - "grad_norm": 1.2334529923322186, + "grad_norm": 1.2348654104737635, "learning_rate": 3.0603092802441703e-06, - "loss": 0.193, + "loss": 0.1953, "step": 4582 }, { "epoch": 0.6269493844049248, - "grad_norm": 1.0672512012048558, + "grad_norm": 1.0499811655413214, "learning_rate": 3.0583289121601856e-06, - "loss": 0.1687, + "loss": 0.1678, "step": 4583 }, { "epoch": 0.6270861833105336, - "grad_norm": 1.0583810587562192, + "grad_norm": 1.0235354790972695, "learning_rate": 3.056348902701274e-06, - "loss": 0.1724, + "loss": 0.1721, "step": 4584 }, { "epoch": 0.6272229822161423, - "grad_norm": 1.1675251138259999, + "grad_norm": 1.1388614305711964, "learning_rate": 3.0543692522331414e-06, - "loss": 0.1611, + "loss": 0.1571, "step": 4585 }, { "epoch": 0.6273597811217511, - "grad_norm": 1.1770147556684794, + "grad_norm": 1.1705742456280746, "learning_rate": 3.0523899611214257e-06, - "loss": 0.211, + "loss": 0.2116, "step": 4586 }, { "epoch": 0.6274965800273598, - "grad_norm": 1.3418924361189875, + "grad_norm": 1.3093088446247787, "learning_rate": 3.0504110297317036e-06, - "loss": 0.1861, + "loss": 0.1815, "step": 4587 }, { "epoch": 0.6276333789329686, - "grad_norm": 1.23139266944872, + "grad_norm": 1.2215954381593888, "learning_rate": 3.0484324584294784e-06, - "loss": 0.1819, + "loss": 0.1826, "step": 4588 }, { "epoch": 0.6277701778385772, - "grad_norm": 1.257626471279138, + "grad_norm": 1.2564067269408015, "learning_rate": 3.046454247580194e-06, - "loss": 0.1898, + "loss": 0.1904, "step": 4589 }, { "epoch": 0.627906976744186, - "grad_norm": 1.0847745749849174, + "grad_norm": 1.0759540389124418, "learning_rate": 3.044476397549221e-06, - "loss": 0.1478, + "loss": 0.1467, "step": 4590 }, { "epoch": 0.6280437756497949, - "grad_norm": 1.3815999660299612, + "grad_norm": 1.3654006604006461, "learning_rate": 3.042498908701868e-06, - "loss": 0.188, + "loss": 0.1882, "step": 4591 }, { "epoch": 0.6281805745554035, - "grad_norm": 1.255337201133176, + "grad_norm": 1.2565733189878163, "learning_rate": 3.040521781403377e-06, - "loss": 0.1707, + "loss": 0.1703, "step": 4592 }, { "epoch": 0.6283173734610124, - "grad_norm": 1.4416404059698658, + "grad_norm": 1.4221246936165872, "learning_rate": 3.03854501601892e-06, - "loss": 0.2068, + "loss": 0.207, "step": 4593 }, { "epoch": 0.628454172366621, - "grad_norm": 1.085323550279929, + "grad_norm": 1.0726180799176688, "learning_rate": 3.0365686129136067e-06, - "loss": 0.1497, + "loss": 0.1508, "step": 4594 }, { "epoch": 0.6285909712722298, - "grad_norm": 1.2295012719433205, + "grad_norm": 1.2256108216628563, "learning_rate": 3.0345925724524726e-06, - "loss": 0.1684, + "loss": 0.1697, "step": 4595 }, { "epoch": 0.6287277701778385, - "grad_norm": 1.1754443651863253, + "grad_norm": 1.1604823316772317, "learning_rate": 3.0326168950004963e-06, - "loss": 0.1762, + "loss": 0.1766, "step": 4596 }, { "epoch": 0.6288645690834473, - "grad_norm": 1.4470954727736132, + "grad_norm": 1.4565198437658873, "learning_rate": 3.030641580922579e-06, - "loss": 0.1896, + "loss": 0.1915, "step": 4597 }, { "epoch": 0.629001367989056, - "grad_norm": 1.222709818089464, + "grad_norm": 1.1835100413895299, "learning_rate": 3.028666630583561e-06, - "loss": 0.1929, + "loss": 0.1895, "step": 4598 }, { "epoch": 0.6291381668946648, - "grad_norm": 1.224007241783884, + "grad_norm": 1.2049941770943096, "learning_rate": 3.0266920443482145e-06, - "loss": 0.1722, + "loss": 0.1718, "step": 4599 }, { "epoch": 0.6292749658002736, - "grad_norm": 1.2630505009533992, + "grad_norm": 1.2527879309417551, "learning_rate": 3.0247178225812434e-06, - "loss": 0.1836, + "loss": 0.183, "step": 4600 }, { "epoch": 0.6292749658002736, - "eval_loss": 0.178886279463768, - "eval_runtime": 5.9139, - "eval_samples_per_second": 5.073, - "eval_steps_per_second": 1.353, + "eval_loss": 0.17863531410694122, + "eval_runtime": 5.9231, + "eval_samples_per_second": 5.065, + "eval_steps_per_second": 1.351, "step": 4600 }, { "epoch": 0.6294117647058823, - "grad_norm": 1.3161685868970368, + "grad_norm": 1.2986996000369708, "learning_rate": 3.0227439656472878e-06, - "loss": 0.2061, + "loss": 0.2084, "step": 4601 }, { "epoch": 0.6295485636114911, - "grad_norm": 1.4073017070937, + "grad_norm": 1.4184982643229533, "learning_rate": 3.0207704739109135e-06, - "loss": 0.2022, + "loss": 0.2032, "step": 4602 }, { "epoch": 0.6296853625170998, - "grad_norm": 1.5243495216288914, + "grad_norm": 1.5189216814442639, "learning_rate": 3.0187973477366256e-06, - "loss": 0.1915, + "loss": 0.1931, "step": 4603 }, { "epoch": 0.6298221614227086, - "grad_norm": 1.014409752236246, + "grad_norm": 1.0091364847352078, "learning_rate": 3.0168245874888557e-06, - "loss": 0.1617, + "loss": 0.162, "step": 4604 }, { "epoch": 0.6299589603283173, - "grad_norm": 1.2193418180781728, + "grad_norm": 1.201286723316984, "learning_rate": 3.0148521935319746e-06, - "loss": 0.1784, + "loss": 0.1782, "step": 4605 }, { "epoch": 0.6300957592339261, - "grad_norm": 1.1748424549074197, + "grad_norm": 1.148801479877321, "learning_rate": 3.012880166230279e-06, - "loss": 0.1626, + "loss": 0.163, "step": 4606 }, { "epoch": 0.6302325581395349, - "grad_norm": 1.1497843517863087, + "grad_norm": 1.1289652614168846, "learning_rate": 3.010908505948002e-06, - "loss": 0.1611, + "loss": 0.1603, "step": 4607 }, { "epoch": 0.6303693570451436, - "grad_norm": 1.218869065590767, + "grad_norm": 1.2163613745219142, "learning_rate": 3.00893721304931e-06, - "loss": 0.1705, + "loss": 0.1696, "step": 4608 }, { "epoch": 0.6305061559507524, - "grad_norm": 1.3393603132292764, + "grad_norm": 1.3307194886703666, "learning_rate": 3.006966287898294e-06, "loss": 0.1862, "step": 4609 }, { "epoch": 0.6306429548563611, - "grad_norm": 1.3102204366784358, + "grad_norm": 1.3124607710218605, "learning_rate": 3.0049957308589884e-06, - "loss": 0.1792, + "loss": 0.1788, "step": 4610 }, { "epoch": 0.6307797537619699, - "grad_norm": 1.352292914938853, + "grad_norm": 1.3014554700676113, "learning_rate": 3.003025542295347e-06, - "loss": 0.1777, + "loss": 0.1773, "step": 4611 }, { "epoch": 0.6309165526675786, - "grad_norm": 1.1156509449686365, + "grad_norm": 1.1050296012519334, "learning_rate": 3.0010557225712666e-06, - "loss": 0.1971, + "loss": 0.1978, "step": 4612 }, { "epoch": 0.6310533515731874, - "grad_norm": 1.3490943854906927, + "grad_norm": 1.3610337908971122, "learning_rate": 2.9990862720505704e-06, - "loss": 0.1826, + "loss": 0.1836, "step": 4613 }, { "epoch": 0.6311901504787961, - "grad_norm": 1.1253652540800563, + "grad_norm": 1.103226554323741, "learning_rate": 2.997117191097013e-06, - "loss": 0.1814, + "loss": 0.1817, "step": 4614 }, { "epoch": 0.6313269493844049, - "grad_norm": 1.3276373756554136, + "grad_norm": 1.3297001715957515, "learning_rate": 2.995148480074283e-06, - "loss": 0.1971, + "loss": 0.1982, "step": 4615 }, { "epoch": 0.6314637482900137, - "grad_norm": 1.1725793519060774, + "grad_norm": 1.1575909267628564, "learning_rate": 2.993180139345999e-06, - "loss": 0.1826, + "loss": 0.1824, "step": 4616 }, { "epoch": 0.6316005471956224, - "grad_norm": 1.3087263119114254, + "grad_norm": 1.3279314622833411, "learning_rate": 2.9912121692757146e-06, - "loss": 0.175, + "loss": 0.1779, "step": 4617 }, { "epoch": 0.6317373461012312, - "grad_norm": 1.356662406943105, + "grad_norm": 1.3385817853783493, "learning_rate": 2.9892445702269085e-06, - "loss": 0.1922, + "loss": 0.1918, "step": 4618 }, { "epoch": 0.6318741450068399, - "grad_norm": 1.2547887534892892, + "grad_norm": 1.2386488489125926, "learning_rate": 2.987277342562998e-06, - "loss": 0.1569, + "loss": 0.154, "step": 4619 }, { "epoch": 0.6320109439124487, - "grad_norm": 1.411505008892194, + "grad_norm": 1.42089431428086, "learning_rate": 2.9853104866473248e-06, "loss": 0.194, "step": 4620 }, { "epoch": 0.6321477428180574, - "grad_norm": 1.3179754431365396, + "grad_norm": 1.311691024672357, "learning_rate": 2.9833440028431692e-06, - "loss": 0.2216, + "loss": 0.2221, "step": 4621 }, { "epoch": 0.6322845417236662, - "grad_norm": 1.0218865835218904, + "grad_norm": 1.022518724431223, "learning_rate": 2.9813778915137364e-06, - "loss": 0.1486, + "loss": 0.1498, "step": 4622 }, { "epoch": 0.632421340629275, - "grad_norm": 1.3571786928142304, + "grad_norm": 1.344466872240332, "learning_rate": 2.9794121530221666e-06, - "loss": 0.2054, + "loss": 0.2061, "step": 4623 }, { "epoch": 0.6325581395348837, - "grad_norm": 1.246209171706476, + "grad_norm": 1.2201643692258342, "learning_rate": 2.977446787731532e-06, - "loss": 0.1784, + "loss": 0.1777, "step": 4624 }, { "epoch": 0.6326949384404925, - "grad_norm": 1.0951704663812256, + "grad_norm": 1.0994869419873683, "learning_rate": 2.975481796004831e-06, - "loss": 0.1416, + "loss": 0.1422, "step": 4625 }, { "epoch": 0.6328317373461012, - "grad_norm": 1.146457957286378, + "grad_norm": 1.1308806184334024, "learning_rate": 2.9735171782049987e-06, - "loss": 0.1573, + "loss": 0.1564, "step": 4626 }, { "epoch": 0.63296853625171, - "grad_norm": 1.2645389533270386, + "grad_norm": 1.2638131403112967, "learning_rate": 2.9715529346948946e-06, - "loss": 0.2025, + "loss": 0.203, "step": 4627 }, { "epoch": 0.6331053351573187, - "grad_norm": 1.2438833629342059, + "grad_norm": 1.2417554177083583, "learning_rate": 2.9695890658373166e-06, - "loss": 0.1735, + "loss": 0.1741, "step": 4628 }, { "epoch": 0.6332421340629275, - "grad_norm": 1.3431664331621187, + "grad_norm": 1.3229905517442115, "learning_rate": 2.9676255719949875e-06, - "loss": 0.2082, + "loss": 0.208, "step": 4629 }, { "epoch": 0.6333789329685362, - "grad_norm": 1.364321554526624, + "grad_norm": 1.3458513644441032, "learning_rate": 2.9656624535305657e-06, - "loss": 0.178, + "loss": 0.1772, "step": 4630 }, { "epoch": 0.633515731874145, - "grad_norm": 1.3475841010684526, + "grad_norm": 1.3384495207166094, "learning_rate": 2.9636997108066334e-06, - "loss": 0.1679, + "loss": 0.1668, "step": 4631 }, { "epoch": 0.6336525307797538, - "grad_norm": 1.2904836079030533, + "grad_norm": 1.2731890197765061, "learning_rate": 2.96173734418571e-06, - "loss": 0.2023, + "loss": 0.2022, "step": 4632 }, { "epoch": 0.6337893296853625, - "grad_norm": 1.2450599609379942, + "grad_norm": 1.2391347745287402, "learning_rate": 2.9597753540302443e-06, - "loss": 0.2134, + "loss": 0.2123, "step": 4633 }, { "epoch": 0.6339261285909713, - "grad_norm": 1.3308055100611396, + "grad_norm": 1.3248106988117, "learning_rate": 2.957813740702612e-06, - "loss": 0.2043, + "loss": 0.2031, "step": 4634 }, { "epoch": 0.63406292749658, - "grad_norm": 1.508557826081899, + "grad_norm": 1.5046181362829059, "learning_rate": 2.9558525045651226e-06, - "loss": 0.2506, + "loss": 0.2462, "step": 4635 }, { "epoch": 0.6341997264021888, - "grad_norm": 1.098131223665679, + "grad_norm": 1.10049945610898, "learning_rate": 2.9538916459800136e-06, - "loss": 0.1793, + "loss": 0.1808, "step": 4636 }, { "epoch": 0.6343365253077975, - "grad_norm": 1.2148574300506998, + "grad_norm": 1.1912041327465202, "learning_rate": 2.9519311653094575e-06, - "loss": 0.1926, + "loss": 0.1933, "step": 4637 }, { "epoch": 0.6344733242134063, - "grad_norm": 1.2616004403442755, + "grad_norm": 1.2451301396282253, "learning_rate": 2.9499710629155486e-06, - "loss": 0.1707, + "loss": 0.1716, "step": 4638 }, { "epoch": 0.6346101231190151, - "grad_norm": 1.5316623050884355, + "grad_norm": 1.5213371298057694, "learning_rate": 2.948011339160318e-06, - "loss": 0.2105, + "loss": 0.2114, "step": 4639 }, { "epoch": 0.6347469220246238, - "grad_norm": 1.2078955609648172, + "grad_norm": 1.387088992284501, "learning_rate": 2.9460519944057285e-06, - "loss": 0.1707, + "loss": 0.1727, "step": 4640 }, { "epoch": 0.6348837209302326, - "grad_norm": 1.1280233644114601, + "grad_norm": 1.13301394162667, "learning_rate": 2.9440930290136643e-06, - "loss": 0.1983, + "loss": 0.2008, "step": 4641 }, { "epoch": 0.6350205198358413, - "grad_norm": 1.195886776985376, + "grad_norm": 1.1942804193214054, "learning_rate": 2.942134443345949e-06, - "loss": 0.1869, + "loss": 0.1876, "step": 4642 }, { "epoch": 0.6351573187414501, - "grad_norm": 1.5405234910291172, + "grad_norm": 1.5454272500344546, "learning_rate": 2.9401762377643273e-06, - "loss": 0.2522, + "loss": 0.2548, "step": 4643 }, { "epoch": 0.6352941176470588, - "grad_norm": 1.1165042342899258, + "grad_norm": 1.098722790843912, "learning_rate": 2.9382184126304834e-06, - "loss": 0.1317, + "loss": 0.1307, "step": 4644 }, { "epoch": 0.6354309165526676, - "grad_norm": 1.3488741158860638, + "grad_norm": 1.324080916208011, "learning_rate": 2.9362609683060216e-06, - "loss": 0.212, + "loss": 0.2104, "step": 4645 }, { "epoch": 0.6355677154582763, - "grad_norm": 1.3263570946487102, + "grad_norm": 1.295315034118139, "learning_rate": 2.9343039051524834e-06, - "loss": 0.1865, + "loss": 0.1852, "step": 4646 }, { "epoch": 0.6357045143638851, - "grad_norm": 1.3292452662519485, + "grad_norm": 1.3208534434552426, "learning_rate": 2.932347223531334e-06, - "loss": 0.1736, + "loss": 0.1739, "step": 4647 }, { "epoch": 0.6358413132694939, - "grad_norm": 1.116705043973571, + "grad_norm": 1.0962326500704207, "learning_rate": 2.9303909238039716e-06, - "loss": 0.1724, + "loss": 0.1714, "step": 4648 }, { "epoch": 0.6359781121751026, - "grad_norm": 1.1402654646617179, + "grad_norm": 1.1349827771877308, "learning_rate": 2.928435006331727e-06, - "loss": 0.1581, + "loss": 0.1573, "step": 4649 }, { "epoch": 0.6361149110807114, - "grad_norm": 1.0869189107212167, + "grad_norm": 1.064494621112353, "learning_rate": 2.9264794714758504e-06, - "loss": 0.1526, + "loss": 0.1525, "step": 4650 }, { "epoch": 0.6362517099863201, - "grad_norm": 1.6400000452208878, + "grad_norm": 1.6904193797688876, "learning_rate": 2.924524319597532e-06, - "loss": 0.2133, + "loss": 0.2167, "step": 4651 }, { "epoch": 0.6363885088919289, - "grad_norm": 1.3626998058668045, + "grad_norm": 1.3590651001701515, "learning_rate": 2.9225695510578845e-06, - "loss": 0.1969, + "loss": 0.1971, "step": 4652 }, { "epoch": 0.6365253077975376, - "grad_norm": 1.6245736241379012, + "grad_norm": 1.62924879966067, "learning_rate": 2.9206151662179536e-06, - "loss": 0.2032, + "loss": 0.2027, "step": 4653 }, { "epoch": 0.6366621067031464, - "grad_norm": 1.1261611022891556, + "grad_norm": 1.1219341851775733, "learning_rate": 2.9186611654387107e-06, - "loss": 0.1956, + "loss": 0.1964, "step": 4654 }, { "epoch": 0.6367989056087552, - "grad_norm": 1.3179506336717508, + "grad_norm": 1.321765472008712, "learning_rate": 2.9167075490810603e-06, - "loss": 0.2168, + "loss": 0.2186, "step": 4655 }, { "epoch": 0.6369357045143639, - "grad_norm": 1.3036147937734432, + "grad_norm": 1.2832376081805266, "learning_rate": 2.9147543175058335e-06, - "loss": 0.1779, + "loss": 0.1745, "step": 4656 }, { "epoch": 0.6370725034199727, - "grad_norm": 1.2302308068041181, + "grad_norm": 1.2137391396044575, "learning_rate": 2.9128014710737882e-06, - "loss": 0.1937, + "loss": 0.1954, "step": 4657 }, { "epoch": 0.6372093023255814, - "grad_norm": 1.1468384136190022, + "grad_norm": 1.1275670668488242, "learning_rate": 2.910849010145617e-06, - "loss": 0.1652, + "loss": 0.165, "step": 4658 }, { "epoch": 0.6373461012311902, - "grad_norm": 1.1958764947221512, + "grad_norm": 1.1967497668707179, "learning_rate": 2.908896935081934e-06, - "loss": 0.1818, + "loss": 0.1831, "step": 4659 }, { "epoch": 0.6374829001367989, - "grad_norm": 1.2083877425832539, + "grad_norm": 1.2029823925085155, "learning_rate": 2.9069452462432883e-06, - "loss": 0.1531, + "loss": 0.1539, "step": 4660 }, { "epoch": 0.6376196990424077, - "grad_norm": 1.0975265994304486, + "grad_norm": 1.0845249915581432, "learning_rate": 2.9049939439901547e-06, - "loss": 0.1684, + "loss": 0.1671, "step": 4661 }, { "epoch": 0.6377564979480164, - "grad_norm": 1.3676434009831016, + "grad_norm": 1.3360713191776836, "learning_rate": 2.903043028682939e-06, - "loss": 0.1926, + "loss": 0.1925, "step": 4662 }, { "epoch": 0.6378932968536252, - "grad_norm": 1.3773482654829088, + "grad_norm": 1.3610417017894105, "learning_rate": 2.9010925006819703e-06, - "loss": 0.1806, + "loss": 0.1814, "step": 4663 }, { "epoch": 0.638030095759234, - "grad_norm": 1.3797850191408776, + "grad_norm": 1.373308886948593, "learning_rate": 2.899142360347511e-06, - "loss": 0.1951, + "loss": 0.194, "step": 4664 }, { "epoch": 0.6381668946648427, - "grad_norm": 1.145998755463536, + "grad_norm": 1.1358108088792533, "learning_rate": 2.897192608039753e-06, - "loss": 0.1646, + "loss": 0.1645, "step": 4665 }, { "epoch": 0.6383036935704515, - "grad_norm": 1.4546687603327892, + "grad_norm": 1.3692095956450323, "learning_rate": 2.8952432441188095e-06, - "loss": 0.2028, + "loss": 0.1995, "step": 4666 }, { "epoch": 0.6384404924760602, - "grad_norm": 1.3915823270292824, + "grad_norm": 1.3795111690779347, "learning_rate": 2.89329426894473e-06, - "loss": 0.2057, + "loss": 0.2066, "step": 4667 }, { "epoch": 0.638577291381669, - "grad_norm": 1.3542244818916198, + "grad_norm": 1.3461358031510802, "learning_rate": 2.8913456828774855e-06, - "loss": 0.1695, + "loss": 0.1709, "step": 4668 }, { "epoch": 0.6387140902872777, - "grad_norm": 1.219096953734071, + "grad_norm": 1.205066770136844, "learning_rate": 2.8893974862769825e-06, - "loss": 0.1868, + "loss": 0.1853, "step": 4669 }, { "epoch": 0.6388508891928865, - "grad_norm": 1.2863410981111558, + "grad_norm": 1.2566287651786818, "learning_rate": 2.887449679503046e-06, - "loss": 0.1816, + "loss": 0.181, "step": 4670 }, { "epoch": 0.6389876880984953, - "grad_norm": 1.4183793309971422, + "grad_norm": 1.4383606151038164, "learning_rate": 2.8855022629154373e-06, "loss": 0.1919, "step": 4671 }, { "epoch": 0.639124487004104, - "grad_norm": 1.0939295595141847, + "grad_norm": 1.084689740200428, "learning_rate": 2.883555236873845e-06, - "loss": 0.1743, + "loss": 0.1753, "step": 4672 }, { "epoch": 0.6392612859097128, - "grad_norm": 1.259306366378498, + "grad_norm": 1.2382724798105977, "learning_rate": 2.881608601737878e-06, - "loss": 0.1585, + "loss": 0.1586, "step": 4673 }, { "epoch": 0.6393980848153215, - "grad_norm": 1.613245053056614, + "grad_norm": 1.4664691579176186, "learning_rate": 2.879662357867083e-06, - "loss": 0.2415, + "loss": 0.243, "step": 4674 }, { "epoch": 0.6395348837209303, - "grad_norm": 1.285247130351729, + "grad_norm": 1.2537354704609747, "learning_rate": 2.8777165056209256e-06, - "loss": 0.193, + "loss": 0.191, "step": 4675 }, { "epoch": 0.639671682626539, - "grad_norm": 0.8876448905420614, + "grad_norm": 0.8691196736114889, "learning_rate": 2.8757710453588052e-06, - "loss": 0.1413, + "loss": 0.1403, "step": 4676 }, { "epoch": 0.6398084815321478, - "grad_norm": 1.4939739546721373, + "grad_norm": 1.4779601667252935, "learning_rate": 2.8738259774400468e-06, - "loss": 0.2296, + "loss": 0.2302, "step": 4677 }, { "epoch": 0.6399452804377564, - "grad_norm": 1.2922950909871334, + "grad_norm": 1.2756141469669662, "learning_rate": 2.8718813022239044e-06, - "loss": 0.1584, + "loss": 0.1597, "step": 4678 }, { "epoch": 0.6400820793433653, - "grad_norm": 1.174965098306698, + "grad_norm": 1.1513200192826072, "learning_rate": 2.8699370200695544e-06, - "loss": 0.1901, + "loss": 0.1887, "step": 4679 }, { "epoch": 0.640218878248974, - "grad_norm": 1.3004001220495625, + "grad_norm": 1.2958118474114977, "learning_rate": 2.8679931313361053e-06, - "loss": 0.178, + "loss": 0.1785, "step": 4680 }, { "epoch": 0.6403556771545827, - "grad_norm": 1.3271136601860982, + "grad_norm": 1.3262768287954154, "learning_rate": 2.8660496363825953e-06, - "loss": 0.21, + "loss": 0.212, "step": 4681 }, { "epoch": 0.6404924760601916, - "grad_norm": 1.4858060219413107, + "grad_norm": 1.5123274719754816, "learning_rate": 2.864106535567981e-06, - "loss": 0.1885, + "loss": 0.19, "step": 4682 }, { "epoch": 0.6406292749658002, - "grad_norm": 1.2889732478894786, + "grad_norm": 1.2868282734090326, "learning_rate": 2.8621638292511566e-06, - "loss": 0.1783, + "loss": 0.1816, "step": 4683 }, { "epoch": 0.640766073871409, - "grad_norm": 1.1570454229899243, + "grad_norm": 1.1566484023369337, "learning_rate": 2.8602215177909333e-06, - "loss": 0.2061, + "loss": 0.2053, "step": 4684 }, { "epoch": 0.6409028727770177, - "grad_norm": 1.3817034820059741, + "grad_norm": 1.354300990781212, "learning_rate": 2.858279601546059e-06, - "loss": 0.2323, + "loss": 0.2319, "step": 4685 }, { "epoch": 0.6410396716826265, - "grad_norm": 1.232407781876874, + "grad_norm": 1.2077522380045516, "learning_rate": 2.8563380808751995e-06, - "loss": 0.1743, + "loss": 0.1734, "step": 4686 }, { "epoch": 0.6411764705882353, - "grad_norm": 1.1704572218580849, + "grad_norm": 1.1737838051973526, "learning_rate": 2.8543969561369556e-06, - "loss": 0.1696, + "loss": 0.1689, "step": 4687 }, { "epoch": 0.641313269493844, - "grad_norm": 1.2453022102395597, + "grad_norm": 1.2179104444318798, "learning_rate": 2.8524562276898515e-06, - "loss": 0.193, + "loss": 0.1907, "step": 4688 }, { "epoch": 0.6414500683994528, - "grad_norm": 1.1789123796576968, + "grad_norm": 1.1697914200852662, "learning_rate": 2.850515895892336e-06, - "loss": 0.1601, + "loss": 0.1608, "step": 4689 }, { "epoch": 0.6415868673050615, - "grad_norm": 1.2897516486071057, + "grad_norm": 1.2768262723771242, "learning_rate": 2.8485759611027875e-06, - "loss": 0.1711, + "loss": 0.1707, "step": 4690 }, { "epoch": 0.6417236662106703, - "grad_norm": 1.4322424230679387, + "grad_norm": 1.3918932462103681, "learning_rate": 2.8466364236795102e-06, - "loss": 0.2414, + "loss": 0.2387, "step": 4691 }, { "epoch": 0.641860465116279, - "grad_norm": 1.306281495485571, + "grad_norm": 1.3065051935329923, "learning_rate": 2.8446972839807384e-06, - "loss": 0.2164, + "loss": 0.2185, "step": 4692 }, { "epoch": 0.6419972640218878, - "grad_norm": 1.4613181508101125, + "grad_norm": 1.4681471327526656, "learning_rate": 2.8427585423646243e-06, - "loss": 0.2564, + "loss": 0.2554, "step": 4693 }, { "epoch": 0.6421340629274965, - "grad_norm": 1.0650842306513908, + "grad_norm": 1.055241171141442, "learning_rate": 2.840820199189257e-06, - "loss": 0.1697, + "loss": 0.1691, "step": 4694 }, { "epoch": 0.6422708618331053, - "grad_norm": 1.2596233213903545, + "grad_norm": 1.252137086643019, "learning_rate": 2.838882254812643e-06, - "loss": 0.2007, + "loss": 0.2008, "step": 4695 }, { "epoch": 0.6424076607387141, - "grad_norm": 1.4103616991590213, + "grad_norm": 1.3877885970938717, "learning_rate": 2.8369447095927195e-06, - "loss": 0.2027, + "loss": 0.2021, "step": 4696 }, { "epoch": 0.6425444596443228, - "grad_norm": 1.3987427884906427, + "grad_norm": 1.3757889048288754, "learning_rate": 2.8350075638873525e-06, - "loss": 0.1801, + "loss": 0.1772, "step": 4697 }, { "epoch": 0.6426812585499316, - "grad_norm": 1.081864549510789, + "grad_norm": 1.047713462041298, "learning_rate": 2.8330708180543277e-06, - "loss": 0.1864, + "loss": 0.1839, "step": 4698 }, { "epoch": 0.6428180574555403, - "grad_norm": 1.2734032873439207, + "grad_norm": 1.2673055199397163, "learning_rate": 2.8311344724513635e-06, "loss": 0.197, "step": 4699 }, { "epoch": 0.6429548563611491, - "grad_norm": 1.347007485762506, + "grad_norm": 1.348392188515362, "learning_rate": 2.8291985274360985e-06, - "loss": 0.2285, + "loss": 0.2281, "step": 4700 }, { "epoch": 0.6429548563611491, - "eval_loss": 0.1791599541902542, - "eval_runtime": 5.9049, - "eval_samples_per_second": 5.081, - "eval_steps_per_second": 1.355, + "eval_loss": 0.17918221652507782, + "eval_runtime": 5.9196, + "eval_samples_per_second": 5.068, + "eval_steps_per_second": 1.351, "step": 4700 }, { "epoch": 0.6430916552667578, - "grad_norm": 1.2273825422102573, + "grad_norm": 1.2268563404178876, "learning_rate": 2.827262983366103e-06, - "loss": 0.1823, + "loss": 0.1839, "step": 4701 }, { "epoch": 0.6432284541723666, - "grad_norm": 1.0141468205758428, + "grad_norm": 1.0108955785539946, "learning_rate": 2.825327840598867e-06, - "loss": 0.1591, + "loss": 0.1587, "step": 4702 }, { "epoch": 0.6433652530779754, - "grad_norm": 0.8795381778957619, + "grad_norm": 0.8708915114721884, "learning_rate": 2.8233930994918123e-06, - "loss": 0.1147, + "loss": 0.1144, "step": 4703 }, { "epoch": 0.6435020519835841, - "grad_norm": 1.3984239562341296, + "grad_norm": 1.3903145566191515, "learning_rate": 2.8214587604022847e-06, - "loss": 0.169, + "loss": 0.1698, "step": 4704 }, { "epoch": 0.6436388508891929, - "grad_norm": 1.0430337618303496, + "grad_norm": 1.042130653265086, "learning_rate": 2.819524823687553e-06, "loss": 0.1586, "step": 4705 }, { "epoch": 0.6437756497948016, - "grad_norm": 1.32048707365888, + "grad_norm": 1.2023799040784102, "learning_rate": 2.817591289704813e-06, - "loss": 0.2095, + "loss": 0.2069, "step": 4706 }, { "epoch": 0.6439124487004104, - "grad_norm": 1.2653712924505454, + "grad_norm": 1.2609762711692454, "learning_rate": 2.81565815881119e-06, - "loss": 0.2117, + "loss": 0.2116, "step": 4707 }, { "epoch": 0.6440492476060191, - "grad_norm": 1.0230467905651757, + "grad_norm": 0.9716157768814238, "learning_rate": 2.8137254313637306e-06, - "loss": 0.1541, + "loss": 0.1494, "step": 4708 }, { "epoch": 0.6441860465116279, - "grad_norm": 1.1524535969324847, + "grad_norm": 1.1262998781452151, "learning_rate": 2.8117931077194067e-06, - "loss": 0.1465, + "loss": 0.1459, "step": 4709 }, { "epoch": 0.6443228454172366, - "grad_norm": 0.9553832358542752, + "grad_norm": 0.9553400417587845, "learning_rate": 2.809861188235119e-06, - "loss": 0.1224, + "loss": 0.1232, "step": 4710 }, { "epoch": 0.6444596443228454, - "grad_norm": 1.4224598283217444, + "grad_norm": 1.4221282287607935, "learning_rate": 2.8079296732676885e-06, - "loss": 0.2064, + "loss": 0.2055, "step": 4711 }, { "epoch": 0.6445964432284542, - "grad_norm": 1.3435201003778823, + "grad_norm": 1.317893614572662, "learning_rate": 2.805998563173866e-06, - "loss": 0.1753, + "loss": 0.1755, "step": 4712 }, { "epoch": 0.6447332421340629, - "grad_norm": 1.3574637185738214, + "grad_norm": 1.3867448054587517, "learning_rate": 2.8040678583103288e-06, - "loss": 0.1626, + "loss": 0.1637, "step": 4713 }, { "epoch": 0.6448700410396717, - "grad_norm": 1.0719497610339355, + "grad_norm": 1.0322662003809049, "learning_rate": 2.8021375590336706e-06, - "loss": 0.1548, + "loss": 0.1531, "step": 4714 }, { "epoch": 0.6450068399452804, - "grad_norm": 1.193128690948642, + "grad_norm": 1.1624719845860094, "learning_rate": 2.800207665700422e-06, - "loss": 0.1835, + "loss": 0.1833, "step": 4715 }, { "epoch": 0.6451436388508892, - "grad_norm": 1.1911231677673957, + "grad_norm": 1.1729152942325298, "learning_rate": 2.798278178667028e-06, - "loss": 0.19, + "loss": 0.1914, "step": 4716 }, { "epoch": 0.6452804377564979, - "grad_norm": 1.1933389814956485, + "grad_norm": 1.1761541483535145, "learning_rate": 2.7963490982898676e-06, - "loss": 0.2082, + "loss": 0.2079, "step": 4717 }, { "epoch": 0.6454172366621067, - "grad_norm": 1.2194878474332032, + "grad_norm": 1.2007110760994937, "learning_rate": 2.794420424925236e-06, - "loss": 0.1555, + "loss": 0.1548, "step": 4718 }, { "epoch": 0.6455540355677155, - "grad_norm": 1.56787043793806, + "grad_norm": 1.5633098788690802, "learning_rate": 2.7924921589293596e-06, - "loss": 0.2352, + "loss": 0.236, "step": 4719 }, { "epoch": 0.6456908344733242, - "grad_norm": 1.3391587158719431, + "grad_norm": 1.384315271002874, "learning_rate": 2.790564300658387e-06, - "loss": 0.1886, + "loss": 0.19, "step": 4720 }, { "epoch": 0.645827633378933, - "grad_norm": 1.0057356168497311, + "grad_norm": 1.0099154441646647, "learning_rate": 2.7886368504683935e-06, - "loss": 0.1566, + "loss": 0.1592, "step": 4721 }, { "epoch": 0.6459644322845417, - "grad_norm": 1.2207734267167747, + "grad_norm": 1.1941380443946892, "learning_rate": 2.786709808715379e-06, - "loss": 0.2015, + "loss": 0.2006, "step": 4722 }, { "epoch": 0.6461012311901505, - "grad_norm": 1.106633590848504, + "grad_norm": 1.1543073334241087, "learning_rate": 2.7847831757552625e-06, - "loss": 0.1474, + "loss": 0.1495, "step": 4723 }, { "epoch": 0.6462380300957592, - "grad_norm": 1.1646346633125633, + "grad_norm": 1.1541222443841748, "learning_rate": 2.782856951943894e-06, - "loss": 0.186, + "loss": 0.1858, "step": 4724 }, { "epoch": 0.646374829001368, - "grad_norm": 1.6932294529433458, + "grad_norm": 1.682973836087115, "learning_rate": 2.780931137637044e-06, - "loss": 0.2425, + "loss": 0.2445, "step": 4725 }, { "epoch": 0.6465116279069767, - "grad_norm": 1.2470763062502117, + "grad_norm": 1.248059209363102, "learning_rate": 2.779005733190412e-06, - "loss": 0.1912, + "loss": 0.1924, "step": 4726 }, { "epoch": 0.6466484268125855, - "grad_norm": 1.1760513239311918, + "grad_norm": 1.1607208712640231, "learning_rate": 2.7770807389596136e-06, - "loss": 0.1618, + "loss": 0.1626, "step": 4727 }, { "epoch": 0.6467852257181943, - "grad_norm": 1.2414045552475432, + "grad_norm": 1.2278941293079324, "learning_rate": 2.775156155300197e-06, - "loss": 0.1779, + "loss": 0.1778, "step": 4728 }, { "epoch": 0.646922024623803, - "grad_norm": 1.2844537436748262, + "grad_norm": 1.2500074852283116, "learning_rate": 2.7732319825676325e-06, - "loss": 0.2018, + "loss": 0.2034, "step": 4729 }, { "epoch": 0.6470588235294118, - "grad_norm": 1.3860687447603341, + "grad_norm": 1.3632179164526088, "learning_rate": 2.771308221117309e-06, - "loss": 0.1983, + "loss": 0.1981, "step": 4730 }, { "epoch": 0.6471956224350205, - "grad_norm": 1.388386787049725, + "grad_norm": 1.3769405703352198, "learning_rate": 2.7693848713045486e-06, - "loss": 0.2056, + "loss": 0.2081, "step": 4731 }, { "epoch": 0.6473324213406293, - "grad_norm": 1.4475773369171814, + "grad_norm": 1.412130121478495, "learning_rate": 2.7674619334845876e-06, - "loss": 0.1994, + "loss": 0.1969, "step": 4732 }, { "epoch": 0.647469220246238, - "grad_norm": 1.3898277505121701, + "grad_norm": 1.3678664825608209, "learning_rate": 2.7655394080125935e-06, - "loss": 0.1889, + "loss": 0.1915, "step": 4733 }, { "epoch": 0.6476060191518468, - "grad_norm": 1.1691167164470049, + "grad_norm": 1.146095713459494, "learning_rate": 2.763617295243657e-06, - "loss": 0.1609, + "loss": 0.16, "step": 4734 }, { "epoch": 0.6477428180574556, - "grad_norm": 1.494147358568718, + "grad_norm": 1.469117010977461, "learning_rate": 2.7616955955327875e-06, - "loss": 0.2258, + "loss": 0.2268, "step": 4735 }, { "epoch": 0.6478796169630643, - "grad_norm": 1.2436665190396439, + "grad_norm": 1.225615365570979, "learning_rate": 2.7597743092349217e-06, - "loss": 0.1795, + "loss": 0.179, "step": 4736 }, { "epoch": 0.6480164158686731, - "grad_norm": 1.403306510203395, + "grad_norm": 1.402689916247632, "learning_rate": 2.7578534367049215e-06, - "loss": 0.2178, + "loss": 0.22, "step": 4737 }, { "epoch": 0.6481532147742818, - "grad_norm": 1.3160797661258352, + "grad_norm": 1.3069790871397513, "learning_rate": 2.755932978297571e-06, - "loss": 0.2143, + "loss": 0.2144, "step": 4738 }, { "epoch": 0.6482900136798906, - "grad_norm": 1.41596378032996, + "grad_norm": 1.439391210933022, "learning_rate": 2.7540129343675737e-06, - "loss": 0.2257, + "loss": 0.227, "step": 4739 }, { "epoch": 0.6484268125854993, - "grad_norm": 1.0096348695456132, + "grad_norm": 0.9858097679022728, "learning_rate": 2.7520933052695653e-06, - "loss": 0.1675, + "loss": 0.1686, "step": 4740 }, { "epoch": 0.6485636114911081, - "grad_norm": 1.3128064010641998, + "grad_norm": 1.301095953853261, "learning_rate": 2.7501740913580943e-06, - "loss": 0.2052, + "loss": 0.2065, "step": 4741 }, { "epoch": 0.6487004103967168, - "grad_norm": 1.1858425607177074, + "grad_norm": 1.1751631072459112, "learning_rate": 2.7482552929876426e-06, - "loss": 0.1935, + "loss": 0.1934, "step": 4742 }, { "epoch": 0.6488372093023256, - "grad_norm": 1.395175637953419, + "grad_norm": 1.3755244976372787, "learning_rate": 2.746336910512606e-06, - "loss": 0.2149, + "loss": 0.2165, "step": 4743 }, { "epoch": 0.6489740082079344, - "grad_norm": 1.1977028747165477, + "grad_norm": 1.1737655219126117, "learning_rate": 2.7444189442873115e-06, - "loss": 0.1966, + "loss": 0.1951, "step": 4744 }, { "epoch": 0.6491108071135431, - "grad_norm": 1.0098765328604393, + "grad_norm": 0.9989002755390102, "learning_rate": 2.7425013946660074e-06, - "loss": 0.182, + "loss": 0.1808, "step": 4745 }, { "epoch": 0.6492476060191519, - "grad_norm": 0.9663827258047879, + "grad_norm": 0.9596501726160798, "learning_rate": 2.7405842620028583e-06, - "loss": 0.1417, + "loss": 0.1412, "step": 4746 }, { "epoch": 0.6493844049247606, - "grad_norm": 1.2833672251607269, + "grad_norm": 1.2697312884861176, "learning_rate": 2.738667546651963e-06, - "loss": 0.1652, + "loss": 0.1657, "step": 4747 }, { "epoch": 0.6495212038303694, - "grad_norm": 1.1223473666879247, + "grad_norm": 1.1050485485751957, "learning_rate": 2.7367512489673315e-06, - "loss": 0.1923, + "loss": 0.1905, "step": 4748 }, { "epoch": 0.6496580027359781, - "grad_norm": 1.3891395740437325, + "grad_norm": 1.3841437442368627, "learning_rate": 2.734835369302906e-06, - "loss": 0.1868, + "loss": 0.1882, "step": 4749 }, { "epoch": 0.6497948016415869, - "grad_norm": 1.4337092925454158, + "grad_norm": 1.399564247591895, "learning_rate": 2.7329199080125466e-06, - "loss": 0.2297, + "loss": 0.2274, "step": 4750 }, { "epoch": 0.6499316005471957, - "grad_norm": 1.0014654480692808, + "grad_norm": 1.0000564593129162, "learning_rate": 2.7310048654500387e-06, - "loss": 0.1552, + "loss": 0.1564, "step": 4751 }, { "epoch": 0.6500683994528044, - "grad_norm": 1.2284139014422095, + "grad_norm": 1.2212902754768014, "learning_rate": 2.7290902419690895e-06, - "loss": 0.16, + "loss": 0.1625, "step": 4752 }, { "epoch": 0.6502051983584132, - "grad_norm": 1.3046238546757898, + "grad_norm": 1.2805856299266611, "learning_rate": 2.7271760379233258e-06, - "loss": 0.1895, + "loss": 0.1866, "step": 4753 }, { "epoch": 0.6503419972640219, - "grad_norm": 1.1281560332782723, + "grad_norm": 1.109687294865886, "learning_rate": 2.725262253666303e-06, - "loss": 0.1642, + "loss": 0.1632, "step": 4754 }, { "epoch": 0.6504787961696307, - "grad_norm": 1.0933151246825508, + "grad_norm": 1.0747123646357208, "learning_rate": 2.723348889551492e-06, "loss": 0.1669, "step": 4755 }, { "epoch": 0.6506155950752394, - "grad_norm": 1.0321371882033987, + "grad_norm": 1.0204066931144404, "learning_rate": 2.7214359459322925e-06, - "loss": 0.1551, + "loss": 0.155, "step": 4756 }, { "epoch": 0.6507523939808482, - "grad_norm": 1.2586573821071392, + "grad_norm": 1.2291356360385537, "learning_rate": 2.7195234231620204e-06, - "loss": 0.1851, + "loss": 0.1844, "step": 4757 }, { "epoch": 0.6508891928864569, - "grad_norm": 1.3708211509975374, + "grad_norm": 1.3565368854698217, "learning_rate": 2.7176113215939216e-06, - "loss": 0.2491, + "loss": 0.2512, "step": 4758 }, { "epoch": 0.6510259917920657, - "grad_norm": 1.3335706517809909, + "grad_norm": 1.3439126383222773, "learning_rate": 2.7156996415811544e-06, - "loss": 0.1927, + "loss": 0.1954, "step": 4759 }, { "epoch": 0.6511627906976745, - "grad_norm": 1.1413523948246744, + "grad_norm": 1.1215564661909412, "learning_rate": 2.7137883834768076e-06, - "loss": 0.1429, + "loss": 0.1424, "step": 4760 }, { "epoch": 0.6512995896032832, - "grad_norm": 1.28515985151741, + "grad_norm": 1.2656103029814874, "learning_rate": 2.711877547633891e-06, - "loss": 0.182, + "loss": 0.1838, "step": 4761 }, { "epoch": 0.651436388508892, - "grad_norm": 1.2587197608368894, + "grad_norm": 1.284937531853387, "learning_rate": 2.7099671344053293e-06, - "loss": 0.1829, + "loss": 0.1846, "step": 4762 }, { "epoch": 0.6515731874145007, - "grad_norm": 1.103456707882836, + "grad_norm": 1.0869449230724533, "learning_rate": 2.7080571441439773e-06, - "loss": 0.1708, + "loss": 0.1712, "step": 4763 }, { "epoch": 0.6517099863201095, - "grad_norm": 1.520567894338906, + "grad_norm": 1.493717246981272, "learning_rate": 2.7061475772026085e-06, - "loss": 0.2323, + "loss": 0.2339, "step": 4764 }, { "epoch": 0.6518467852257182, - "grad_norm": 1.3894385285201287, + "grad_norm": 1.3793778579021534, "learning_rate": 2.70423843393392e-06, - "loss": 0.2218, + "loss": 0.2242, "step": 4765 }, { "epoch": 0.651983584131327, - "grad_norm": 1.1361566478734269, + "grad_norm": 1.1252917756350374, "learning_rate": 2.7023297146905258e-06, - "loss": 0.1684, + "loss": 0.1704, "step": 4766 }, { "epoch": 0.6521203830369358, - "grad_norm": 1.235155334479272, + "grad_norm": 1.2334513808598493, "learning_rate": 2.7004214198249655e-06, - "loss": 0.188, + "loss": 0.1855, "step": 4767 }, { "epoch": 0.6522571819425445, - "grad_norm": 1.4236994821785411, + "grad_norm": 1.3883128221584824, "learning_rate": 2.698513549689703e-06, - "loss": 0.2034, + "loss": 0.2035, "step": 4768 }, { "epoch": 0.6523939808481533, - "grad_norm": 1.3074882959855099, + "grad_norm": 1.2914614261114261, "learning_rate": 2.696606104637116e-06, - "loss": 0.2006, + "loss": 0.2009, "step": 4769 }, { "epoch": 0.652530779753762, - "grad_norm": 1.5270756586402212, + "grad_norm": 1.495429072549767, "learning_rate": 2.6946990850195107e-06, - "loss": 0.2574, + "loss": 0.256, "step": 4770 }, { "epoch": 0.6526675786593708, - "grad_norm": 1.1423634606348483, + "grad_norm": 1.1246840179156397, "learning_rate": 2.69279249118911e-06, - "loss": 0.1785, + "loss": 0.1797, "step": 4771 }, { "epoch": 0.6528043775649794, - "grad_norm": 1.259831069094572, + "grad_norm": 1.2433985280419166, "learning_rate": 2.690886323498064e-06, - "loss": 0.1761, + "loss": 0.1763, "step": 4772 }, { "epoch": 0.6529411764705882, - "grad_norm": 1.1663311875679951, + "grad_norm": 1.160012173258087, "learning_rate": 2.6889805822984348e-06, - "loss": 0.1983, + "loss": 0.1971, "step": 4773 }, { "epoch": 0.6530779753761969, - "grad_norm": 1.245034841630952, + "grad_norm": 1.2392109067668373, "learning_rate": 2.6870752679422164e-06, - "loss": 0.2107, + "loss": 0.2125, "step": 4774 }, { "epoch": 0.6532147742818057, - "grad_norm": 1.274626928536001, + "grad_norm": 1.2639040196961864, "learning_rate": 2.6851703807813145e-06, - "loss": 0.2251, + "loss": 0.2275, "step": 4775 }, { "epoch": 0.6533515731874145, - "grad_norm": 1.2316905200871289, + "grad_norm": 1.218590688599646, "learning_rate": 2.6832659211675627e-06, - "loss": 0.2036, + "loss": 0.2033, "step": 4776 }, { "epoch": 0.6534883720930232, - "grad_norm": 1.612724788997602, + "grad_norm": 1.5999703074086502, "learning_rate": 2.6813618894527137e-06, - "loss": 0.2497, + "loss": 0.2522, "step": 4777 }, { "epoch": 0.653625170998632, - "grad_norm": 1.1897970884964755, + "grad_norm": 1.1778148098978143, "learning_rate": 2.679458285988439e-06, - "loss": 0.1668, + "loss": 0.1663, "step": 4778 }, { "epoch": 0.6537619699042407, - "grad_norm": 0.9709789082760417, + "grad_norm": 0.970152393973886, "learning_rate": 2.677555111126332e-06, "loss": 0.1356, "step": 4779 }, { "epoch": 0.6538987688098495, - "grad_norm": 1.150319216421453, + "grad_norm": 1.1471233010754127, "learning_rate": 2.67565236521791e-06, - "loss": 0.1794, + "loss": 0.1812, "step": 4780 }, { "epoch": 0.6540355677154582, - "grad_norm": 1.3213000377964739, + "grad_norm": 1.310341209207577, "learning_rate": 2.6737500486146086e-06, - "loss": 0.1868, + "loss": 0.1874, "step": 4781 }, { "epoch": 0.654172366621067, - "grad_norm": 1.2340816079518193, + "grad_norm": 1.232821478345879, "learning_rate": 2.671848161667782e-06, - "loss": 0.1686, + "loss": 0.1691, "step": 4782 }, { "epoch": 0.6543091655266758, - "grad_norm": 1.49331636672002, + "grad_norm": 1.4858770073958039, "learning_rate": 2.6699467047287063e-06, - "loss": 0.1876, + "loss": 0.1875, "step": 4783 }, { "epoch": 0.6544459644322845, - "grad_norm": 1.5214855999205217, + "grad_norm": 1.5101089237138323, "learning_rate": 2.668045678148584e-06, - "loss": 0.2147, + "loss": 0.2119, "step": 4784 }, { "epoch": 0.6545827633378933, - "grad_norm": 1.1811215879703734, + "grad_norm": 1.165941846668799, "learning_rate": 2.6661450822785286e-06, - "loss": 0.177, + "loss": 0.1779, "step": 4785 }, { "epoch": 0.654719562243502, - "grad_norm": 1.1756417141183102, + "grad_norm": 1.1541757000501267, "learning_rate": 2.664244917469581e-06, - "loss": 0.1867, + "loss": 0.1871, "step": 4786 }, { "epoch": 0.6548563611491108, - "grad_norm": 1.3691350564865825, + "grad_norm": 1.34083703477141, "learning_rate": 2.6623451840726978e-06, - "loss": 0.1716, + "loss": 0.1695, "step": 4787 }, { "epoch": 0.6549931600547195, - "grad_norm": 1.0943392484375287, + "grad_norm": 1.0848068689236638, "learning_rate": 2.660445882438761e-06, - "loss": 0.1597, + "loss": 0.1602, "step": 4788 }, { "epoch": 0.6551299589603283, - "grad_norm": 1.1282309242190376, + "grad_norm": 1.1304889977625157, "learning_rate": 2.6585470129185677e-06, - "loss": 0.16, + "loss": 0.1607, "step": 4789 }, { "epoch": 0.655266757865937, - "grad_norm": 1.5297656209913573, + "grad_norm": 1.5204090760935134, "learning_rate": 2.6566485758628397e-06, - "loss": 0.1947, + "loss": 0.1958, "step": 4790 }, { "epoch": 0.6554035567715458, - "grad_norm": 1.2735262708731645, + "grad_norm": 1.2654158121409085, "learning_rate": 2.654750571622214e-06, - "loss": 0.1966, + "loss": 0.1963, "step": 4791 }, { "epoch": 0.6555403556771546, - "grad_norm": 1.2353367572219283, + "grad_norm": 1.2305465929369819, "learning_rate": 2.652853000547252e-06, - "loss": 0.1804, + "loss": 0.1816, "step": 4792 }, { "epoch": 0.6556771545827633, - "grad_norm": 1.2057581489281304, + "grad_norm": 1.2007506977350983, "learning_rate": 2.6509558629884335e-06, - "loss": 0.1751, + "loss": 0.1754, "step": 4793 }, { "epoch": 0.6558139534883721, - "grad_norm": 1.103389170462134, + "grad_norm": 1.0850756331019713, "learning_rate": 2.649059159296158e-06, - "loss": 0.1231, + "loss": 0.1228, "step": 4794 }, { "epoch": 0.6559507523939808, - "grad_norm": 0.9489823516799182, + "grad_norm": 0.9437362983227771, "learning_rate": 2.6471628898207476e-06, "loss": 0.1526, "step": 4795 }, { "epoch": 0.6560875512995896, - "grad_norm": 1.1860759054584895, + "grad_norm": 1.161024839630608, "learning_rate": 2.6452670549124375e-06, - "loss": 0.1684, + "loss": 0.1672, "step": 4796 }, { "epoch": 0.6562243502051983, - "grad_norm": 1.3628987761189064, + "grad_norm": 1.338981574223486, "learning_rate": 2.6433716549213917e-06, - "loss": 0.208, + "loss": 0.2079, "step": 4797 }, { "epoch": 0.6563611491108071, - "grad_norm": 1.489702918497557, + "grad_norm": 1.4593400361363682, "learning_rate": 2.641476690197683e-06, - "loss": 0.2059, + "loss": 0.2045, "step": 4798 }, { "epoch": 0.6564979480164159, - "grad_norm": 1.2414455745986925, + "grad_norm": 1.2338745146020456, "learning_rate": 2.639582161091314e-06, - "loss": 0.1435, + "loss": 0.1442, "step": 4799 }, { "epoch": 0.6566347469220246, - "grad_norm": 1.3461745467859771, + "grad_norm": 1.3128051541002053, "learning_rate": 2.637688067952204e-06, - "loss": 0.1814, + "loss": 0.1815, "step": 4800 }, { "epoch": 0.6566347469220246, - "eval_loss": 0.17766152322292328, - "eval_runtime": 5.9289, - "eval_samples_per_second": 5.06, - "eval_steps_per_second": 1.349, + "eval_loss": 0.17796623706817627, + "eval_runtime": 5.9336, + "eval_samples_per_second": 5.056, + "eval_steps_per_second": 1.348, "step": 4800 }, { "epoch": 0.6567715458276334, - "grad_norm": 1.3296558300787757, + "grad_norm": 1.3207314028860746, "learning_rate": 2.635794411130186e-06, - "loss": 0.2015, + "loss": 0.2, "step": 4801 }, { "epoch": 0.6569083447332421, - "grad_norm": 0.940215442918164, + "grad_norm": 0.9392611086443543, "learning_rate": 2.633901190975021e-06, - "loss": 0.1845, + "loss": 0.1859, "step": 4802 }, { "epoch": 0.6570451436388509, - "grad_norm": 1.273371610375629, + "grad_norm": 1.2537019943122012, "learning_rate": 2.6320084078363816e-06, - "loss": 0.1881, + "loss": 0.1882, "step": 4803 }, { "epoch": 0.6571819425444596, - "grad_norm": 1.226984792301828, + "grad_norm": 1.2227923687777227, "learning_rate": 2.630116062063867e-06, - "loss": 0.1898, + "loss": 0.1904, "step": 4804 }, { "epoch": 0.6573187414500684, - "grad_norm": 1.2537916355773424, + "grad_norm": 1.246181785981904, "learning_rate": 2.6282241540069887e-06, - "loss": 0.1757, + "loss": 0.1745, "step": 4805 }, { "epoch": 0.6574555403556771, - "grad_norm": 1.362110191683818, + "grad_norm": 1.3516451577467872, "learning_rate": 2.626332684015181e-06, - "loss": 0.2022, + "loss": 0.201, "step": 4806 }, { "epoch": 0.6575923392612859, - "grad_norm": 1.1527986247020663, + "grad_norm": 1.1351271411912713, "learning_rate": 2.6244416524378e-06, - "loss": 0.1858, + "loss": 0.1837, "step": 4807 }, { "epoch": 0.6577291381668947, - "grad_norm": 1.4162635436972046, + "grad_norm": 1.3869292493638488, "learning_rate": 2.6225510596241135e-06, - "loss": 0.2097, + "loss": 0.2099, "step": 4808 }, { "epoch": 0.6578659370725034, - "grad_norm": 1.1872279210390577, + "grad_norm": 1.1947267438018543, "learning_rate": 2.6206609059233147e-06, - "loss": 0.1834, + "loss": 0.1835, "step": 4809 }, { "epoch": 0.6580027359781122, - "grad_norm": 1.215492244334972, + "grad_norm": 1.1998895647720993, "learning_rate": 2.6187711916845128e-06, - "loss": 0.1774, + "loss": 0.1768, "step": 4810 }, { "epoch": 0.6581395348837209, - "grad_norm": 1.4083686018566857, + "grad_norm": 1.3773603926475542, "learning_rate": 2.6168819172567393e-06, - "loss": 0.1875, + "loss": 0.1856, "step": 4811 }, { "epoch": 0.6582763337893297, - "grad_norm": 1.2359616564511857, + "grad_norm": 1.2378205363025903, "learning_rate": 2.614993082988937e-06, - "loss": 0.1755, + "loss": 0.1766, "step": 4812 }, { "epoch": 0.6584131326949384, - "grad_norm": 1.0140615539891344, + "grad_norm": 1.0119197946361884, "learning_rate": 2.613104689229976e-06, - "loss": 0.1677, + "loss": 0.1685, "step": 4813 }, { "epoch": 0.6585499316005472, - "grad_norm": 1.1568992823935584, + "grad_norm": 1.163851883389548, "learning_rate": 2.611216736328638e-06, - "loss": 0.1786, + "loss": 0.1785, "step": 4814 }, { "epoch": 0.658686730506156, - "grad_norm": 1.2363121515524245, + "grad_norm": 1.218140498521893, "learning_rate": 2.609329224633627e-06, - "loss": 0.1621, + "loss": 0.161, "step": 4815 }, { "epoch": 0.6588235294117647, - "grad_norm": 1.4465089894856122, + "grad_norm": 1.4321682321163132, "learning_rate": 2.607442154493568e-06, - "loss": 0.2196, + "loss": 0.2212, "step": 4816 }, { "epoch": 0.6589603283173735, - "grad_norm": 1.230208157374261, + "grad_norm": 1.2270688397269753, "learning_rate": 2.605555526256997e-06, - "loss": 0.1964, + "loss": 0.1969, "step": 4817 }, { "epoch": 0.6590971272229822, - "grad_norm": 1.3527935173842092, + "grad_norm": 1.3266127954511058, "learning_rate": 2.603669340272377e-06, - "loss": 0.2257, + "loss": 0.227, "step": 4818 }, { "epoch": 0.659233926128591, - "grad_norm": 1.1339465959794301, + "grad_norm": 1.131786503809923, "learning_rate": 2.60178359688808e-06, - "loss": 0.159, + "loss": 0.1577, "step": 4819 }, { "epoch": 0.6593707250341997, - "grad_norm": 1.1892753160444498, + "grad_norm": 1.1785625344933879, "learning_rate": 2.5998982964524063e-06, - "loss": 0.1714, + "loss": 0.1721, "step": 4820 }, { "epoch": 0.6595075239398085, - "grad_norm": 1.1602247636021163, + "grad_norm": 1.1594238523062694, "learning_rate": 2.598013439313565e-06, - "loss": 0.1573, + "loss": 0.1576, "step": 4821 }, { "epoch": 0.6596443228454172, - "grad_norm": 1.1797483129874218, + "grad_norm": 1.168182481690021, "learning_rate": 2.596129025819689e-06, - "loss": 0.1691, + "loss": 0.1693, "step": 4822 }, { "epoch": 0.659781121751026, - "grad_norm": 1.1785918702237266, + "grad_norm": 1.1702876753313223, "learning_rate": 2.594245056318829e-06, - "loss": 0.1695, + "loss": 0.1711, "step": 4823 }, { "epoch": 0.6599179206566348, - "grad_norm": 1.295359673549927, + "grad_norm": 1.2786594444737753, "learning_rate": 2.5923615311589524e-06, - "loss": 0.1718, + "loss": 0.1715, "step": 4824 }, { "epoch": 0.6600547195622435, - "grad_norm": 1.1860649781768307, + "grad_norm": 1.1671927668865179, "learning_rate": 2.5904784506879466e-06, - "loss": 0.1609, + "loss": 0.1588, "step": 4825 }, { "epoch": 0.6601915184678523, - "grad_norm": 1.2425399759932645, + "grad_norm": 1.2500548240431373, "learning_rate": 2.5885958152536104e-06, - "loss": 0.1805, + "loss": 0.1813, "step": 4826 }, { "epoch": 0.660328317373461, - "grad_norm": 1.3436310229101, + "grad_norm": 1.3366298467220943, "learning_rate": 2.5867136252036707e-06, - "loss": 0.1791, + "loss": 0.1801, "step": 4827 }, { "epoch": 0.6604651162790698, - "grad_norm": 1.3151558014205664, + "grad_norm": 1.3102970828016074, "learning_rate": 2.584831880885761e-06, - "loss": 0.1518, + "loss": 0.1526, "step": 4828 }, { "epoch": 0.6606019151846785, - "grad_norm": 1.1687363579051147, + "grad_norm": 1.141357657756503, "learning_rate": 2.582950582647442e-06, - "loss": 0.1792, + "loss": 0.1767, "step": 4829 }, { "epoch": 0.6607387140902873, - "grad_norm": 1.3220000747907668, + "grad_norm": 1.311958175637211, "learning_rate": 2.5810697308361853e-06, - "loss": 0.1795, + "loss": 0.1788, "step": 4830 }, { "epoch": 0.6608755129958961, - "grad_norm": 1.1051872485209702, + "grad_norm": 1.091785321284391, "learning_rate": 2.5791893257993835e-06, - "loss": 0.1651, + "loss": 0.1647, "step": 4831 }, { "epoch": 0.6610123119015048, - "grad_norm": 1.3181061361390378, + "grad_norm": 1.3228272654296274, "learning_rate": 2.5773093678843474e-06, - "loss": 0.2272, + "loss": 0.2285, "step": 4832 }, { "epoch": 0.6611491108071136, - "grad_norm": 1.245773376808545, + "grad_norm": 1.207799355932776, "learning_rate": 2.5754298574383007e-06, - "loss": 0.1926, + "loss": 0.1936, "step": 4833 }, { "epoch": 0.6612859097127223, - "grad_norm": 1.4225973283081208, + "grad_norm": 1.4134913258185244, "learning_rate": 2.5735507948083916e-06, - "loss": 0.1878, + "loss": 0.1888, "step": 4834 }, { "epoch": 0.6614227086183311, - "grad_norm": 1.5216841221940738, + "grad_norm": 1.5035281065785528, "learning_rate": 2.5716721803416768e-06, - "loss": 0.2325, + "loss": 0.2318, "step": 4835 }, { "epoch": 0.6615595075239398, - "grad_norm": 0.9796702476110964, + "grad_norm": 0.9671347537722482, "learning_rate": 2.5697940143851375e-06, - "loss": 0.1374, + "loss": 0.1386, "step": 4836 }, { "epoch": 0.6616963064295486, - "grad_norm": 1.097191564982397, + "grad_norm": 1.0952193949229312, "learning_rate": 2.567916297285669e-06, - "loss": 0.1926, + "loss": 0.1914, "step": 4837 }, { "epoch": 0.6618331053351573, - "grad_norm": 1.085805016706396, + "grad_norm": 1.0828082620396529, "learning_rate": 2.566039029390085e-06, - "loss": 0.1497, + "loss": 0.15, "step": 4838 }, { "epoch": 0.6619699042407661, - "grad_norm": 1.0173803608022471, + "grad_norm": 0.9984247040389589, "learning_rate": 2.5641622110451136e-06, - "loss": 0.1516, + "loss": 0.1524, "step": 4839 }, { "epoch": 0.6621067031463749, - "grad_norm": 1.3277693570932971, + "grad_norm": 1.3051539554474978, "learning_rate": 2.562285842597402e-06, - "loss": 0.237, + "loss": 0.2376, "step": 4840 }, { "epoch": 0.6622435020519836, - "grad_norm": 1.4907787721046188, + "grad_norm": 1.4948614617855607, "learning_rate": 2.5604099243935163e-06, - "loss": 0.2217, + "loss": 0.2216, "step": 4841 }, { "epoch": 0.6623803009575924, - "grad_norm": 1.3710496815873783, + "grad_norm": 1.365090617935389, "learning_rate": 2.5585344567799337e-06, - "loss": 0.2274, + "loss": 0.2276, "step": 4842 }, { "epoch": 0.6625170998632011, - "grad_norm": 1.2346544871376939, + "grad_norm": 1.2172488453421737, "learning_rate": 2.556659440103055e-06, - "loss": 0.1549, + "loss": 0.1535, "step": 4843 }, { "epoch": 0.6626538987688099, - "grad_norm": 1.3949360750450426, + "grad_norm": 1.3755470110157932, "learning_rate": 2.55478487470919e-06, - "loss": 0.2239, + "loss": 0.2246, "step": 4844 }, { "epoch": 0.6627906976744186, - "grad_norm": 1.4240030434783544, + "grad_norm": 1.4173660035284035, "learning_rate": 2.5529107609445737e-06, - "loss": 0.1977, + "loss": 0.1974, "step": 4845 }, { "epoch": 0.6629274965800274, - "grad_norm": 1.1182373187236048, + "grad_norm": 1.11651766978692, "learning_rate": 2.5510370991553503e-06, - "loss": 0.173, + "loss": 0.1741, "step": 4846 }, { "epoch": 0.6630642954856362, - "grad_norm": 1.3314879956291132, + "grad_norm": 1.3294954013931939, "learning_rate": 2.5491638896875835e-06, - "loss": 0.178, + "loss": 0.1777, "step": 4847 }, { "epoch": 0.6632010943912449, - "grad_norm": 1.2362105637705236, + "grad_norm": 1.2175911504957975, "learning_rate": 2.5472911328872574e-06, - "loss": 0.163, + "loss": 0.1628, "step": 4848 }, { "epoch": 0.6633378932968537, - "grad_norm": 1.144328885880804, + "grad_norm": 1.1228420655911282, "learning_rate": 2.545418829100264e-06, - "loss": 0.1771, + "loss": 0.1759, "step": 4849 }, { "epoch": 0.6634746922024624, - "grad_norm": 1.0041277593039208, + "grad_norm": 0.9945921556334596, "learning_rate": 2.5435469786724203e-06, - "loss": 0.1625, + "loss": 0.1608, "step": 4850 }, { "epoch": 0.6636114911080712, - "grad_norm": 1.365850890425173, + "grad_norm": 1.3360119191464819, "learning_rate": 2.541675581949452e-06, - "loss": 0.187, + "loss": 0.1851, "step": 4851 }, { "epoch": 0.6637482900136799, - "grad_norm": 1.1424750282810874, + "grad_norm": 1.1422402714951105, "learning_rate": 2.5398046392770054e-06, - "loss": 0.1605, + "loss": 0.1611, "step": 4852 }, { "epoch": 0.6638850889192887, - "grad_norm": 1.2666605599791436, + "grad_norm": 1.2667898428041993, "learning_rate": 2.5379341510006435e-06, - "loss": 0.2085, + "loss": 0.2101, "step": 4853 }, { "epoch": 0.6640218878248973, - "grad_norm": 1.151546818085437, + "grad_norm": 1.150719573635271, "learning_rate": 2.536064117465845e-06, - "loss": 0.1487, + "loss": 0.1491, "step": 4854 }, { "epoch": 0.6641586867305062, - "grad_norm": 1.1467500319309503, + "grad_norm": 1.1339266089865865, "learning_rate": 2.5341945390179995e-06, - "loss": 0.1789, + "loss": 0.1782, "step": 4855 }, { "epoch": 0.664295485636115, - "grad_norm": 1.2332845075785779, + "grad_norm": 1.2189519528433004, "learning_rate": 2.5323254160024193e-06, - "loss": 0.1897, + "loss": 0.1883, "step": 4856 }, { "epoch": 0.6644322845417237, - "grad_norm": 1.2959105911661537, + "grad_norm": 1.2699746123935554, "learning_rate": 2.5304567487643316e-06, - "loss": 0.1786, + "loss": 0.1762, "step": 4857 }, { "epoch": 0.6645690834473325, - "grad_norm": 1.2223902900993218, + "grad_norm": 1.2176809716506192, "learning_rate": 2.528588537648874e-06, - "loss": 0.172, + "loss": 0.1726, "step": 4858 }, { "epoch": 0.6647058823529411, - "grad_norm": 1.4458905352447935, + "grad_norm": 1.4262531203562536, "learning_rate": 2.526720783001107e-06, - "loss": 0.1997, + "loss": 0.1968, "step": 4859 }, { "epoch": 0.66484268125855, - "grad_norm": 1.1183111717961012, + "grad_norm": 1.1119548945901165, "learning_rate": 2.524853485166e-06, - "loss": 0.1534, + "loss": 0.154, "step": 4860 }, { "epoch": 0.6649794801641586, - "grad_norm": 1.1755687909149077, + "grad_norm": 1.1919257815991122, "learning_rate": 2.522986644488444e-06, - "loss": 0.1558, + "loss": 0.1571, "step": 4861 }, { "epoch": 0.6651162790697674, - "grad_norm": 1.327030239220227, + "grad_norm": 1.3371861126448892, "learning_rate": 2.5211202613132413e-06, - "loss": 0.1853, + "loss": 0.1874, "step": 4862 }, { "epoch": 0.6652530779753763, - "grad_norm": 1.208866421152892, + "grad_norm": 1.1961620558104644, "learning_rate": 2.519254335985112e-06, - "loss": 0.1601, + "loss": 0.1599, "step": 4863 }, { "epoch": 0.6653898768809849, - "grad_norm": 1.0748002095028464, + "grad_norm": 1.0624681470502408, "learning_rate": 2.517388868848692e-06, - "loss": 0.176, + "loss": 0.1743, "step": 4864 }, { "epoch": 0.6655266757865937, - "grad_norm": 1.4766090745390774, + "grad_norm": 1.4515352457484931, "learning_rate": 2.515523860248529e-06, - "loss": 0.1817, + "loss": 0.1822, "step": 4865 }, { "epoch": 0.6656634746922024, - "grad_norm": 1.1750497018236383, + "grad_norm": 1.1590641834188684, "learning_rate": 2.513659310529091e-06, - "loss": 0.1727, + "loss": 0.1722, "step": 4866 }, { "epoch": 0.6658002735978112, - "grad_norm": 1.1923029982485878, + "grad_norm": 1.191801917610696, "learning_rate": 2.5117952200347574e-06, - "loss": 0.1847, + "loss": 0.1872, "step": 4867 }, { "epoch": 0.6659370725034199, - "grad_norm": 1.2148977114636872, + "grad_norm": 1.193633086357466, "learning_rate": 2.5099315891098263e-06, - "loss": 0.1755, + "loss": 0.1756, "step": 4868 }, { "epoch": 0.6660738714090287, - "grad_norm": 1.3627149271424137, + "grad_norm": 1.362982270287415, "learning_rate": 2.508068418098506e-06, - "loss": 0.1928, + "loss": 0.1913, "step": 4869 }, { "epoch": 0.6662106703146374, - "grad_norm": 1.2473155547060226, + "grad_norm": 1.2402691599541613, "learning_rate": 2.5062057073449252e-06, - "loss": 0.2088, + "loss": 0.2063, "step": 4870 }, { "epoch": 0.6663474692202462, - "grad_norm": 1.349876750390011, + "grad_norm": 1.3463672937799356, "learning_rate": 2.504343457193123e-06, - "loss": 0.2119, + "loss": 0.2118, "step": 4871 }, { "epoch": 0.666484268125855, - "grad_norm": 1.304385638785297, + "grad_norm": 1.2971589985452572, "learning_rate": 2.502481667987056e-06, - "loss": 0.1722, + "loss": 0.1729, "step": 4872 }, { "epoch": 0.6666210670314637, - "grad_norm": 1.3708689711916797, + "grad_norm": 1.3067706811396484, "learning_rate": 2.5006203400705976e-06, - "loss": 0.1986, + "loss": 0.1955, "step": 4873 }, { "epoch": 0.6667578659370725, - "grad_norm": 1.234314466131691, + "grad_norm": 1.226385230908909, "learning_rate": 2.498759473787529e-06, - "loss": 0.1516, + "loss": 0.1502, "step": 4874 }, { "epoch": 0.6668946648426812, - "grad_norm": 1.408683670303967, + "grad_norm": 1.3971833850781596, "learning_rate": 2.496899069481555e-06, - "loss": 0.1911, + "loss": 0.1923, "step": 4875 }, { "epoch": 0.66703146374829, - "grad_norm": 1.152703628388308, + "grad_norm": 1.1417451521662887, "learning_rate": 2.495039127496287e-06, - "loss": 0.1866, + "loss": 0.1857, "step": 4876 }, { "epoch": 0.6671682626538987, - "grad_norm": 1.4746986505510529, + "grad_norm": 1.443246414987754, "learning_rate": 2.493179648175259e-06, - "loss": 0.1778, + "loss": 0.1767, "step": 4877 }, { "epoch": 0.6673050615595075, - "grad_norm": 1.410800649016814, + "grad_norm": 1.4002970713880847, "learning_rate": 2.491320631861911e-06, - "loss": 0.204, + "loss": 0.2029, "step": 4878 }, { "epoch": 0.6674418604651163, - "grad_norm": 1.124045460886258, + "grad_norm": 1.1126753971355696, "learning_rate": 2.489462078899604e-06, - "loss": 0.1659, + "loss": 0.1656, "step": 4879 }, { "epoch": 0.667578659370725, - "grad_norm": 1.0164142614795484, + "grad_norm": 1.0121603567364603, "learning_rate": 2.4876039896316122e-06, "loss": 0.1611, "step": 4880 }, { "epoch": 0.6677154582763338, - "grad_norm": 1.4865995591405174, + "grad_norm": 1.4638178727840399, "learning_rate": 2.4857463644011207e-06, - "loss": 0.2157, + "loss": 0.2162, "step": 4881 }, { "epoch": 0.6678522571819425, - "grad_norm": 1.065549486092714, + "grad_norm": 1.052321770249507, "learning_rate": 2.483889203551233e-06, - "loss": 0.1841, + "loss": 0.1847, "step": 4882 }, { "epoch": 0.6679890560875513, - "grad_norm": 1.434556308954014, + "grad_norm": 1.4404377550951155, "learning_rate": 2.482032507424964e-06, - "loss": 0.2141, + "loss": 0.2143, "step": 4883 }, { "epoch": 0.66812585499316, - "grad_norm": 1.2658987470205227, + "grad_norm": 1.3032543857503804, "learning_rate": 2.4801762763652477e-06, - "loss": 0.1537, + "loss": 0.1558, "step": 4884 }, { "epoch": 0.6682626538987688, - "grad_norm": 1.3935967704686913, + "grad_norm": 1.3746531213225075, "learning_rate": 2.4783205107149226e-06, - "loss": 0.1679, + "loss": 0.1678, "step": 4885 }, { "epoch": 0.6683994528043775, - "grad_norm": 1.4125246795253974, + "grad_norm": 1.4115161591646561, "learning_rate": 2.476465210816753e-06, - "loss": 0.2106, + "loss": 0.2088, "step": 4886 }, { "epoch": 0.6685362517099863, - "grad_norm": 1.2459025997649829, + "grad_norm": 1.2423056810727127, "learning_rate": 2.4746103770134062e-06, - "loss": 0.1528, + "loss": 0.154, "step": 4887 }, { "epoch": 0.6686730506155951, - "grad_norm": 0.9559087880927244, + "grad_norm": 0.9457360088193327, "learning_rate": 2.4727560096474706e-06, - "loss": 0.1459, + "loss": 0.1456, "step": 4888 }, { "epoch": 0.6688098495212038, - "grad_norm": 1.4168599159149595, + "grad_norm": 1.4115092615408278, "learning_rate": 2.4709021090614483e-06, "loss": 0.1807, "step": 4889 }, { "epoch": 0.6689466484268126, - "grad_norm": 1.2080433063712934, + "grad_norm": 1.198974026008207, "learning_rate": 2.4690486755977495e-06, - "loss": 0.1921, + "loss": 0.1913, "step": 4890 }, { "epoch": 0.6690834473324213, - "grad_norm": 1.374938328801805, + "grad_norm": 1.3644907827592798, "learning_rate": 2.467195709598706e-06, - "loss": 0.1759, + "loss": 0.1743, "step": 4891 }, { "epoch": 0.6692202462380301, - "grad_norm": 1.3482468793229443, + "grad_norm": 1.3376906054017277, "learning_rate": 2.4653432114065547e-06, "loss": 0.2102, "step": 4892 }, { "epoch": 0.6693570451436388, - "grad_norm": 1.3834671204778688, + "grad_norm": 1.3877621185394984, "learning_rate": 2.4634911813634545e-06, - "loss": 0.2065, + "loss": 0.2062, "step": 4893 }, { "epoch": 0.6694938440492476, - "grad_norm": 1.183077315627524, + "grad_norm": 1.1688266492274813, "learning_rate": 2.46163961981147e-06, - "loss": 0.2008, + "loss": 0.2022, "step": 4894 }, { "epoch": 0.6696306429548564, - "grad_norm": 1.0770274877973398, + "grad_norm": 1.0690094920775943, "learning_rate": 2.4597885270925847e-06, - "loss": 0.152, + "loss": 0.1515, "step": 4895 }, { "epoch": 0.6697674418604651, - "grad_norm": 1.1635012225203665, + "grad_norm": 1.145062442917028, "learning_rate": 2.457937903548695e-06, - "loss": 0.162, + "loss": 0.1614, "step": 4896 }, { "epoch": 0.6699042407660739, - "grad_norm": 1.1744941719519257, + "grad_norm": 1.1663580171871069, "learning_rate": 2.456087749521609e-06, - "loss": 0.1671, + "loss": 0.1661, "step": 4897 }, { "epoch": 0.6700410396716826, - "grad_norm": 1.253797442641249, + "grad_norm": 1.2488559504900092, "learning_rate": 2.45423806535305e-06, - "loss": 0.213, + "loss": 0.2138, "step": 4898 }, { "epoch": 0.6701778385772914, - "grad_norm": 1.3528010576532767, + "grad_norm": 1.3282003232119057, "learning_rate": 2.45238885138465e-06, - "loss": 0.2455, + "loss": 0.2425, "step": 4899 }, { "epoch": 0.6703146374829001, - "grad_norm": 1.5283765234913687, + "grad_norm": 1.5214960100940687, "learning_rate": 2.4505401079579612e-06, - "loss": 0.2276, + "loss": 0.2287, "step": 4900 }, { "epoch": 0.6703146374829001, - "eval_loss": 0.17813456058502197, - "eval_runtime": 5.9121, - "eval_samples_per_second": 5.074, - "eval_steps_per_second": 1.353, + "eval_loss": 0.17810727655887604, + "eval_runtime": 5.9152, + "eval_samples_per_second": 5.072, + "eval_steps_per_second": 1.352, "step": 4900 }, { "epoch": 0.6704514363885089, - "grad_norm": 1.4577872105246095, + "grad_norm": 1.458421952561363, "learning_rate": 2.44869183541444e-06, - "loss": 0.1985, + "loss": 0.2021, "step": 4901 }, { "epoch": 0.6705882352941176, - "grad_norm": 1.0182374636376132, + "grad_norm": 1.0079016952950712, "learning_rate": 2.4468440340954664e-06, - "loss": 0.1531, + "loss": 0.154, "step": 4902 }, { "epoch": 0.6707250341997264, - "grad_norm": 1.3740043218173401, + "grad_norm": 1.3789832626111307, "learning_rate": 2.4449967043423226e-06, - "loss": 0.2015, + "loss": 0.2029, "step": 4903 }, { "epoch": 0.6708618331053352, - "grad_norm": 1.2615428669030526, + "grad_norm": 1.2476134878244853, "learning_rate": 2.4431498464962124e-06, - "loss": 0.2027, + "loss": 0.202, "step": 4904 }, { "epoch": 0.6709986320109439, - "grad_norm": 1.2093654476309137, + "grad_norm": 1.1976925732293275, "learning_rate": 2.441303460898249e-06, - "loss": 0.1671, + "loss": 0.1679, "step": 4905 }, { "epoch": 0.6711354309165527, - "grad_norm": 1.3508774358056153, + "grad_norm": 1.3321720773045178, "learning_rate": 2.4394575478894557e-06, - "loss": 0.2014, + "loss": 0.1993, "step": 4906 }, { "epoch": 0.6712722298221614, - "grad_norm": 0.9834510502402257, + "grad_norm": 0.9726804960846838, "learning_rate": 2.437612107810774e-06, - "loss": 0.141, + "loss": 0.1402, "step": 4907 }, { "epoch": 0.6714090287277702, - "grad_norm": 1.175219420612827, + "grad_norm": 1.166696238065039, "learning_rate": 2.4357671410030525e-06, - "loss": 0.1683, + "loss": 0.1681, "step": 4908 }, { "epoch": 0.6715458276333789, - "grad_norm": 1.0059060892172549, + "grad_norm": 1.0160912013343573, "learning_rate": 2.4339226478070566e-06, - "loss": 0.1674, + "loss": 0.1696, "step": 4909 }, { "epoch": 0.6716826265389877, - "grad_norm": 1.0840864127485046, + "grad_norm": 1.0858722624937087, "learning_rate": 2.432078628563464e-06, - "loss": 0.1435, + "loss": 0.143, "step": 4910 }, { "epoch": 0.6718194254445965, - "grad_norm": 1.3930020439550872, + "grad_norm": 1.3851193363266971, "learning_rate": 2.43023508361286e-06, - "loss": 0.2046, + "loss": 0.2031, "step": 4911 }, { "epoch": 0.6719562243502052, - "grad_norm": 1.2969825437150335, + "grad_norm": 1.27752461063065, "learning_rate": 2.4283920132957484e-06, - "loss": 0.2196, + "loss": 0.2211, "step": 4912 }, { "epoch": 0.672093023255814, - "grad_norm": 1.2020641851265126, + "grad_norm": 1.2106910046372767, "learning_rate": 2.4265494179525423e-06, - "loss": 0.1687, + "loss": 0.1686, "step": 4913 }, { "epoch": 0.6722298221614227, - "grad_norm": 1.0410590043298542, + "grad_norm": 1.0254331156898404, "learning_rate": 2.424707297923569e-06, - "loss": 0.1444, + "loss": 0.1429, "step": 4914 }, { "epoch": 0.6723666210670315, - "grad_norm": 1.0963969766027086, + "grad_norm": 1.0747979580080564, "learning_rate": 2.4228656535490636e-06, - "loss": 0.1646, + "loss": 0.1621, "step": 4915 }, { "epoch": 0.6725034199726402, - "grad_norm": 0.9681074385562591, + "grad_norm": 0.9568963946936622, "learning_rate": 2.42102448516918e-06, - "loss": 0.1448, + "loss": 0.1433, "step": 4916 }, { "epoch": 0.672640218878249, - "grad_norm": 1.221566651929673, + "grad_norm": 1.213489183131025, "learning_rate": 2.419183793123976e-06, - "loss": 0.1765, + "loss": 0.1766, "step": 4917 }, { "epoch": 0.6727770177838577, - "grad_norm": 1.3117124402427067, + "grad_norm": 1.2926928266718967, "learning_rate": 2.4173435777534304e-06, - "loss": 0.1987, + "loss": 0.1983, "step": 4918 }, { "epoch": 0.6729138166894665, - "grad_norm": 1.3118721959728357, + "grad_norm": 1.3064936009555626, "learning_rate": 2.415503839397426e-06, - "loss": 0.1678, + "loss": 0.166, "step": 4919 }, { "epoch": 0.6730506155950753, - "grad_norm": 1.1525925497234863, + "grad_norm": 1.1543288632599618, "learning_rate": 2.4136645783957612e-06, - "loss": 0.1949, + "loss": 0.1939, "step": 4920 }, { "epoch": 0.673187414500684, - "grad_norm": 1.157602255056764, + "grad_norm": 1.150885383164371, "learning_rate": 2.4118257950881495e-06, - "loss": 0.1974, + "loss": 0.1977, "step": 4921 }, { "epoch": 0.6733242134062928, - "grad_norm": 1.1648846757246796, + "grad_norm": 1.1545160513336912, "learning_rate": 2.4099874898142088e-06, - "loss": 0.1652, + "loss": 0.1656, "step": 4922 }, { "epoch": 0.6734610123119015, - "grad_norm": 1.392535362336739, + "grad_norm": 1.3999768011822655, "learning_rate": 2.4081496629134766e-06, - "loss": 0.2066, + "loss": 0.2098, "step": 4923 }, { "epoch": 0.6735978112175103, - "grad_norm": 1.3095761924800984, + "grad_norm": 1.2955837070248695, "learning_rate": 2.4063123147253924e-06, - "loss": 0.2, + "loss": 0.1998, "step": 4924 }, { "epoch": 0.673734610123119, - "grad_norm": 1.226189928300003, + "grad_norm": 1.2137469332615047, "learning_rate": 2.404475445589317e-06, - "loss": 0.1775, + "loss": 0.1764, "step": 4925 }, { "epoch": 0.6738714090287278, - "grad_norm": 1.3029260127656086, + "grad_norm": 1.3142141943419858, "learning_rate": 2.4026390558445177e-06, - "loss": 0.1847, + "loss": 0.1829, "step": 4926 }, { "epoch": 0.6740082079343366, - "grad_norm": 1.0867320110738388, + "grad_norm": 1.0676432592241771, "learning_rate": 2.400803145830174e-06, - "loss": 0.169, + "loss": 0.1682, "step": 4927 }, { "epoch": 0.6741450068399453, - "grad_norm": 1.0117525030942895, + "grad_norm": 0.9973493291513608, "learning_rate": 2.398967715885379e-06, - "loss": 0.1721, + "loss": 0.1733, "step": 4928 }, { "epoch": 0.6742818057455541, - "grad_norm": 1.4064752930604991, + "grad_norm": 1.3891481509058066, "learning_rate": 2.3971327663491317e-06, - "loss": 0.1659, + "loss": 0.1675, "step": 4929 }, { "epoch": 0.6744186046511628, - "grad_norm": 1.1841302481613862, + "grad_norm": 1.1902127846206028, "learning_rate": 2.3952982975603494e-06, - "loss": 0.1416, + "loss": 0.1422, "step": 4930 }, { "epoch": 0.6745554035567716, - "grad_norm": 1.0747221418188244, + "grad_norm": 1.0688382751216072, "learning_rate": 2.393464309857853e-06, - "loss": 0.1503, + "loss": 0.1515, "step": 4931 }, { "epoch": 0.6746922024623803, - "grad_norm": 1.129207420681873, + "grad_norm": 1.1256734481846637, "learning_rate": 2.391630803580382e-06, - "loss": 0.1822, + "loss": 0.1813, "step": 4932 }, { "epoch": 0.6748290013679891, - "grad_norm": 1.469975845978556, + "grad_norm": 1.4361712580184556, "learning_rate": 2.389797779066581e-06, - "loss": 0.1873, + "loss": 0.1874, "step": 4933 }, { "epoch": 0.6749658002735978, - "grad_norm": 1.3781092877458403, + "grad_norm": 1.3752209013651704, "learning_rate": 2.3879652366550105e-06, - "loss": 0.2183, + "loss": 0.2188, "step": 4934 }, { "epoch": 0.6751025991792066, - "grad_norm": 1.2914276858191973, + "grad_norm": 1.2927622185460048, "learning_rate": 2.3861331766841366e-06, - "loss": 0.1758, + "loss": 0.1759, "step": 4935 }, { "epoch": 0.6752393980848154, - "grad_norm": 1.4713591752744486, + "grad_norm": 1.4556887041336735, "learning_rate": 2.3843015994923415e-06, - "loss": 0.236, + "loss": 0.2368, "step": 4936 }, { "epoch": 0.6753761969904241, - "grad_norm": 1.1076962546871676, + "grad_norm": 1.094685958340429, "learning_rate": 2.3824705054179164e-06, - "loss": 0.1482, + "loss": 0.149, "step": 4937 }, { "epoch": 0.6755129958960329, - "grad_norm": 1.2266181911410654, + "grad_norm": 1.2050008534283807, "learning_rate": 2.380639894799061e-06, - "loss": 0.194, + "loss": 0.1926, "step": 4938 }, { "epoch": 0.6756497948016416, - "grad_norm": 1.2622446270587473, + "grad_norm": 1.2452023767941993, "learning_rate": 2.378809767973888e-06, - "loss": 0.1766, + "loss": 0.1757, "step": 4939 }, { "epoch": 0.6757865937072504, - "grad_norm": 1.3473926300480736, + "grad_norm": 1.3173898947841558, "learning_rate": 2.3769801252804213e-06, - "loss": 0.196, + "loss": 0.1958, "step": 4940 }, { "epoch": 0.675923392612859, - "grad_norm": 1.4339937937568823, + "grad_norm": 1.4081767259363696, "learning_rate": 2.375150967056596e-06, - "loss": 0.2401, + "loss": 0.2394, "step": 4941 }, { "epoch": 0.6760601915184679, - "grad_norm": 1.485839527797769, + "grad_norm": 1.4955198933342795, "learning_rate": 2.3733222936402524e-06, - "loss": 0.214, + "loss": 0.216, "step": 4942 }, { "epoch": 0.6761969904240767, - "grad_norm": 1.129789918885452, + "grad_norm": 1.1077259409164655, "learning_rate": 2.3714941053691465e-06, - "loss": 0.1731, + "loss": 0.1727, "step": 4943 }, { "epoch": 0.6763337893296854, - "grad_norm": 1.129288229448515, + "grad_norm": 1.0988674547141153, "learning_rate": 2.369666402580946e-06, - "loss": 0.181, + "loss": 0.1804, "step": 4944 }, { "epoch": 0.6764705882352942, - "grad_norm": 1.3132273415602433, + "grad_norm": 1.302335469695983, "learning_rate": 2.3678391856132203e-06, - "loss": 0.1714, + "loss": 0.1687, "step": 4945 }, { "epoch": 0.6766073871409028, - "grad_norm": 1.2460846176618443, + "grad_norm": 1.2469160013767577, "learning_rate": 2.3660124548034615e-06, - "loss": 0.1762, + "loss": 0.1773, "step": 4946 }, { "epoch": 0.6767441860465117, - "grad_norm": 1.2217670419183828, + "grad_norm": 1.2124652279685804, "learning_rate": 2.3641862104890594e-06, - "loss": 0.1464, + "loss": 0.146, "step": 4947 }, { "epoch": 0.6768809849521203, - "grad_norm": 1.2083959249709755, + "grad_norm": 1.1877230633382996, "learning_rate": 2.3623604530073248e-06, - "loss": 0.1663, + "loss": 0.1641, "step": 4948 }, { "epoch": 0.6770177838577291, - "grad_norm": 1.360277043508187, + "grad_norm": 1.3307636964775296, "learning_rate": 2.3605351826954687e-06, - "loss": 0.225, + "loss": 0.2243, "step": 4949 }, { "epoch": 0.6771545827633378, - "grad_norm": 0.9125206356416365, + "grad_norm": 0.9188156880922211, "learning_rate": 2.3587103998906218e-06, - "loss": 0.1298, + "loss": 0.13, "step": 4950 }, { "epoch": 0.6772913816689466, - "grad_norm": 1.2362771437028248, + "grad_norm": 1.2227621295584552, "learning_rate": 2.356886104929816e-06, - "loss": 0.1733, + "loss": 0.1719, "step": 4951 }, { "epoch": 0.6774281805745554, - "grad_norm": 1.3220520056293321, + "grad_norm": 1.3085595985999445, "learning_rate": 2.355062298149999e-06, - "loss": 0.1736, + "loss": 0.1739, "step": 4952 }, { "epoch": 0.6775649794801641, - "grad_norm": 1.321662517566648, + "grad_norm": 1.3198190187048293, "learning_rate": 2.353238979888028e-06, - "loss": 0.2221, + "loss": 0.2229, "step": 4953 }, { "epoch": 0.677701778385773, - "grad_norm": 1.3380286869409126, + "grad_norm": 1.3421235713832302, "learning_rate": 2.351416150480665e-06, - "loss": 0.1926, + "loss": 0.1949, "step": 4954 }, { "epoch": 0.6778385772913816, - "grad_norm": 1.407835114776776, + "grad_norm": 1.3828594773424885, "learning_rate": 2.3495938102645865e-06, - "loss": 0.2171, + "loss": 0.2152, "step": 4955 }, { "epoch": 0.6779753761969904, - "grad_norm": 1.1988592465003096, + "grad_norm": 1.195492675156494, "learning_rate": 2.3477719595763778e-06, - "loss": 0.1707, + "loss": 0.1726, "step": 4956 }, { "epoch": 0.6781121751025991, - "grad_norm": 1.0964858220958558, + "grad_norm": 1.0859442752933146, "learning_rate": 2.3459505987525348e-06, - "loss": 0.1789, + "loss": 0.1803, "step": 4957 }, { "epoch": 0.6782489740082079, - "grad_norm": 1.1549581664163135, + "grad_norm": 1.1592499620252734, "learning_rate": 2.344129728129458e-06, - "loss": 0.1969, + "loss": 0.1986, "step": 4958 }, { "epoch": 0.6783857729138167, - "grad_norm": 1.2309871499866738, + "grad_norm": 1.2315615207408817, "learning_rate": 2.3423093480434617e-06, - "loss": 0.1462, + "loss": 0.1451, "step": 4959 }, { "epoch": 0.6785225718194254, - "grad_norm": 1.1332882310718329, + "grad_norm": 1.1209202941524035, "learning_rate": 2.340489458830772e-06, - "loss": 0.1759, + "loss": 0.1755, "step": 4960 }, { "epoch": 0.6786593707250342, - "grad_norm": 1.1441306210832836, + "grad_norm": 1.124960775091579, "learning_rate": 2.338670060827515e-06, - "loss": 0.1482, + "loss": 0.146, "step": 4961 }, { "epoch": 0.6787961696306429, - "grad_norm": 1.189038214459137, + "grad_norm": 1.177234737596389, "learning_rate": 2.336851154369739e-06, - "loss": 0.1654, + "loss": 0.1648, "step": 4962 }, { "epoch": 0.6789329685362517, - "grad_norm": 1.1541804606910868, + "grad_norm": 1.1449907298922914, "learning_rate": 2.3350327397933873e-06, - "loss": 0.1811, + "loss": 0.1799, "step": 4963 }, { "epoch": 0.6790697674418604, - "grad_norm": 0.8365973911070362, + "grad_norm": 0.8352071159675597, "learning_rate": 2.3332148174343257e-06, - "loss": 0.1354, + "loss": 0.1352, "step": 4964 }, { "epoch": 0.6792065663474692, - "grad_norm": 1.2230417666975861, + "grad_norm": 1.21896564117684, "learning_rate": 2.3313973876283186e-06, - "loss": 0.1619, + "loss": 0.1611, "step": 4965 }, { "epoch": 0.6793433652530779, - "grad_norm": 1.2653933715484103, + "grad_norm": 1.2709456464425393, "learning_rate": 2.329580450711047e-06, - "loss": 0.1736, + "loss": 0.1748, "step": 4966 }, { "epoch": 0.6794801641586867, - "grad_norm": 1.2353872370645422, + "grad_norm": 1.253503984090524, "learning_rate": 2.3277640070180947e-06, - "loss": 0.1896, + "loss": 0.1905, "step": 4967 }, { "epoch": 0.6796169630642955, - "grad_norm": 1.6122900665442208, + "grad_norm": 1.596371651605194, "learning_rate": 2.3259480568849583e-06, - "loss": 0.2436, + "loss": 0.245, "step": 4968 }, { "epoch": 0.6797537619699042, - "grad_norm": 1.2124930945114742, + "grad_norm": 1.2057165712679168, "learning_rate": 2.324132600647044e-06, - "loss": 0.1717, + "loss": 0.1721, "step": 4969 }, { "epoch": 0.679890560875513, - "grad_norm": 1.3408109394002414, + "grad_norm": 1.3343557145149867, "learning_rate": 2.3223176386396628e-06, - "loss": 0.2043, + "loss": 0.2049, "step": 4970 }, { "epoch": 0.6800273597811217, - "grad_norm": 1.3034618577539288, + "grad_norm": 1.3182500118373195, "learning_rate": 2.3205031711980408e-06, - "loss": 0.1931, + "loss": 0.1929, "step": 4971 }, { "epoch": 0.6801641586867305, - "grad_norm": 1.1280398574401744, + "grad_norm": 1.135227219271566, "learning_rate": 2.3186891986573034e-06, - "loss": 0.1608, + "loss": 0.1604, "step": 4972 }, { "epoch": 0.6803009575923392, - "grad_norm": 1.265676291349564, + "grad_norm": 1.268962659010694, "learning_rate": 2.3168757213524946e-06, - "loss": 0.1698, + "loss": 0.1706, "step": 4973 }, { "epoch": 0.680437756497948, - "grad_norm": 1.2828857004648095, + "grad_norm": 1.2696310998318077, "learning_rate": 2.315062739618557e-06, - "loss": 0.1875, + "loss": 0.1873, "step": 4974 }, { "epoch": 0.6805745554035568, - "grad_norm": 1.2508711089029791, + "grad_norm": 1.2390163430217132, "learning_rate": 2.3132502537903506e-06, - "loss": 0.2306, + "loss": 0.2311, "step": 4975 }, { "epoch": 0.6807113543091655, - "grad_norm": 1.449228474915071, + "grad_norm": 1.4466523865721443, "learning_rate": 2.3114382642026407e-06, - "loss": 0.2487, + "loss": 0.2504, "step": 4976 }, { "epoch": 0.6808481532147743, - "grad_norm": 1.3955839416830311, + "grad_norm": 1.3917292777458852, "learning_rate": 2.3096267711900967e-06, - "loss": 0.1924, + "loss": 0.1919, "step": 4977 }, { "epoch": 0.680984952120383, - "grad_norm": 1.158141624415379, + "grad_norm": 1.1429385378463586, "learning_rate": 2.3078157750873044e-06, - "loss": 0.1807, + "loss": 0.1804, "step": 4978 }, { "epoch": 0.6811217510259918, - "grad_norm": 1.5867627427385556, + "grad_norm": 1.5690177337565996, "learning_rate": 2.306005276228748e-06, - "loss": 0.1982, + "loss": 0.196, "step": 4979 }, { "epoch": 0.6812585499316005, - "grad_norm": 1.355469272171621, + "grad_norm": 1.3581353468843393, "learning_rate": 2.3041952749488305e-06, - "loss": 0.1861, + "loss": 0.1862, "step": 4980 }, { "epoch": 0.6813953488372093, - "grad_norm": 1.3110871710877654, + "grad_norm": 1.323508793004683, "learning_rate": 2.302385771581853e-06, - "loss": 0.2113, + "loss": 0.2148, "step": 4981 }, { "epoch": 0.681532147742818, - "grad_norm": 1.1763073411917282, + "grad_norm": 1.1615524320542407, "learning_rate": 2.300576766462032e-06, - "loss": 0.1646, + "loss": 0.1649, "step": 4982 }, { "epoch": 0.6816689466484268, - "grad_norm": 1.1665692014815656, + "grad_norm": 1.151446952159123, "learning_rate": 2.29876825992349e-06, - "loss": 0.141, + "loss": 0.1402, "step": 4983 }, { "epoch": 0.6818057455540356, - "grad_norm": 1.1976415347562064, + "grad_norm": 1.1951558317525979, "learning_rate": 2.296960252300254e-06, - "loss": 0.1957, + "loss": 0.1978, "step": 4984 }, { "epoch": 0.6819425444596443, - "grad_norm": 1.0672421361465452, + "grad_norm": 1.0563022405729976, "learning_rate": 2.2951527439262626e-06, - "loss": 0.1909, + "loss": 0.1908, "step": 4985 }, { "epoch": 0.6820793433652531, - "grad_norm": 1.2420709935708638, + "grad_norm": 1.2470412498331538, "learning_rate": 2.2933457351353623e-06, - "loss": 0.1587, + "loss": 0.1603, "step": 4986 }, { "epoch": 0.6822161422708618, - "grad_norm": 1.3312699687474083, + "grad_norm": 1.3231725418378, "learning_rate": 2.291539226261307e-06, - "loss": 0.1636, + "loss": 0.1619, "step": 4987 }, { "epoch": 0.6823529411764706, - "grad_norm": 1.3839850984447153, + "grad_norm": 1.389471119651987, "learning_rate": 2.289733217637753e-06, - "loss": 0.1855, + "loss": 0.1878, "step": 4988 }, { "epoch": 0.6824897400820793, - "grad_norm": 1.0901989756131591, + "grad_norm": 1.09272754981243, "learning_rate": 2.287927709598274e-06, - "loss": 0.1544, + "loss": 0.1556, "step": 4989 }, { "epoch": 0.6826265389876881, - "grad_norm": 1.4541078842158213, + "grad_norm": 1.4475713300832398, "learning_rate": 2.2861227024763413e-06, - "loss": 0.2124, + "loss": 0.2143, "step": 4990 }, { "epoch": 0.6827633378932969, - "grad_norm": 1.10368135892123, + "grad_norm": 1.116859313308471, "learning_rate": 2.28431819660534e-06, - "loss": 0.17, + "loss": 0.169, "step": 4991 }, { "epoch": 0.6829001367989056, - "grad_norm": 1.1055786539490124, + "grad_norm": 1.1205045306767882, "learning_rate": 2.2825141923185636e-06, - "loss": 0.1796, + "loss": 0.1793, "step": 4992 }, { "epoch": 0.6830369357045144, - "grad_norm": 1.2602318041195693, + "grad_norm": 1.2619966866115158, "learning_rate": 2.2807106899492056e-06, - "loss": 0.1846, + "loss": 0.1862, "step": 4993 }, { "epoch": 0.6831737346101231, - "grad_norm": 1.177064533075529, + "grad_norm": 1.1713747504197198, "learning_rate": 2.2789076898303745e-06, - "loss": 0.1738, + "loss": 0.1745, "step": 4994 }, { "epoch": 0.6833105335157319, - "grad_norm": 1.3723135161612299, + "grad_norm": 1.398005617192176, "learning_rate": 2.277105192295081e-06, - "loss": 0.2198, + "loss": 0.2222, "step": 4995 }, { "epoch": 0.6834473324213406, - "grad_norm": 1.22189547683513, + "grad_norm": 1.219847611822087, "learning_rate": 2.275303197676248e-06, - "loss": 0.1424, + "loss": 0.142, "step": 4996 }, { "epoch": 0.6835841313269494, - "grad_norm": 1.3802107716036411, + "grad_norm": 1.344766553568672, "learning_rate": 2.2735017063066983e-06, - "loss": 0.1825, + "loss": 0.1794, "step": 4997 }, { "epoch": 0.6837209302325581, - "grad_norm": 1.2950725552899247, + "grad_norm": 1.295849727596888, "learning_rate": 2.2717007185191673e-06, - "loss": 0.1803, + "loss": 0.1808, "step": 4998 }, { "epoch": 0.6838577291381669, - "grad_norm": 1.3003401204988156, + "grad_norm": 1.279654973355164, "learning_rate": 2.2699002346462974e-06, - "loss": 0.1927, + "loss": 0.1933, "step": 4999 }, { "epoch": 0.6839945280437757, - "grad_norm": 1.1643521683632525, + "grad_norm": 1.154329246328552, "learning_rate": 2.2681002550206355e-06, - "loss": 0.1474, + "loss": 0.1469, "step": 5000 }, { "epoch": 0.6839945280437757, - "eval_loss": 0.17707720398902893, - "eval_runtime": 5.8975, - "eval_samples_per_second": 5.087, - "eval_steps_per_second": 1.357, + "eval_loss": 0.1773664653301239, + "eval_runtime": 5.914, + "eval_samples_per_second": 5.073, + "eval_steps_per_second": 1.353, "step": 5000 }, { "epoch": 0.6841313269493844, - "grad_norm": 0.980257572588687, + "grad_norm": 0.9854371901462428, "learning_rate": 2.2663007799746382e-06, - "loss": 0.154, + "loss": 0.1536, "step": 5001 }, { "epoch": 0.6842681258549932, - "grad_norm": 1.325465208421415, + "grad_norm": 1.2913351602827505, "learning_rate": 2.2645018098406647e-06, - "loss": 0.1809, + "loss": 0.1806, "step": 5002 }, { "epoch": 0.6844049247606019, - "grad_norm": 1.3943481657960197, + "grad_norm": 1.3982896740611845, "learning_rate": 2.262703344950985e-06, - "loss": 0.1961, + "loss": 0.1958, "step": 5003 }, { "epoch": 0.6845417236662107, - "grad_norm": 1.207713061429283, + "grad_norm": 1.1963553661819084, "learning_rate": 2.2609053856377715e-06, - "loss": 0.1768, + "loss": 0.1763, "step": 5004 }, { "epoch": 0.6846785225718194, - "grad_norm": 1.1244609724450279, + "grad_norm": 1.1146860268171526, "learning_rate": 2.2591079322331095e-06, - "loss": 0.1811, + "loss": 0.18, "step": 5005 }, { "epoch": 0.6848153214774282, - "grad_norm": 1.3242031087912076, + "grad_norm": 1.3287290632285138, "learning_rate": 2.2573109850689835e-06, - "loss": 0.183, + "loss": 0.1849, "step": 5006 }, { "epoch": 0.684952120383037, - "grad_norm": 1.0682043502380998, + "grad_norm": 1.0696357190970736, "learning_rate": 2.2555145444772896e-06, - "loss": 0.1698, + "loss": 0.1699, "step": 5007 }, { "epoch": 0.6850889192886457, - "grad_norm": 1.1853338319213167, + "grad_norm": 1.1710996618699585, "learning_rate": 2.253718610789831e-06, - "loss": 0.1815, + "loss": 0.1812, "step": 5008 }, { "epoch": 0.6852257181942545, - "grad_norm": 1.2167230893930003, + "grad_norm": 1.221713235299781, "learning_rate": 2.251923184338312e-06, - "loss": 0.1674, + "loss": 0.169, "step": 5009 }, { "epoch": 0.6853625170998632, - "grad_norm": 1.2039231363356706, + "grad_norm": 1.1980501290637917, "learning_rate": 2.25012826545435e-06, - "loss": 0.161, + "loss": 0.1618, "step": 5010 }, { "epoch": 0.685499316005472, - "grad_norm": 1.1283692881425063, + "grad_norm": 1.114234388372556, "learning_rate": 2.2483338544694603e-06, - "loss": 0.143, + "loss": 0.1419, "step": 5011 }, { "epoch": 0.6856361149110807, - "grad_norm": 1.1761063359257922, + "grad_norm": 1.1844751241081812, "learning_rate": 2.246539951715072e-06, - "loss": 0.1621, + "loss": 0.1644, "step": 5012 }, { "epoch": 0.6857729138166895, - "grad_norm": 1.1835536321385218, + "grad_norm": 1.19375630621345, "learning_rate": 2.24474655752252e-06, - "loss": 0.1765, + "loss": 0.1777, "step": 5013 }, { "epoch": 0.6859097127222982, - "grad_norm": 1.2298044745242902, + "grad_norm": 1.2260312605163812, "learning_rate": 2.2429536722230365e-06, - "loss": 0.1844, + "loss": 0.1847, "step": 5014 }, { "epoch": 0.686046511627907, - "grad_norm": 1.221215784483761, + "grad_norm": 1.214725125913456, "learning_rate": 2.2411612961477704e-06, - "loss": 0.1719, + "loss": 0.1702, "step": 5015 }, { "epoch": 0.6861833105335158, - "grad_norm": 1.7011368506293627, + "grad_norm": 1.6784877551273543, "learning_rate": 2.239369429627771e-06, - "loss": 0.2318, + "loss": 0.2324, "step": 5016 }, { "epoch": 0.6863201094391245, - "grad_norm": 1.1998498105967885, + "grad_norm": 1.2025603773675768, "learning_rate": 2.2375780729939966e-06, - "loss": 0.1792, + "loss": 0.1807, "step": 5017 }, { "epoch": 0.6864569083447333, - "grad_norm": 1.3253017118572707, + "grad_norm": 1.3143741103572812, "learning_rate": 2.2357872265773058e-06, - "loss": 0.1601, + "loss": 0.1592, "step": 5018 }, { "epoch": 0.686593707250342, - "grad_norm": 1.1584589267095575, + "grad_norm": 1.1532991174768064, "learning_rate": 2.233996890708469e-06, - "loss": 0.1763, + "loss": 0.1774, "step": 5019 }, { "epoch": 0.6867305061559508, - "grad_norm": 1.2378704763823258, + "grad_norm": 1.218487116985439, "learning_rate": 2.2322070657181583e-06, - "loss": 0.181, + "loss": 0.1805, "step": 5020 }, { "epoch": 0.6868673050615595, - "grad_norm": 1.271768029728966, + "grad_norm": 1.2750076839671742, "learning_rate": 2.230417751936955e-06, - "loss": 0.19, + "loss": 0.1903, "step": 5021 }, { "epoch": 0.6870041039671683, - "grad_norm": 1.1517813081251391, + "grad_norm": 1.1515475071538723, "learning_rate": 2.22862894969534e-06, - "loss": 0.181, + "loss": 0.1812, "step": 5022 }, { "epoch": 0.6871409028727771, - "grad_norm": 0.9960287311219791, + "grad_norm": 1.0002553288661409, "learning_rate": 2.226840659323706e-06, "loss": 0.1757, "step": 5023 }, { "epoch": 0.6872777017783858, - "grad_norm": 1.0921148326900967, + "grad_norm": 1.0896528435405806, "learning_rate": 2.2250528811523513e-06, - "loss": 0.1615, + "loss": 0.161, "step": 5024 }, { "epoch": 0.6874145006839946, - "grad_norm": 1.400912109634462, + "grad_norm": 1.3971366258564863, "learning_rate": 2.223265615511473e-06, - "loss": 0.2096, + "loss": 0.2132, "step": 5025 }, { "epoch": 0.6875512995896033, - "grad_norm": 0.9842878437213763, + "grad_norm": 0.9758769249894119, "learning_rate": 2.22147886273118e-06, - "loss": 0.1596, + "loss": 0.1602, "step": 5026 }, { "epoch": 0.6876880984952121, - "grad_norm": 1.3862898182582437, + "grad_norm": 1.3899164471159382, "learning_rate": 2.219692623141482e-06, - "loss": 0.2009, + "loss": 0.1965, "step": 5027 }, { "epoch": 0.6878248974008208, - "grad_norm": 1.2300118660881598, + "grad_norm": 1.2127335871852656, "learning_rate": 2.217906897072298e-06, - "loss": 0.2013, + "loss": 0.2003, "step": 5028 }, { "epoch": 0.6879616963064296, - "grad_norm": 1.1813576758321862, + "grad_norm": 1.1707668102842854, "learning_rate": 2.2161216848534485e-06, - "loss": 0.1715, + "loss": 0.1704, "step": 5029 }, { "epoch": 0.6880984952120383, - "grad_norm": 1.349532352925198, + "grad_norm": 1.351224784620925, "learning_rate": 2.2143369868146643e-06, - "loss": 0.1944, + "loss": 0.1962, "step": 5030 }, { "epoch": 0.6882352941176471, - "grad_norm": 1.2481092002950853, + "grad_norm": 1.240210644887588, "learning_rate": 2.2125528032855727e-06, - "loss": 0.1737, + "loss": 0.1736, "step": 5031 }, { "epoch": 0.6883720930232559, - "grad_norm": 1.2232918475504064, + "grad_norm": 1.2090422199215407, "learning_rate": 2.2107691345957133e-06, - "loss": 0.1642, + "loss": 0.163, "step": 5032 }, { "epoch": 0.6885088919288646, - "grad_norm": 1.4991850093069994, + "grad_norm": 1.455797485870031, "learning_rate": 2.2089859810745302e-06, - "loss": 0.2242, + "loss": 0.2206, "step": 5033 }, { "epoch": 0.6886456908344734, - "grad_norm": 1.0296432937068456, + "grad_norm": 1.028185827537176, "learning_rate": 2.207203343051367e-06, - "loss": 0.1499, + "loss": 0.1484, "step": 5034 }, { "epoch": 0.688782489740082, - "grad_norm": 1.2348880689754647, + "grad_norm": 1.243460052657509, "learning_rate": 2.2054212208554783e-06, - "loss": 0.1562, + "loss": 0.1573, "step": 5035 }, { "epoch": 0.6889192886456909, - "grad_norm": 1.1869677235583151, + "grad_norm": 1.1727299048902804, "learning_rate": 2.203639614816017e-06, - "loss": 0.1465, + "loss": 0.1476, "step": 5036 }, { "epoch": 0.6890560875512995, - "grad_norm": 1.1614772558422057, + "grad_norm": 1.1577751745668774, "learning_rate": 2.2018585252620494e-06, - "loss": 0.161, + "loss": 0.1599, "step": 5037 }, { "epoch": 0.6891928864569083, - "grad_norm": 1.3049003086608748, + "grad_norm": 1.3485666633754299, "learning_rate": 2.2000779525225363e-06, - "loss": 0.1682, + "loss": 0.1693, "step": 5038 }, { "epoch": 0.6893296853625172, - "grad_norm": 1.2048514235124672, + "grad_norm": 1.1970116186228374, "learning_rate": 2.19829789692635e-06, - "loss": 0.182, + "loss": 0.1814, "step": 5039 }, { "epoch": 0.6894664842681258, - "grad_norm": 1.1310872163277788, + "grad_norm": 1.13474072659, "learning_rate": 2.196518358802268e-06, - "loss": 0.1474, + "loss": 0.1483, "step": 5040 }, { "epoch": 0.6896032831737346, - "grad_norm": 1.5706975831671264, + "grad_norm": 1.5308501084171398, "learning_rate": 2.194739338478965e-06, - "loss": 0.1926, + "loss": 0.1902, "step": 5041 }, { "epoch": 0.6897400820793433, - "grad_norm": 1.420097253095563, + "grad_norm": 1.4167603656998646, "learning_rate": 2.192960836285027e-06, - "loss": 0.1803, + "loss": 0.1805, "step": 5042 }, { "epoch": 0.6898768809849521, - "grad_norm": 1.2965742892846672, + "grad_norm": 1.2827423455123343, "learning_rate": 2.191182852548941e-06, - "loss": 0.1739, + "loss": 0.174, "step": 5043 }, { "epoch": 0.6900136798905608, - "grad_norm": 1.3670271825745781, + "grad_norm": 1.3483365717907243, "learning_rate": 2.1894053875991016e-06, - "loss": 0.2018, + "loss": 0.2013, "step": 5044 }, { "epoch": 0.6901504787961696, - "grad_norm": 1.3970209727834724, + "grad_norm": 1.4027442961744967, "learning_rate": 2.1876284417638015e-06, - "loss": 0.229, + "loss": 0.2309, "step": 5045 }, { "epoch": 0.6902872777017783, - "grad_norm": 0.9797706084129482, + "grad_norm": 0.9624075187244702, "learning_rate": 2.1858520153712454e-06, - "loss": 0.1604, + "loss": 0.1595, "step": 5046 }, { "epoch": 0.6904240766073871, - "grad_norm": 0.9840261043080948, + "grad_norm": 0.9643347326516364, "learning_rate": 2.184076108749533e-06, - "loss": 0.117, + "loss": 0.1181, "step": 5047 }, { "epoch": 0.6905608755129959, - "grad_norm": 1.3015688324816992, + "grad_norm": 1.3044112879407785, "learning_rate": 2.182300722226675e-06, - "loss": 0.174, + "loss": 0.1746, "step": 5048 }, { "epoch": 0.6906976744186046, - "grad_norm": 1.1736058126345268, + "grad_norm": 1.179494751358299, "learning_rate": 2.1805258561305865e-06, - "loss": 0.1751, + "loss": 0.1769, "step": 5049 }, { "epoch": 0.6908344733242134, - "grad_norm": 1.4278091283347734, + "grad_norm": 1.423662994861121, "learning_rate": 2.178751510789079e-06, - "loss": 0.2441, + "loss": 0.246, "step": 5050 }, { "epoch": 0.6909712722298221, - "grad_norm": 1.3750079853948909, + "grad_norm": 1.3204785780453212, "learning_rate": 2.1769776865298765e-06, - "loss": 0.197, + "loss": 0.1951, "step": 5051 }, { "epoch": 0.6911080711354309, - "grad_norm": 1.2076019335070576, + "grad_norm": 1.2069937466114935, "learning_rate": 2.1752043836806e-06, - "loss": 0.1713, + "loss": 0.1716, "step": 5052 }, { "epoch": 0.6912448700410396, - "grad_norm": 1.032449918886729, + "grad_norm": 1.0212263738806946, "learning_rate": 2.17343160256878e-06, - "loss": 0.1484, + "loss": 0.1489, "step": 5053 }, { "epoch": 0.6913816689466484, - "grad_norm": 1.2397154838869069, + "grad_norm": 1.2447588771806182, "learning_rate": 2.1716593435218455e-06, - "loss": 0.2048, + "loss": 0.2066, "step": 5054 }, { "epoch": 0.6915184678522572, - "grad_norm": 1.3778475191871618, + "grad_norm": 1.3770054746343157, "learning_rate": 2.169887606867131e-06, - "loss": 0.2348, + "loss": 0.2354, "step": 5055 }, { "epoch": 0.6916552667578659, - "grad_norm": 1.1865056445306121, + "grad_norm": 1.166330302084107, "learning_rate": 2.168116392931878e-06, - "loss": 0.1906, + "loss": 0.1897, "step": 5056 }, { "epoch": 0.6917920656634747, - "grad_norm": 1.1855268663726406, + "grad_norm": 1.18615982176124, "learning_rate": 2.166345702043224e-06, - "loss": 0.1437, + "loss": 0.1432, "step": 5057 }, { "epoch": 0.6919288645690834, - "grad_norm": 0.9891136498287891, + "grad_norm": 0.9984493856635208, "learning_rate": 2.164575534528216e-06, - "loss": 0.1511, + "loss": 0.1525, "step": 5058 }, { "epoch": 0.6920656634746922, - "grad_norm": 1.445988372858522, + "grad_norm": 1.4336203229870215, "learning_rate": 2.1628058907138034e-06, - "loss": 0.1897, + "loss": 0.1863, "step": 5059 }, { "epoch": 0.6922024623803009, - "grad_norm": 1.417915199065232, + "grad_norm": 1.4273176682674507, "learning_rate": 2.1610367709268386e-06, - "loss": 0.1749, + "loss": 0.1764, "step": 5060 }, { "epoch": 0.6923392612859097, - "grad_norm": 1.26824015125994, + "grad_norm": 1.2529530311578476, "learning_rate": 2.159268175494074e-06, - "loss": 0.1937, + "loss": 0.1971, "step": 5061 }, { "epoch": 0.6924760601915184, - "grad_norm": 1.1639297549148042, + "grad_norm": 1.140114644533266, "learning_rate": 2.1575001047421707e-06, - "loss": 0.1972, + "loss": 0.1955, "step": 5062 }, { "epoch": 0.6926128590971272, - "grad_norm": 1.311088390995723, + "grad_norm": 1.3026112428664256, "learning_rate": 2.1557325589976863e-06, - "loss": 0.1919, + "loss": 0.1924, "step": 5063 }, { "epoch": 0.692749658002736, - "grad_norm": 1.44768874279755, + "grad_norm": 1.447549634840484, "learning_rate": 2.1539655385870876e-06, - "loss": 0.2156, + "loss": 0.2185, "step": 5064 }, { "epoch": 0.6928864569083447, - "grad_norm": 1.4786365056710358, + "grad_norm": 1.4693477424487051, "learning_rate": 2.152199043836743e-06, - "loss": 0.2469, + "loss": 0.2465, "step": 5065 }, { "epoch": 0.6930232558139535, - "grad_norm": 1.24393474032984, + "grad_norm": 1.2380376469514134, "learning_rate": 2.1504330750729185e-06, - "loss": 0.1491, + "loss": 0.1487, "step": 5066 }, { "epoch": 0.6931600547195622, - "grad_norm": 1.368473403286215, + "grad_norm": 1.3536866626085895, "learning_rate": 2.148667632621792e-06, - "loss": 0.1967, + "loss": 0.1973, "step": 5067 }, { "epoch": 0.693296853625171, - "grad_norm": 1.2931960088005185, + "grad_norm": 1.2886054267524452, "learning_rate": 2.1469027168094347e-06, - "loss": 0.1894, + "loss": 0.187, "step": 5068 }, { "epoch": 0.6934336525307797, - "grad_norm": 1.025715491146033, + "grad_norm": 1.0182662599730847, "learning_rate": 2.1451383279618294e-06, - "loss": 0.1845, + "loss": 0.1852, "step": 5069 }, { "epoch": 0.6935704514363885, - "grad_norm": 1.1898968037298898, + "grad_norm": 1.2048777057613569, "learning_rate": 2.1433744664048534e-06, - "loss": 0.1786, + "loss": 0.1797, "step": 5070 }, { "epoch": 0.6937072503419973, - "grad_norm": 1.2140670033460013, + "grad_norm": 1.1946694359012788, "learning_rate": 2.141611132464292e-06, - "loss": 0.1656, + "loss": 0.1659, "step": 5071 }, { "epoch": 0.693844049247606, - "grad_norm": 0.832666107084905, + "grad_norm": 0.8403237528223471, "learning_rate": 2.1398483264658317e-06, - "loss": 0.1489, + "loss": 0.15, "step": 5072 }, { "epoch": 0.6939808481532148, - "grad_norm": 1.2369574220687791, + "grad_norm": 1.2453282696901722, "learning_rate": 2.1380860487350612e-06, - "loss": 0.1877, + "loss": 0.1887, "step": 5073 }, { "epoch": 0.6941176470588235, - "grad_norm": 1.0255128248494914, + "grad_norm": 1.0086876882452573, "learning_rate": 2.136324299597474e-06, - "loss": 0.1672, + "loss": 0.1673, "step": 5074 }, { "epoch": 0.6942544459644323, - "grad_norm": 1.4750222500540018, + "grad_norm": 1.4752411456948977, "learning_rate": 2.13456307937846e-06, - "loss": 0.1844, + "loss": 0.1832, "step": 5075 }, { "epoch": 0.694391244870041, - "grad_norm": 1.36409876784506, + "grad_norm": 1.3405726026914082, "learning_rate": 2.132802388403319e-06, - "loss": 0.2062, + "loss": 0.2051, "step": 5076 }, { "epoch": 0.6945280437756498, - "grad_norm": 1.1013023333414216, + "grad_norm": 1.0924811984747378, "learning_rate": 2.1310422269972446e-06, - "loss": 0.151, + "loss": 0.1511, "step": 5077 }, { "epoch": 0.6946648426812585, - "grad_norm": 1.397202299765206, + "grad_norm": 1.3991358520311692, "learning_rate": 2.129282595485342e-06, - "loss": 0.2488, + "loss": 0.2495, "step": 5078 }, { "epoch": 0.6948016415868673, - "grad_norm": 1.278361867337168, + "grad_norm": 1.2751450712393866, "learning_rate": 2.127523494192609e-06, - "loss": 0.1553, + "loss": 0.1562, "step": 5079 }, { "epoch": 0.6949384404924761, - "grad_norm": 1.2344227355475161, + "grad_norm": 1.2188832059767611, "learning_rate": 2.1257649234439532e-06, - "loss": 0.1777, + "loss": 0.1796, "step": 5080 }, { "epoch": 0.6950752393980848, - "grad_norm": 0.9602386514543164, + "grad_norm": 0.9568149228794791, "learning_rate": 2.1240068835641815e-06, - "loss": 0.1616, + "loss": 0.1624, "step": 5081 }, { "epoch": 0.6952120383036936, - "grad_norm": 1.169458876438898, + "grad_norm": 1.1633404490141668, "learning_rate": 2.122249374878e-06, - "loss": 0.1969, + "loss": 0.197, "step": 5082 }, { "epoch": 0.6953488372093023, - "grad_norm": 1.4703528395757266, + "grad_norm": 1.4517557646128623, "learning_rate": 2.120492397710022e-06, - "loss": 0.2013, + "loss": 0.2012, "step": 5083 }, { "epoch": 0.6954856361149111, - "grad_norm": 0.958549204259477, + "grad_norm": 0.9481263576790115, "learning_rate": 2.118735952384757e-06, - "loss": 0.1567, + "loss": 0.1558, "step": 5084 }, { "epoch": 0.6956224350205198, - "grad_norm": 1.0291219919216226, + "grad_norm": 1.0311042937322557, "learning_rate": 2.116980039226621e-06, - "loss": 0.1724, + "loss": 0.1726, "step": 5085 }, { "epoch": 0.6957592339261286, - "grad_norm": 1.1244231088788146, + "grad_norm": 1.116038893531631, "learning_rate": 2.1152246585599302e-06, - "loss": 0.1721, + "loss": 0.171, "step": 5086 }, { "epoch": 0.6958960328317374, - "grad_norm": 1.1509906433400308, + "grad_norm": 1.1463635235589733, "learning_rate": 2.1134698107088985e-06, - "loss": 0.1528, + "loss": 0.1525, "step": 5087 }, { "epoch": 0.6960328317373461, - "grad_norm": 1.2172610580939955, + "grad_norm": 1.2093537288630234, "learning_rate": 2.111715495997648e-06, - "loss": 0.1893, + "loss": 0.1903, "step": 5088 }, { "epoch": 0.6961696306429549, - "grad_norm": 1.6495337337429512, + "grad_norm": 1.7156592372067385, "learning_rate": 2.1099617147501984e-06, - "loss": 0.2027, + "loss": 0.2076, "step": 5089 }, { "epoch": 0.6963064295485636, - "grad_norm": 1.3778529971077924, + "grad_norm": 1.3943275698081732, "learning_rate": 2.1082084672904733e-06, - "loss": 0.1911, + "loss": 0.1912, "step": 5090 }, { "epoch": 0.6964432284541724, - "grad_norm": 1.000929771119005, + "grad_norm": 0.9840923176371444, "learning_rate": 2.1064557539422924e-06, - "loss": 0.1369, + "loss": 0.1379, "step": 5091 }, { "epoch": 0.6965800273597811, - "grad_norm": 1.222878836717423, + "grad_norm": 1.1844739581565398, "learning_rate": 2.1047035750293847e-06, - "loss": 0.2109, + "loss": 0.208, "step": 5092 }, { "epoch": 0.6967168262653899, - "grad_norm": 1.0709842964363592, + "grad_norm": 1.0599909294958336, "learning_rate": 2.1029519308753725e-06, - "loss": 0.1643, + "loss": 0.1656, "step": 5093 }, { "epoch": 0.6968536251709986, - "grad_norm": 1.3332304610069308, + "grad_norm": 1.3042834114345159, "learning_rate": 2.101200821803786e-06, - "loss": 0.2211, + "loss": 0.2196, "step": 5094 }, { "epoch": 0.6969904240766074, - "grad_norm": 1.0586814682799213, + "grad_norm": 1.0524684145037135, "learning_rate": 2.0994502481380506e-06, - "loss": 0.1552, + "loss": 0.1554, "step": 5095 }, { "epoch": 0.6971272229822162, - "grad_norm": 1.2073583521058653, + "grad_norm": 1.2042931345456933, "learning_rate": 2.0977002102014972e-06, - "loss": 0.1781, + "loss": 0.1791, "step": 5096 }, { "epoch": 0.6972640218878249, - "grad_norm": 1.136065218819997, + "grad_norm": 1.122139672795385, "learning_rate": 2.0959507083173585e-06, - "loss": 0.173, + "loss": 0.1725, "step": 5097 }, { "epoch": 0.6974008207934337, - "grad_norm": 1.5965443657958913, + "grad_norm": 1.5816269788182706, "learning_rate": 2.094201742808763e-06, - "loss": 0.2183, + "loss": 0.2196, "step": 5098 }, { "epoch": 0.6975376196990424, - "grad_norm": 1.0922308061172739, + "grad_norm": 1.069772574450769, "learning_rate": 2.0924533139987457e-06, - "loss": 0.1341, + "loss": 0.1327, "step": 5099 }, { "epoch": 0.6976744186046512, - "grad_norm": 1.1939298362592963, + "grad_norm": 1.1662493170479327, "learning_rate": 2.0907054222102367e-06, - "loss": 0.1894, + "loss": 0.19, "step": 5100 }, { "epoch": 0.6976744186046512, - "eval_loss": 0.17686830461025238, - "eval_runtime": 5.9051, - "eval_samples_per_second": 5.08, - "eval_steps_per_second": 1.355, + "eval_loss": 0.1768636703491211, + "eval_runtime": 5.901, + "eval_samples_per_second": 5.084, + "eval_steps_per_second": 1.356, "step": 5100 }, { "epoch": 0.6978112175102599, - "grad_norm": 1.3370229561487317, + "grad_norm": 1.3287049407391371, "learning_rate": 2.0889580677660725e-06, - "loss": 0.1896, + "loss": 0.1895, "step": 5101 }, { "epoch": 0.6979480164158687, - "grad_norm": 1.5191735293538122, + "grad_norm": 1.5093039259315328, "learning_rate": 2.087211250988988e-06, - "loss": 0.2248, + "loss": 0.2264, "step": 5102 }, { "epoch": 0.6980848153214775, - "grad_norm": 1.4831813033925234, + "grad_norm": 1.456907091293045, "learning_rate": 2.0854649722016177e-06, - "loss": 0.1816, + "loss": 0.1803, "step": 5103 }, { "epoch": 0.6982216142270862, - "grad_norm": 1.2428472911133885, + "grad_norm": 1.242565751865994, "learning_rate": 2.0837192317265015e-06, - "loss": 0.1647, + "loss": 0.1646, "step": 5104 }, { "epoch": 0.698358413132695, - "grad_norm": 1.182951756434941, + "grad_norm": 1.170610324524034, "learning_rate": 2.0819740298860713e-06, - "loss": 0.1815, + "loss": 0.1825, "step": 5105 }, { "epoch": 0.6984952120383037, - "grad_norm": 1.128776183937581, + "grad_norm": 1.1125288585129514, "learning_rate": 2.0802293670026685e-06, - "loss": 0.1709, + "loss": 0.1703, "step": 5106 }, { "epoch": 0.6986320109439125, - "grad_norm": 1.3975490312323262, + "grad_norm": 1.394946086427425, "learning_rate": 2.0784852433985273e-06, - "loss": 0.2094, + "loss": 0.2062, "step": 5107 }, { "epoch": 0.6987688098495212, - "grad_norm": 1.3363145206748852, + "grad_norm": 1.3259301396826813, "learning_rate": 2.0767416593957897e-06, - "loss": 0.2294, + "loss": 0.2271, "step": 5108 }, { "epoch": 0.69890560875513, - "grad_norm": 1.1833307404037443, + "grad_norm": 1.1575322800583419, "learning_rate": 2.0749986153164903e-06, - "loss": 0.191, + "loss": 0.192, "step": 5109 }, { "epoch": 0.6990424076607387, - "grad_norm": 1.4386511789673648, + "grad_norm": 1.4336982842285153, "learning_rate": 2.073256111482572e-06, - "loss": 0.1802, + "loss": 0.1805, "step": 5110 }, { "epoch": 0.6991792065663475, - "grad_norm": 1.2024266086111466, + "grad_norm": 1.1789337176083374, "learning_rate": 2.071514148215869e-06, - "loss": 0.1733, + "loss": 0.1731, "step": 5111 }, { "epoch": 0.6993160054719563, - "grad_norm": 1.3913215146016156, + "grad_norm": 1.3798702552910125, "learning_rate": 2.069772725838124e-06, - "loss": 0.2105, + "loss": 0.2094, "step": 5112 }, { "epoch": 0.699452804377565, - "grad_norm": 1.0447083841752234, + "grad_norm": 1.0248215829837526, "learning_rate": 2.068031844670977e-06, - "loss": 0.1697, + "loss": 0.1676, "step": 5113 }, { "epoch": 0.6995896032831738, - "grad_norm": 1.135940061353998, + "grad_norm": 1.1328861080286332, "learning_rate": 2.0662915050359638e-06, - "loss": 0.1568, + "loss": 0.1564, "step": 5114 }, { "epoch": 0.6997264021887825, - "grad_norm": 1.2557338868687307, + "grad_norm": 1.238607022235422, "learning_rate": 2.064551707254526e-06, - "loss": 0.1904, + "loss": 0.1905, "step": 5115 }, { "epoch": 0.6998632010943913, - "grad_norm": 1.2247475110028738, + "grad_norm": 1.226213824396342, "learning_rate": 2.0628124516480047e-06, - "loss": 0.1695, + "loss": 0.1716, "step": 5116 }, { "epoch": 0.7, - "grad_norm": 1.4986058326436449, + "grad_norm": 1.4951732031453617, "learning_rate": 2.061073738537635e-06, - "loss": 0.2067, + "loss": 0.2108, "step": 5117 }, { "epoch": 0.7001367989056088, - "grad_norm": 1.0928446454032654, + "grad_norm": 1.109845415933303, "learning_rate": 2.0593355682445577e-06, - "loss": 0.1487, + "loss": 0.149, "step": 5118 }, { "epoch": 0.7002735978112176, - "grad_norm": 1.5828623585337018, + "grad_norm": 1.6142359936302841, "learning_rate": 2.057597941089811e-06, - "loss": 0.2292, + "loss": 0.2321, "step": 5119 }, { "epoch": 0.7004103967168263, - "grad_norm": 1.0956958164597412, + "grad_norm": 1.0890336707136958, "learning_rate": 2.0558608573943356e-06, - "loss": 0.1531, + "loss": 0.1543, "step": 5120 }, { "epoch": 0.7005471956224351, - "grad_norm": 1.255894706860968, + "grad_norm": 1.2532513442805906, "learning_rate": 2.054124317478966e-06, - "loss": 0.1641, + "loss": 0.1639, "step": 5121 }, { "epoch": 0.7006839945280438, - "grad_norm": 1.426057433786986, + "grad_norm": 1.410671802577999, "learning_rate": 2.052388321664443e-06, - "loss": 0.159, + "loss": 0.1587, "step": 5122 }, { "epoch": 0.7008207934336526, - "grad_norm": 1.3277544445116722, + "grad_norm": 1.3527609772860891, "learning_rate": 2.0506528702713996e-06, - "loss": 0.1927, + "loss": 0.1955, "step": 5123 }, { "epoch": 0.7009575923392612, - "grad_norm": 1.1686952670112427, + "grad_norm": 1.0942518762152835, "learning_rate": 2.0489179636203767e-06, - "loss": 0.1832, + "loss": 0.1852, "step": 5124 }, { "epoch": 0.70109439124487, - "grad_norm": 1.0730299266260088, + "grad_norm": 1.0805271720658827, "learning_rate": 2.0471836020318058e-06, - "loss": 0.1634, + "loss": 0.1651, "step": 5125 }, { "epoch": 0.7012311901504787, - "grad_norm": 1.3903817836815033, + "grad_norm": 1.2716430297265842, "learning_rate": 2.0454497858260265e-06, - "loss": 0.1893, + "loss": 0.187, "step": 5126 }, { "epoch": 0.7013679890560875, - "grad_norm": 1.8536403862752486, + "grad_norm": 1.6037763900810817, "learning_rate": 2.043716515323269e-06, - "loss": 0.2333, + "loss": 0.2295, "step": 5127 }, { "epoch": 0.7015047879616964, - "grad_norm": 1.349915878759756, + "grad_norm": 1.3336990673382076, "learning_rate": 2.041983790843669e-06, - "loss": 0.1968, + "loss": 0.1985, "step": 5128 }, { "epoch": 0.701641586867305, - "grad_norm": 1.316535094363592, + "grad_norm": 1.2979410241848524, "learning_rate": 2.0402516127072606e-06, - "loss": 0.1808, + "loss": 0.1805, "step": 5129 }, { "epoch": 0.7017783857729138, - "grad_norm": 1.3954236244883644, + "grad_norm": 1.4049648601535802, "learning_rate": 2.038519981233973e-06, - "loss": 0.171, + "loss": 0.1748, "step": 5130 }, { "epoch": 0.7019151846785225, - "grad_norm": 1.4774684821034687, + "grad_norm": 1.4528239121869606, "learning_rate": 2.0367888967436383e-06, - "loss": 0.2109, + "loss": 0.2118, "step": 5131 }, { "epoch": 0.7020519835841313, - "grad_norm": 1.0687337733374338, + "grad_norm": 1.0506368743714296, "learning_rate": 2.0350583595559865e-06, - "loss": 0.1594, + "loss": 0.158, "step": 5132 }, { "epoch": 0.70218878248974, - "grad_norm": 1.2771575497062473, + "grad_norm": 1.2910110788450353, "learning_rate": 2.033328369990648e-06, "loss": 0.1907, "step": 5133 }, { "epoch": 0.7023255813953488, - "grad_norm": 1.564176010518347, + "grad_norm": 1.5677104481269828, "learning_rate": 2.0315989283671474e-06, - "loss": 0.1979, + "loss": 0.1972, "step": 5134 }, { "epoch": 0.7024623803009576, - "grad_norm": 1.1809764777336527, + "grad_norm": 1.1867579300952857, "learning_rate": 2.029870035004913e-06, - "loss": 0.1929, + "loss": 0.1943, "step": 5135 }, { "epoch": 0.7025991792065663, - "grad_norm": 1.2088344468435808, + "grad_norm": 1.2226493721902127, "learning_rate": 2.028141690223271e-06, - "loss": 0.1651, + "loss": 0.1674, "step": 5136 }, { "epoch": 0.7027359781121751, - "grad_norm": 1.2133520134427076, + "grad_norm": 1.207646668018863, "learning_rate": 2.026413894341442e-06, - "loss": 0.1528, + "loss": 0.1526, "step": 5137 }, { "epoch": 0.7028727770177838, - "grad_norm": 1.1908086810993157, + "grad_norm": 1.1778715274491873, "learning_rate": 2.0246866476785524e-06, - "loss": 0.1905, + "loss": 0.1911, "step": 5138 }, { "epoch": 0.7030095759233926, - "grad_norm": 1.439422040550493, + "grad_norm": 1.4281881709443351, "learning_rate": 2.0229599505536196e-06, - "loss": 0.19, + "loss": 0.1904, "step": 5139 }, { "epoch": 0.7031463748290013, - "grad_norm": 1.1352107839445258, + "grad_norm": 1.126805144466953, "learning_rate": 2.021233803285567e-06, "loss": 0.1728, "step": 5140 }, { "epoch": 0.7032831737346101, - "grad_norm": 1.370455830095203, + "grad_norm": 1.3385893441395778, "learning_rate": 2.019508206193208e-06, - "loss": 0.1567, + "loss": 0.1566, "step": 5141 }, { "epoch": 0.7034199726402188, - "grad_norm": 1.4142600494946056, + "grad_norm": 1.4047742254271824, "learning_rate": 2.0177831595952642e-06, - "loss": 0.2253, + "loss": 0.224, "step": 5142 }, { "epoch": 0.7035567715458276, - "grad_norm": 1.4121731142725191, + "grad_norm": 1.4118981184300314, "learning_rate": 2.0160586638103447e-06, - "loss": 0.1961, + "loss": 0.1966, "step": 5143 }, { "epoch": 0.7036935704514364, - "grad_norm": 1.3097050659143745, + "grad_norm": 1.2973259420054237, "learning_rate": 2.0143347191569664e-06, - "loss": 0.1972, + "loss": 0.199, "step": 5144 }, { "epoch": 0.7038303693570451, - "grad_norm": 1.2446789673554908, + "grad_norm": 1.240311219320162, "learning_rate": 2.0126113259535384e-06, - "loss": 0.1836, + "loss": 0.1828, "step": 5145 }, { "epoch": 0.7039671682626539, - "grad_norm": 1.4256989173595973, + "grad_norm": 1.4110162940452993, "learning_rate": 2.0108884845183713e-06, - "loss": 0.196, + "loss": 0.1941, "step": 5146 }, { "epoch": 0.7041039671682626, - "grad_norm": 1.3382692643787666, + "grad_norm": 1.3341465282836418, "learning_rate": 2.009166195169674e-06, - "loss": 0.2033, + "loss": 0.2026, "step": 5147 }, { "epoch": 0.7042407660738714, - "grad_norm": 1.4052691094362608, + "grad_norm": 1.421735217791103, "learning_rate": 2.0074444582255485e-06, - "loss": 0.1937, + "loss": 0.1966, "step": 5148 }, { "epoch": 0.7043775649794801, - "grad_norm": 1.4177932860586637, + "grad_norm": 1.40826072291441, "learning_rate": 2.005723274004002e-06, - "loss": 0.2176, + "loss": 0.2181, "step": 5149 }, { "epoch": 0.7045143638850889, - "grad_norm": 1.3517716509273372, + "grad_norm": 1.333176534901993, "learning_rate": 2.0040026428229313e-06, - "loss": 0.2123, + "loss": 0.2137, "step": 5150 }, { "epoch": 0.7046511627906977, - "grad_norm": 1.1634921404374778, + "grad_norm": 1.1534945478956984, "learning_rate": 2.0022825650001385e-06, - "loss": 0.1809, + "loss": 0.1811, "step": 5151 }, { "epoch": 0.7047879616963064, - "grad_norm": 1.3006044553880631, + "grad_norm": 1.2896379624850405, "learning_rate": 2.0005630408533215e-06, - "loss": 0.1928, + "loss": 0.1924, "step": 5152 }, { "epoch": 0.7049247606019152, - "grad_norm": 1.0164804448839682, + "grad_norm": 1.0176043537919028, "learning_rate": 1.998844070700072e-06, - "loss": 0.1546, + "loss": 0.153, "step": 5153 }, { "epoch": 0.7050615595075239, - "grad_norm": 1.2131083066899413, + "grad_norm": 1.1915584451118912, "learning_rate": 1.997125654857886e-06, - "loss": 0.1713, + "loss": 0.1714, "step": 5154 }, { "epoch": 0.7051983584131327, - "grad_norm": 1.4106632051823482, + "grad_norm": 1.4188993228218998, "learning_rate": 1.9954077936441497e-06, - "loss": 0.2323, + "loss": 0.2337, "step": 5155 }, { "epoch": 0.7053351573187414, - "grad_norm": 1.6047992508736344, + "grad_norm": 1.6094806580065137, "learning_rate": 1.9936904873761538e-06, - "loss": 0.2023, + "loss": 0.2038, "step": 5156 }, { "epoch": 0.7054719562243502, - "grad_norm": 1.2277509275472016, + "grad_norm": 1.2108617187110067, "learning_rate": 1.9919737363710802e-06, - "loss": 0.1943, + "loss": 0.1947, "step": 5157 }, { "epoch": 0.7056087551299589, - "grad_norm": 1.2838634041329327, + "grad_norm": 1.2694354389394606, "learning_rate": 1.990257540946015e-06, - "loss": 0.2123, + "loss": 0.2146, "step": 5158 }, { "epoch": 0.7057455540355677, - "grad_norm": 1.098201339642655, + "grad_norm": 1.1034632035737442, "learning_rate": 1.9885419014179353e-06, - "loss": 0.1321, + "loss": 0.1318, "step": 5159 }, { "epoch": 0.7058823529411765, - "grad_norm": 1.2880904976475913, + "grad_norm": 1.2719243353834617, "learning_rate": 1.9868268181037186e-06, - "loss": 0.1894, + "loss": 0.1899, "step": 5160 }, { "epoch": 0.7060191518467852, - "grad_norm": 1.1514140748717308, + "grad_norm": 1.1384427724761093, "learning_rate": 1.9851122913201404e-06, "loss": 0.1919, "step": 5161 }, { "epoch": 0.706155950752394, - "grad_norm": 1.0755790634358569, + "grad_norm": 1.0618864680216689, "learning_rate": 1.9833983213838725e-06, - "loss": 0.1651, + "loss": 0.1664, "step": 5162 }, { "epoch": 0.7062927496580027, - "grad_norm": 1.1867102349215648, + "grad_norm": 1.1808013392216026, "learning_rate": 1.981684908611485e-06, - "loss": 0.1542, + "loss": 0.1538, "step": 5163 }, { "epoch": 0.7064295485636115, - "grad_norm": 1.2574206162579014, + "grad_norm": 1.2295322662622614, "learning_rate": 1.9799720533194405e-06, - "loss": 0.2107, + "loss": 0.2094, "step": 5164 }, { "epoch": 0.7065663474692202, - "grad_norm": 1.1890647152606904, + "grad_norm": 1.1885139663995954, "learning_rate": 1.978259755824105e-06, - "loss": 0.1695, + "loss": 0.1698, "step": 5165 }, { "epoch": 0.706703146374829, - "grad_norm": 1.221485817595966, + "grad_norm": 1.2121960729595673, "learning_rate": 1.9765480164417363e-06, - "loss": 0.166, + "loss": 0.1656, "step": 5166 }, { "epoch": 0.7068399452804378, - "grad_norm": 1.3607173564961066, + "grad_norm": 1.3417705771628397, "learning_rate": 1.9748368354884916e-06, - "loss": 0.2174, + "loss": 0.219, "step": 5167 }, { "epoch": 0.7069767441860465, - "grad_norm": 1.2116592535507518, + "grad_norm": 1.2075212650533003, "learning_rate": 1.9731262132804275e-06, - "loss": 0.1682, + "loss": 0.1697, "step": 5168 }, { "epoch": 0.7071135430916553, - "grad_norm": 1.3789285498454191, + "grad_norm": 1.4030777060937465, "learning_rate": 1.97141615013349e-06, - "loss": 0.2075, + "loss": 0.2108, "step": 5169 }, { "epoch": 0.707250341997264, - "grad_norm": 1.2201116894286788, + "grad_norm": 1.2156938035246356, "learning_rate": 1.9697066463635306e-06, - "loss": 0.1887, + "loss": 0.1881, "step": 5170 }, { "epoch": 0.7073871409028728, - "grad_norm": 1.094529244834861, + "grad_norm": 1.0766578653546501, "learning_rate": 1.9679977022862895e-06, - "loss": 0.17, + "loss": 0.1697, "step": 5171 }, { "epoch": 0.7075239398084815, - "grad_norm": 1.5873645938525784, + "grad_norm": 1.581221909217219, "learning_rate": 1.966289318217411e-06, - "loss": 0.183, + "loss": 0.1838, "step": 5172 }, { "epoch": 0.7076607387140903, - "grad_norm": 1.311877225215705, + "grad_norm": 1.2953726596765711, "learning_rate": 1.9645814944724278e-06, - "loss": 0.2182, + "loss": 0.2149, "step": 5173 }, { "epoch": 0.707797537619699, - "grad_norm": 1.2104782834521386, + "grad_norm": 1.199700441201702, "learning_rate": 1.9628742313667772e-06, - "loss": 0.1849, + "loss": 0.1854, "step": 5174 }, { "epoch": 0.7079343365253078, - "grad_norm": 1.1522780388594656, + "grad_norm": 1.143935106086662, "learning_rate": 1.9611675292157874e-06, - "loss": 0.184, + "loss": 0.1861, "step": 5175 }, { "epoch": 0.7080711354309166, - "grad_norm": 1.1477864096942654, + "grad_norm": 1.123997581929786, "learning_rate": 1.9594613883346857e-06, - "loss": 0.1823, + "loss": 0.1835, "step": 5176 }, { "epoch": 0.7082079343365253, - "grad_norm": 1.0400129874133168, + "grad_norm": 1.039830645430953, "learning_rate": 1.9577558090385966e-06, - "loss": 0.1417, + "loss": 0.1427, "step": 5177 }, { "epoch": 0.7083447332421341, - "grad_norm": 1.1181264213361415, + "grad_norm": 1.1080200368193622, "learning_rate": 1.956050791642536e-06, - "loss": 0.1864, + "loss": 0.1865, "step": 5178 }, { "epoch": 0.7084815321477428, - "grad_norm": 1.0685260982466303, + "grad_norm": 1.0617317121474072, "learning_rate": 1.9543463364614225e-06, - "loss": 0.1581, + "loss": 0.1582, "step": 5179 }, { "epoch": 0.7086183310533516, - "grad_norm": 1.1648986843323952, + "grad_norm": 1.1478944939769853, "learning_rate": 1.9526424438100643e-06, - "loss": 0.1404, + "loss": 0.1376, "step": 5180 }, { "epoch": 0.7087551299589603, - "grad_norm": 1.5350629799581024, + "grad_norm": 1.5209071124094, "learning_rate": 1.9509391140031718e-06, - "loss": 0.2554, + "loss": 0.2557, "step": 5181 }, { "epoch": 0.7088919288645691, - "grad_norm": 1.269262000126843, + "grad_norm": 1.2709131941469511, "learning_rate": 1.949236347355346e-06, - "loss": 0.2078, + "loss": 0.2093, "step": 5182 }, { "epoch": 0.7090287277701779, - "grad_norm": 1.248712001883067, + "grad_norm": 1.2315278191162546, "learning_rate": 1.9475341441810874e-06, - "loss": 0.1788, + "loss": 0.1782, "step": 5183 }, { "epoch": 0.7091655266757866, - "grad_norm": 1.2503601255386085, + "grad_norm": 1.2369557049412754, "learning_rate": 1.945832504794794e-06, - "loss": 0.189, + "loss": 0.1904, "step": 5184 }, { "epoch": 0.7093023255813954, - "grad_norm": 1.244390583442051, + "grad_norm": 1.221057926130015, "learning_rate": 1.944131429510754e-06, - "loss": 0.1964, + "loss": 0.1982, "step": 5185 }, { "epoch": 0.7094391244870041, - "grad_norm": 1.1171970223109624, + "grad_norm": 1.1224996269428764, "learning_rate": 1.942430918643157e-06, - "loss": 0.168, + "loss": 0.1687, "step": 5186 }, { "epoch": 0.7095759233926129, - "grad_norm": 1.024062103042831, + "grad_norm": 1.023048476353307, "learning_rate": 1.940730972506083e-06, - "loss": 0.1669, + "loss": 0.169, "step": 5187 }, { "epoch": 0.7097127222982216, - "grad_norm": 1.188144624818944, + "grad_norm": 1.219575599957732, "learning_rate": 1.9390315914135125e-06, - "loss": 0.1559, + "loss": 0.1589, "step": 5188 }, { "epoch": 0.7098495212038304, - "grad_norm": 1.0795502988236483, + "grad_norm": 1.0528568905606768, "learning_rate": 1.937332775679322e-06, - "loss": 0.1767, + "loss": 0.1751, "step": 5189 }, { "epoch": 0.7099863201094391, - "grad_norm": 1.3197849121946557, + "grad_norm": 1.3096450507634545, "learning_rate": 1.9356345256172777e-06, - "loss": 0.1821, + "loss": 0.1834, "step": 5190 }, { "epoch": 0.7101231190150479, - "grad_norm": 1.4109399006654673, + "grad_norm": 1.3915216990623094, "learning_rate": 1.933936841541046e-06, - "loss": 0.2232, + "loss": 0.2229, "step": 5191 }, { "epoch": 0.7102599179206567, - "grad_norm": 1.0892335384852347, + "grad_norm": 1.0849742904966309, "learning_rate": 1.9322397237641877e-06, - "loss": 0.156, + "loss": 0.1567, "step": 5192 }, { "epoch": 0.7103967168262654, - "grad_norm": 1.2742135749397911, + "grad_norm": 1.2746524316077446, "learning_rate": 1.930543172600162e-06, - "loss": 0.1667, + "loss": 0.1663, "step": 5193 }, { "epoch": 0.7105335157318742, - "grad_norm": 1.2330674369805588, + "grad_norm": 1.1807609412142752, "learning_rate": 1.928847188362317e-06, - "loss": 0.1889, + "loss": 0.1891, "step": 5194 }, { "epoch": 0.7106703146374829, - "grad_norm": 1.2380449645939984, + "grad_norm": 1.1887378277455571, "learning_rate": 1.9271517713639015e-06, - "loss": 0.1911, + "loss": 0.1893, "step": 5195 }, { "epoch": 0.7108071135430917, - "grad_norm": 1.1704623525299709, + "grad_norm": 1.1863377052269928, "learning_rate": 1.9254569219180553e-06, - "loss": 0.1766, + "loss": 0.1822, "step": 5196 }, { "epoch": 0.7109439124487004, - "grad_norm": 1.2772219687717357, + "grad_norm": 1.2732641998889664, "learning_rate": 1.9237626403378197e-06, - "loss": 0.2263, + "loss": 0.2272, "step": 5197 }, { "epoch": 0.7110807113543092, - "grad_norm": 1.293626515289335, + "grad_norm": 1.3232580984735114, "learning_rate": 1.9220689269361225e-06, - "loss": 0.1765, + "loss": 0.1773, "step": 5198 }, { "epoch": 0.711217510259918, - "grad_norm": 1.3086161489222368, + "grad_norm": 1.2990987231632458, "learning_rate": 1.9203757820257933e-06, - "loss": 0.1988, + "loss": 0.2003, "step": 5199 }, { "epoch": 0.7113543091655267, - "grad_norm": 1.4493740458081503, + "grad_norm": 1.434668924186356, "learning_rate": 1.918683205919557e-06, - "loss": 0.2278, + "loss": 0.2257, "step": 5200 }, { "epoch": 0.7113543091655267, - "eval_loss": 0.17619028687477112, - "eval_runtime": 5.9084, - "eval_samples_per_second": 5.077, - "eval_steps_per_second": 1.354, + "eval_loss": 0.17646805942058563, + "eval_runtime": 5.9108, + "eval_samples_per_second": 5.075, + "eval_steps_per_second": 1.353, "step": 5200 }, { "epoch": 0.7114911080711355, - "grad_norm": 1.4012364256007828, + "grad_norm": 1.390032702994193, "learning_rate": 1.9169911989300266e-06, - "loss": 0.2027, + "loss": 0.2033, "step": 5201 }, { "epoch": 0.7116279069767442, - "grad_norm": 1.3696453740955543, + "grad_norm": 1.3646519885895154, "learning_rate": 1.9152997613697184e-06, - "loss": 0.2256, + "loss": 0.2268, "step": 5202 }, { "epoch": 0.711764705882353, - "grad_norm": 1.2573817781698409, + "grad_norm": 1.2460107475662467, "learning_rate": 1.913608893551036e-06, - "loss": 0.1784, + "loss": 0.1771, "step": 5203 }, { "epoch": 0.7119015047879617, - "grad_norm": 1.2850518032654934, + "grad_norm": 1.2737560595259705, "learning_rate": 1.9119185957862834e-06, "loss": 0.1752, "step": 5204 }, { "epoch": 0.7120383036935705, - "grad_norm": 1.3906983490278075, + "grad_norm": 1.3686642046445172, "learning_rate": 1.910228868387656e-06, - "loss": 0.2211, + "loss": 0.2189, "step": 5205 }, { "epoch": 0.7121751025991792, - "grad_norm": 1.06805959400468, + "grad_norm": 1.0599008795632185, "learning_rate": 1.9085397116672487e-06, - "loss": 0.1907, + "loss": 0.1918, "step": 5206 }, { "epoch": 0.712311901504788, - "grad_norm": 1.4016556590573181, + "grad_norm": 1.4009252073152219, "learning_rate": 1.9068511259370426e-06, - "loss": 0.1858, + "loss": 0.1864, "step": 5207 }, { "epoch": 0.7124487004103968, - "grad_norm": 1.187255788873262, + "grad_norm": 1.183882277076258, "learning_rate": 1.9051631115089197e-06, - "loss": 0.151, + "loss": 0.1511, "step": 5208 }, { "epoch": 0.7125854993160055, - "grad_norm": 1.5368621182545679, + "grad_norm": 1.5021580728846093, "learning_rate": 1.9034756686946571e-06, - "loss": 0.1958, + "loss": 0.1972, "step": 5209 }, { "epoch": 0.7127222982216143, - "grad_norm": 1.016833526105318, + "grad_norm": 1.0023568873904134, "learning_rate": 1.9017887978059208e-06, - "loss": 0.144, + "loss": 0.1435, "step": 5210 }, { "epoch": 0.712859097127223, - "grad_norm": 1.1024025480985715, + "grad_norm": 1.0897568726564286, "learning_rate": 1.9001024991542783e-06, - "loss": 0.1467, + "loss": 0.1471, "step": 5211 }, { "epoch": 0.7129958960328318, - "grad_norm": 1.1252546012322664, + "grad_norm": 1.1150913473452249, "learning_rate": 1.8984167730511827e-06, - "loss": 0.1563, + "loss": 0.1568, "step": 5212 }, { "epoch": 0.7131326949384404, - "grad_norm": 1.3485394718604173, + "grad_norm": 1.3314401774167197, "learning_rate": 1.896731619807991e-06, - "loss": 0.1664, + "loss": 0.1665, "step": 5213 }, { "epoch": 0.7132694938440493, - "grad_norm": 1.1509099193788292, + "grad_norm": 1.2328832459668289, "learning_rate": 1.8950470397359456e-06, - "loss": 0.1606, + "loss": 0.1613, "step": 5214 }, { "epoch": 0.713406292749658, - "grad_norm": 1.2728762534399063, + "grad_norm": 1.2679560528653158, "learning_rate": 1.8933630331461888e-06, - "loss": 0.1691, + "loss": 0.1684, "step": 5215 }, { "epoch": 0.7135430916552667, - "grad_norm": 1.40583312566215, + "grad_norm": 1.4054836226012235, "learning_rate": 1.8916796003497572e-06, - "loss": 0.1821, + "loss": 0.183, "step": 5216 }, { "epoch": 0.7136798905608756, - "grad_norm": 1.2501450000200776, + "grad_norm": 1.2449231207132476, "learning_rate": 1.8899967416575765e-06, - "loss": 0.1612, + "loss": 0.1605, "step": 5217 }, { "epoch": 0.7138166894664842, - "grad_norm": 1.086319291647032, + "grad_norm": 1.0711564303542556, "learning_rate": 1.88831445738047e-06, - "loss": 0.1595, + "loss": 0.1619, "step": 5218 }, { "epoch": 0.713953488372093, - "grad_norm": 1.3275296392391076, + "grad_norm": 1.334505409715773, "learning_rate": 1.8866327478291546e-06, - "loss": 0.2088, + "loss": 0.2114, "step": 5219 }, { "epoch": 0.7140902872777017, - "grad_norm": 1.1570333184572947, + "grad_norm": 1.1519845168409133, "learning_rate": 1.8849516133142432e-06, - "loss": 0.1802, + "loss": 0.1806, "step": 5220 }, { "epoch": 0.7142270861833105, - "grad_norm": 1.4388055381817777, + "grad_norm": 1.4163056729093353, "learning_rate": 1.8832710541462352e-06, - "loss": 0.2256, + "loss": 0.2243, "step": 5221 }, { "epoch": 0.7143638850889192, - "grad_norm": 1.367686722627366, + "grad_norm": 1.3716191729870972, "learning_rate": 1.8815910706355334e-06, - "loss": 0.2013, + "loss": 0.2014, "step": 5222 }, { "epoch": 0.714500683994528, - "grad_norm": 1.4601643688755273, + "grad_norm": 1.4496891499839386, "learning_rate": 1.8799116630924247e-06, - "loss": 0.2514, + "loss": 0.253, "step": 5223 }, { "epoch": 0.7146374829001368, - "grad_norm": 1.2965133454042983, + "grad_norm": 1.2616808587118278, "learning_rate": 1.8782328318270964e-06, - "loss": 0.1913, + "loss": 0.1905, "step": 5224 }, { "epoch": 0.7147742818057455, - "grad_norm": 1.086160711314804, + "grad_norm": 1.0716660822644797, "learning_rate": 1.876554577149629e-06, - "loss": 0.1571, + "loss": 0.1555, "step": 5225 }, { "epoch": 0.7149110807113543, - "grad_norm": 1.364139850175248, + "grad_norm": 1.3679010158138503, "learning_rate": 1.8748768993699918e-06, - "loss": 0.1625, + "loss": 0.1636, "step": 5226 }, { "epoch": 0.715047879616963, - "grad_norm": 1.2796819193683875, + "grad_norm": 1.272486931589072, "learning_rate": 1.8731997987980537e-06, - "loss": 0.1762, + "loss": 0.1759, "step": 5227 }, { "epoch": 0.7151846785225718, - "grad_norm": 1.3190240323940954, + "grad_norm": 1.300415368260709, "learning_rate": 1.8715232757435702e-06, - "loss": 0.1751, + "loss": 0.1731, "step": 5228 }, { "epoch": 0.7153214774281805, - "grad_norm": 1.2948284145027045, + "grad_norm": 1.2790086152586093, "learning_rate": 1.8698473305161967e-06, - "loss": 0.1996, + "loss": 0.1988, "step": 5229 }, { "epoch": 0.7154582763337893, - "grad_norm": 1.2693766962374706, + "grad_norm": 1.2567820665653655, "learning_rate": 1.868171963425477e-06, - "loss": 0.1757, + "loss": 0.1759, "step": 5230 }, { "epoch": 0.7155950752393981, - "grad_norm": 1.237193216016648, + "grad_norm": 1.2228267265063166, "learning_rate": 1.8664971747808502e-06, - "loss": 0.1903, + "loss": 0.1908, "step": 5231 }, { "epoch": 0.7157318741450068, - "grad_norm": 1.1194995612605327, + "grad_norm": 1.1267708643805523, "learning_rate": 1.864822964891651e-06, - "loss": 0.1675, + "loss": 0.1677, "step": 5232 }, { "epoch": 0.7158686730506156, - "grad_norm": 1.0958362736619824, + "grad_norm": 1.0887777406374413, "learning_rate": 1.863149334067101e-06, - "loss": 0.1432, + "loss": 0.1446, "step": 5233 }, { "epoch": 0.7160054719562243, - "grad_norm": 1.3455190786585363, + "grad_norm": 1.3359084838608233, "learning_rate": 1.8614762826163196e-06, - "loss": 0.1796, + "loss": 0.1815, "step": 5234 }, { "epoch": 0.7161422708618331, - "grad_norm": 1.6156747296962406, + "grad_norm": 1.582713126848437, "learning_rate": 1.8598038108483186e-06, - "loss": 0.2057, + "loss": 0.2059, "step": 5235 }, { "epoch": 0.7162790697674418, - "grad_norm": 0.9696031600114453, + "grad_norm": 0.9578188446474585, "learning_rate": 1.8581319190720038e-06, - "loss": 0.138, + "loss": 0.1372, "step": 5236 }, { "epoch": 0.7164158686730506, - "grad_norm": 1.091600898607706, + "grad_norm": 1.093086443289162, "learning_rate": 1.8564606075961694e-06, - "loss": 0.1518, + "loss": 0.15, "step": 5237 }, { "epoch": 0.7165526675786593, - "grad_norm": 1.302812707941037, + "grad_norm": 1.280843176748792, "learning_rate": 1.8547898767295075e-06, - "loss": 0.2329, + "loss": 0.2315, "step": 5238 }, { "epoch": 0.7166894664842681, - "grad_norm": 1.0762244597168376, + "grad_norm": 1.07591872053208, "learning_rate": 1.8531197267805983e-06, - "loss": 0.1579, + "loss": 0.1591, "step": 5239 }, { "epoch": 0.7168262653898769, - "grad_norm": 1.028659302738743, + "grad_norm": 1.0250770474570678, "learning_rate": 1.851450158057918e-06, - "loss": 0.1578, + "loss": 0.1588, "step": 5240 }, { "epoch": 0.7169630642954856, - "grad_norm": 1.0510915888578323, + "grad_norm": 1.0199956961157934, "learning_rate": 1.8497811708698376e-06, - "loss": 0.1534, + "loss": 0.1522, "step": 5241 }, { "epoch": 0.7170998632010944, - "grad_norm": 1.0237740265528843, + "grad_norm": 1.008487267724842, "learning_rate": 1.8481127655246128e-06, - "loss": 0.1638, + "loss": 0.164, "step": 5242 }, { "epoch": 0.7172366621067031, - "grad_norm": 1.249029141310959, + "grad_norm": 1.2364391100403254, "learning_rate": 1.8464449423304009e-06, - "loss": 0.1827, + "loss": 0.1821, "step": 5243 }, { "epoch": 0.7173734610123119, - "grad_norm": 1.3643396320591927, + "grad_norm": 1.320833882342954, "learning_rate": 1.844777701595244e-06, - "loss": 0.2095, + "loss": 0.2085, "step": 5244 }, { "epoch": 0.7175102599179206, - "grad_norm": 1.274716398520943, + "grad_norm": 1.2809214922877514, "learning_rate": 1.8431110436270832e-06, - "loss": 0.1919, + "loss": 0.1933, "step": 5245 }, { "epoch": 0.7176470588235294, - "grad_norm": 1.0198643556454532, + "grad_norm": 1.0183941307433328, "learning_rate": 1.8414449687337467e-06, - "loss": 0.1796, + "loss": 0.1806, "step": 5246 }, { "epoch": 0.7177838577291382, - "grad_norm": 1.3192231321959356, + "grad_norm": 1.3187814606782875, "learning_rate": 1.8397794772229566e-06, - "loss": 0.1992, + "loss": 0.1978, "step": 5247 }, { "epoch": 0.7179206566347469, - "grad_norm": 1.0964588530729367, + "grad_norm": 1.0889684412907057, "learning_rate": 1.83811456940233e-06, "loss": 0.1632, "step": 5248 }, { "epoch": 0.7180574555403557, - "grad_norm": 1.3094809303255361, + "grad_norm": 1.3121108597620428, "learning_rate": 1.836450245579373e-06, - "loss": 0.2142, + "loss": 0.2157, "step": 5249 }, { "epoch": 0.7181942544459644, - "grad_norm": 1.3480888005252751, + "grad_norm": 1.3469271561086742, "learning_rate": 1.8347865060614867e-06, - "loss": 0.1778, + "loss": 0.1775, "step": 5250 }, { "epoch": 0.7183310533515732, - "grad_norm": 1.6143617056990502, + "grad_norm": 1.6348979714302643, "learning_rate": 1.8331233511559586e-06, - "loss": 0.2063, + "loss": 0.2064, "step": 5251 }, { "epoch": 0.7184678522571819, - "grad_norm": 1.1970649297207063, + "grad_norm": 1.205461554177123, "learning_rate": 1.8314607811699763e-06, - "loss": 0.1754, + "loss": 0.1772, "step": 5252 }, { "epoch": 0.7186046511627907, - "grad_norm": 1.2862226921688493, + "grad_norm": 1.2771796630672232, "learning_rate": 1.8297987964106118e-06, - "loss": 0.2064, + "loss": 0.2072, "step": 5253 }, { "epoch": 0.7187414500683994, - "grad_norm": 1.4529664291163114, + "grad_norm": 1.4347728081093294, "learning_rate": 1.8281373971848348e-06, - "loss": 0.2005, + "loss": 0.2045, "step": 5254 }, { "epoch": 0.7188782489740082, - "grad_norm": 1.2593324146559624, + "grad_norm": 1.255655623686189, "learning_rate": 1.8264765837995013e-06, - "loss": 0.1698, + "loss": 0.1688, "step": 5255 }, { "epoch": 0.719015047879617, - "grad_norm": 1.1553577989449295, + "grad_norm": 1.1309702293483859, "learning_rate": 1.8248163565613642e-06, - "loss": 0.1805, + "loss": 0.1797, "step": 5256 }, { "epoch": 0.7191518467852257, - "grad_norm": 1.5473070972305867, + "grad_norm": 1.4977792190770203, "learning_rate": 1.823156715777068e-06, - "loss": 0.2297, + "loss": 0.2319, "step": 5257 }, { "epoch": 0.7192886456908345, - "grad_norm": 1.2939709489348563, + "grad_norm": 1.3094676470771152, "learning_rate": 1.8214976617531427e-06, - "loss": 0.2033, + "loss": 0.2055, "step": 5258 }, { "epoch": 0.7194254445964432, - "grad_norm": 1.2206881577555069, + "grad_norm": 1.19218731633164, "learning_rate": 1.8198391947960187e-06, - "loss": 0.1848, + "loss": 0.183, "step": 5259 }, { "epoch": 0.719562243502052, - "grad_norm": 1.264426928045833, + "grad_norm": 1.2528630262844112, "learning_rate": 1.8181813152120093e-06, - "loss": 0.1833, + "loss": 0.1824, "step": 5260 }, { "epoch": 0.7196990424076607, - "grad_norm": 1.4530438119962668, + "grad_norm": 1.4590648201703753, "learning_rate": 1.8165240233073272e-06, - "loss": 0.2083, + "loss": 0.2099, "step": 5261 }, { "epoch": 0.7198358413132695, - "grad_norm": 1.2794168465752451, + "grad_norm": 1.2627997087790612, "learning_rate": 1.8148673193880695e-06, - "loss": 0.1724, + "loss": 0.1706, "step": 5262 }, { "epoch": 0.7199726402188783, - "grad_norm": 1.2820846622274054, + "grad_norm": 1.2645125670041466, "learning_rate": 1.81321120376023e-06, - "loss": 0.1731, + "loss": 0.1717, "step": 5263 }, { "epoch": 0.720109439124487, - "grad_norm": 1.3964066375701538, + "grad_norm": 1.384914973786164, "learning_rate": 1.8115556767296917e-06, - "loss": 0.178, + "loss": 0.1783, "step": 5264 }, { "epoch": 0.7202462380300958, - "grad_norm": 1.0784322826956698, + "grad_norm": 1.0656594007910742, "learning_rate": 1.8099007386022287e-06, - "loss": 0.1629, + "loss": 0.1628, "step": 5265 }, { "epoch": 0.7203830369357045, - "grad_norm": 1.345541915253626, + "grad_norm": 1.326650797062521, "learning_rate": 1.808246389683509e-06, - "loss": 0.1881, + "loss": 0.1866, "step": 5266 }, { "epoch": 0.7205198358413133, - "grad_norm": 1.2585526319622298, + "grad_norm": 1.220126693382854, "learning_rate": 1.8065926302790859e-06, - "loss": 0.2013, + "loss": 0.2001, "step": 5267 }, { "epoch": 0.720656634746922, - "grad_norm": 1.3071744455803573, + "grad_norm": 1.3043200240664328, "learning_rate": 1.804939460694411e-06, - "loss": 0.207, + "loss": 0.2101, "step": 5268 }, { "epoch": 0.7207934336525308, - "grad_norm": 1.4718230757241242, + "grad_norm": 1.4635578887537546, "learning_rate": 1.8032868812348192e-06, - "loss": 0.2216, + "loss": 0.222, "step": 5269 }, { "epoch": 0.7209302325581395, - "grad_norm": 1.1221196478024593, + "grad_norm": 1.1077652753482605, "learning_rate": 1.8016348922055448e-06, - "loss": 0.1623, + "loss": 0.1628, "step": 5270 }, { "epoch": 0.7210670314637483, - "grad_norm": 1.2423998917071932, + "grad_norm": 1.248247996429645, "learning_rate": 1.7999834939117055e-06, - "loss": 0.1786, + "loss": 0.1773, "step": 5271 }, { "epoch": 0.7212038303693571, - "grad_norm": 0.9449344835976353, + "grad_norm": 0.9380315844655583, "learning_rate": 1.7983326866583144e-06, - "loss": 0.119, + "loss": 0.1195, "step": 5272 }, { "epoch": 0.7213406292749658, - "grad_norm": 1.201489594826588, + "grad_norm": 1.2073574905417597, "learning_rate": 1.7966824707502762e-06, - "loss": 0.1712, + "loss": 0.1718, "step": 5273 }, { "epoch": 0.7214774281805746, - "grad_norm": 1.1736798040742698, + "grad_norm": 1.1616030215895738, "learning_rate": 1.795032846492381e-06, - "loss": 0.1646, + "loss": 0.1634, "step": 5274 }, { "epoch": 0.7216142270861833, - "grad_norm": 1.239442656155344, + "grad_norm": 1.2339739020375515, "learning_rate": 1.7933838141893167e-06, - "loss": 0.1785, + "loss": 0.1798, "step": 5275 }, { "epoch": 0.7217510259917921, - "grad_norm": 1.3070468078863395, + "grad_norm": 1.3083673268726905, "learning_rate": 1.7917353741456544e-06, - "loss": 0.1753, + "loss": 0.1766, "step": 5276 }, { "epoch": 0.7218878248974008, - "grad_norm": 1.079630541368585, + "grad_norm": 1.0626939414644632, "learning_rate": 1.7900875266658614e-06, - "loss": 0.1459, + "loss": 0.1447, "step": 5277 }, { "epoch": 0.7220246238030096, - "grad_norm": 1.0927390939616406, + "grad_norm": 1.08940388172255, "learning_rate": 1.7884402720542943e-06, - "loss": 0.1978, + "loss": 0.1976, "step": 5278 }, { "epoch": 0.7221614227086184, - "grad_norm": 1.1268108669646844, + "grad_norm": 1.1109395031464493, "learning_rate": 1.7867936106151989e-06, - "loss": 0.1566, + "loss": 0.1558, "step": 5279 }, { "epoch": 0.7222982216142271, - "grad_norm": 1.1605634261108717, + "grad_norm": 1.1479941032202472, "learning_rate": 1.7851475426527142e-06, - "loss": 0.1627, + "loss": 0.1639, "step": 5280 }, { "epoch": 0.7224350205198359, - "grad_norm": 1.1644600631480482, + "grad_norm": 1.1432935735803804, "learning_rate": 1.7835020684708648e-06, - "loss": 0.1498, + "loss": 0.1483, "step": 5281 }, { "epoch": 0.7225718194254446, - "grad_norm": 1.2230163145604358, + "grad_norm": 1.20789954539076, "learning_rate": 1.7818571883735713e-06, - "loss": 0.175, + "loss": 0.1749, "step": 5282 }, { "epoch": 0.7227086183310534, - "grad_norm": 1.4405490461985326, + "grad_norm": 1.411139776321018, "learning_rate": 1.7802129026646387e-06, - "loss": 0.2333, + "loss": 0.234, "step": 5283 }, { "epoch": 0.7228454172366621, - "grad_norm": 1.2807669812262734, + "grad_norm": 1.2623436632825822, "learning_rate": 1.7785692116477683e-06, - "loss": 0.2027, + "loss": 0.2009, "step": 5284 }, { "epoch": 0.7229822161422709, - "grad_norm": 1.422617521454685, + "grad_norm": 1.4028587689776535, "learning_rate": 1.776926115626545e-06, - "loss": 0.1979, + "loss": 0.198, "step": 5285 }, { "epoch": 0.7231190150478796, - "grad_norm": 1.2820608874554054, + "grad_norm": 1.271420656433411, "learning_rate": 1.7752836149044506e-06, - "loss": 0.1964, + "loss": 0.1958, "step": 5286 }, { "epoch": 0.7232558139534884, - "grad_norm": 1.2394758691122767, + "grad_norm": 1.2092160494313897, "learning_rate": 1.7736417097848508e-06, - "loss": 0.1735, + "loss": 0.1725, "step": 5287 }, { "epoch": 0.7233926128590972, - "grad_norm": 1.1167140862326226, + "grad_norm": 1.1097714074561553, "learning_rate": 1.7720004005710051e-06, - "loss": 0.1852, + "loss": 0.1838, "step": 5288 }, { "epoch": 0.7235294117647059, - "grad_norm": 1.0832093723154872, + "grad_norm": 1.072635349546464, "learning_rate": 1.7703596875660645e-06, - "loss": 0.1524, + "loss": 0.1534, "step": 5289 }, { "epoch": 0.7236662106703147, - "grad_norm": 1.1318388181525787, + "grad_norm": 1.122186609101623, "learning_rate": 1.7687195710730636e-06, - "loss": 0.1742, + "loss": 0.1747, "step": 5290 }, { "epoch": 0.7238030095759234, - "grad_norm": 1.2432011113276522, + "grad_norm": 1.228982225344912, "learning_rate": 1.7670800513949327e-06, - "loss": 0.1975, + "loss": 0.1976, "step": 5291 }, { "epoch": 0.7239398084815322, - "grad_norm": 1.0784553617485775, + "grad_norm": 1.0508649507634191, "learning_rate": 1.7654411288344903e-06, - "loss": 0.1671, + "loss": 0.1659, "step": 5292 }, { "epoch": 0.7240766073871409, - "grad_norm": 1.324337117004219, + "grad_norm": 1.3072613805073685, "learning_rate": 1.7638028036944415e-06, - "loss": 0.2076, + "loss": 0.2067, "step": 5293 }, { "epoch": 0.7242134062927497, - "grad_norm": 1.251873155168911, + "grad_norm": 1.2420470543115105, "learning_rate": 1.7621650762773857e-06, - "loss": 0.1721, + "loss": 0.1719, "step": 5294 }, { "epoch": 0.7243502051983585, - "grad_norm": 1.1608046865028936, + "grad_norm": 1.165968011944183, "learning_rate": 1.760527946885809e-06, - "loss": 0.1608, + "loss": 0.1618, "step": 5295 }, { "epoch": 0.7244870041039672, - "grad_norm": 1.3091922021260172, + "grad_norm": 1.293609283047762, "learning_rate": 1.7588914158220898e-06, - "loss": 0.1805, + "loss": 0.1811, "step": 5296 }, { "epoch": 0.724623803009576, - "grad_norm": 1.409917318800876, + "grad_norm": 1.425188762755506, "learning_rate": 1.7572554833884909e-06, - "loss": 0.2311, + "loss": 0.2345, "step": 5297 }, { "epoch": 0.7247606019151847, - "grad_norm": 1.326907089298024, + "grad_norm": 1.3069412992150164, "learning_rate": 1.7556201498871706e-06, - "loss": 0.2103, + "loss": 0.209, "step": 5298 }, { "epoch": 0.7248974008207935, - "grad_norm": 1.1271180311314497, + "grad_norm": 1.1244521452802223, "learning_rate": 1.7539854156201708e-06, - "loss": 0.1627, + "loss": 0.1632, "step": 5299 }, { "epoch": 0.7250341997264022, - "grad_norm": 1.3956561272456893, + "grad_norm": 1.3786928559848062, "learning_rate": 1.7523512808894289e-06, - "loss": 0.2075, + "loss": 0.2059, "step": 5300 }, { "epoch": 0.7250341997264022, - "eval_loss": 0.17525255680084229, - "eval_runtime": 5.9027, - "eval_samples_per_second": 5.082, - "eval_steps_per_second": 1.355, + "eval_loss": 0.17567278444766998, + "eval_runtime": 5.9267, + "eval_samples_per_second": 5.062, + "eval_steps_per_second": 1.35, "step": 5300 }, { "epoch": 0.725170998632011, - "grad_norm": 1.423551622517622, + "grad_norm": 1.4074950207283674, "learning_rate": 1.750717745996765e-06, - "loss": 0.2271, + "loss": 0.2282, "step": 5301 }, { "epoch": 0.7253077975376196, - "grad_norm": 1.0336104723109, + "grad_norm": 1.0193784308612361, "learning_rate": 1.7490848112438946e-06, - "loss": 0.1315, + "loss": 0.1311, "step": 5302 }, { "epoch": 0.7254445964432285, - "grad_norm": 1.2466162989351521, + "grad_norm": 1.233729830029185, "learning_rate": 1.7474524769324164e-06, - "loss": 0.1378, + "loss": 0.1375, "step": 5303 }, { "epoch": 0.7255813953488373, - "grad_norm": 1.179570688813979, + "grad_norm": 1.1733744010929785, "learning_rate": 1.7458207433638225e-06, - "loss": 0.1531, + "loss": 0.1536, "step": 5304 }, { "epoch": 0.725718194254446, - "grad_norm": 1.0624981472017345, + "grad_norm": 1.0525180042065796, "learning_rate": 1.7441896108394945e-06, - "loss": 0.1611, + "loss": 0.1596, "step": 5305 }, { "epoch": 0.7258549931600548, - "grad_norm": 1.625721901265209, + "grad_norm": 1.5922608031296888, "learning_rate": 1.7425590796606984e-06, - "loss": 0.2122, + "loss": 0.2134, "step": 5306 }, { "epoch": 0.7259917920656634, - "grad_norm": 1.2404065942365046, + "grad_norm": 1.2385007371510954, "learning_rate": 1.7409291501285936e-06, - "loss": 0.162, + "loss": 0.1625, "step": 5307 }, { "epoch": 0.7261285909712722, - "grad_norm": 1.5039134461748873, + "grad_norm": 1.4760483255962973, "learning_rate": 1.7392998225442265e-06, - "loss": 0.2145, + "loss": 0.2136, "step": 5308 }, { "epoch": 0.7262653898768809, - "grad_norm": 1.2325854867260888, + "grad_norm": 1.2499235774701725, "learning_rate": 1.7376710972085342e-06, - "loss": 0.1737, + "loss": 0.1745, "step": 5309 }, { "epoch": 0.7264021887824897, - "grad_norm": 1.2856658680243427, + "grad_norm": 1.282140071850903, "learning_rate": 1.7360429744223372e-06, - "loss": 0.1711, + "loss": 0.1733, "step": 5310 }, { "epoch": 0.7265389876880985, - "grad_norm": 1.1817111263883904, + "grad_norm": 1.229094514237231, "learning_rate": 1.734415454486351e-06, - "loss": 0.1651, + "loss": 0.1659, "step": 5311 }, { "epoch": 0.7266757865937072, - "grad_norm": 1.5305863413758432, + "grad_norm": 1.5451009445633395, "learning_rate": 1.7327885377011787e-06, - "loss": 0.1644, + "loss": 0.1681, "step": 5312 }, { "epoch": 0.726812585499316, - "grad_norm": 1.2343679300912591, + "grad_norm": 1.2117649785239142, "learning_rate": 1.7311622243673072e-06, - "loss": 0.1824, + "loss": 0.1841, "step": 5313 }, { "epoch": 0.7269493844049247, - "grad_norm": 1.2070212319276734, + "grad_norm": 1.2105559430030866, "learning_rate": 1.7295365147851178e-06, - "loss": 0.192, + "loss": 0.1936, "step": 5314 }, { "epoch": 0.7270861833105335, - "grad_norm": 1.271367313514183, + "grad_norm": 1.245862463820743, "learning_rate": 1.727911409254875e-06, - "loss": 0.1836, + "loss": 0.1838, "step": 5315 }, { "epoch": 0.7272229822161422, - "grad_norm": 1.1146633563117458, + "grad_norm": 1.1025484087315918, "learning_rate": 1.726286908076738e-06, - "loss": 0.1567, + "loss": 0.1561, "step": 5316 }, { "epoch": 0.727359781121751, - "grad_norm": 1.3114790820244275, + "grad_norm": 1.3058231237838285, "learning_rate": 1.7246630115507473e-06, - "loss": 0.1955, + "loss": 0.1952, "step": 5317 }, { "epoch": 0.7274965800273597, - "grad_norm": 1.0961340064514145, + "grad_norm": 1.080183272382098, "learning_rate": 1.7230397199768385e-06, - "loss": 0.1833, + "loss": 0.1827, "step": 5318 }, { "epoch": 0.7276333789329685, - "grad_norm": 1.2575343157016672, + "grad_norm": 1.2556310657818779, "learning_rate": 1.7214170336548287e-06, - "loss": 0.1827, + "loss": 0.1825, "step": 5319 }, { "epoch": 0.7277701778385773, - "grad_norm": 1.3852056924713718, + "grad_norm": 1.3468506916353071, "learning_rate": 1.7197949528844288e-06, - "loss": 0.182, + "loss": 0.1806, "step": 5320 }, { "epoch": 0.727906976744186, - "grad_norm": 1.259339284986606, + "grad_norm": 1.253317407121101, "learning_rate": 1.718173477965236e-06, - "loss": 0.183, + "loss": 0.1815, "step": 5321 }, { "epoch": 0.7280437756497948, - "grad_norm": 1.3555732794847353, + "grad_norm": 1.3317980385230397, "learning_rate": 1.7165526091967344e-06, - "loss": 0.1926, + "loss": 0.1909, "step": 5322 }, { "epoch": 0.7281805745554035, - "grad_norm": 1.3590965489333557, + "grad_norm": 1.333887465085124, "learning_rate": 1.7149323468783002e-06, - "loss": 0.1869, + "loss": 0.1857, "step": 5323 }, { "epoch": 0.7283173734610123, - "grad_norm": 1.5472696735104008, + "grad_norm": 1.5258736198429725, "learning_rate": 1.7133126913091903e-06, - "loss": 0.198, + "loss": 0.1963, "step": 5324 }, { "epoch": 0.728454172366621, - "grad_norm": 1.2108685750091157, + "grad_norm": 1.1995483898097858, "learning_rate": 1.7116936427885573e-06, - "loss": 0.1711, + "loss": 0.1716, "step": 5325 }, { "epoch": 0.7285909712722298, - "grad_norm": 1.2218341227393081, + "grad_norm": 1.221802847466682, "learning_rate": 1.7100752016154347e-06, - "loss": 0.182, + "loss": 0.1822, "step": 5326 }, { "epoch": 0.7287277701778386, - "grad_norm": 1.2588405592105925, + "grad_norm": 1.21498715934082, "learning_rate": 1.7084573680887495e-06, - "loss": 0.169, + "loss": 0.1688, "step": 5327 }, { "epoch": 0.7288645690834473, - "grad_norm": 1.1429331973517138, + "grad_norm": 1.1428841028499417, "learning_rate": 1.706840142507315e-06, - "loss": 0.161, + "loss": 0.1631, "step": 5328 }, { "epoch": 0.7290013679890561, - "grad_norm": 1.3299461949919191, + "grad_norm": 1.3254050515713574, "learning_rate": 1.7052235251698285e-06, - "loss": 0.1914, + "loss": 0.1911, "step": 5329 }, { "epoch": 0.7291381668946648, - "grad_norm": 1.1550394813927591, + "grad_norm": 1.1384814178418752, "learning_rate": 1.7036075163748816e-06, - "loss": 0.1888, + "loss": 0.1897, "step": 5330 }, { "epoch": 0.7292749658002736, - "grad_norm": 1.2950409796225213, + "grad_norm": 1.2733185684689068, "learning_rate": 1.7019921164209463e-06, - "loss": 0.1958, + "loss": 0.195, "step": 5331 }, { "epoch": 0.7294117647058823, - "grad_norm": 1.1856464084333782, + "grad_norm": 1.180046501594538, "learning_rate": 1.7003773256063882e-06, - "loss": 0.168, + "loss": 0.1673, "step": 5332 }, { "epoch": 0.7295485636114911, - "grad_norm": 1.515601861835033, + "grad_norm": 1.485227567793618, "learning_rate": 1.698763144229456e-06, - "loss": 0.1768, + "loss": 0.1786, "step": 5333 }, { "epoch": 0.7296853625170998, - "grad_norm": 1.2500785119818694, + "grad_norm": 1.2457777703903914, "learning_rate": 1.6971495725882896e-06, - "loss": 0.2037, + "loss": 0.2028, "step": 5334 }, { "epoch": 0.7298221614227086, - "grad_norm": 1.315275895794454, + "grad_norm": 1.2536581012244687, "learning_rate": 1.6955366109809124e-06, - "loss": 0.1783, + "loss": 0.1757, "step": 5335 }, { "epoch": 0.7299589603283174, - "grad_norm": 1.1535776174969936, + "grad_norm": 1.127390708485215, "learning_rate": 1.6939242597052375e-06, - "loss": 0.1562, + "loss": 0.1552, "step": 5336 }, { "epoch": 0.7300957592339261, - "grad_norm": 1.267440340912529, + "grad_norm": 1.2426674292164421, "learning_rate": 1.6923125190590656e-06, - "loss": 0.1832, + "loss": 0.1806, "step": 5337 }, { "epoch": 0.7302325581395349, - "grad_norm": 1.3911056611536456, + "grad_norm": 1.3819348455362863, "learning_rate": 1.6907013893400838e-06, - "loss": 0.2143, + "loss": 0.2136, "step": 5338 }, { "epoch": 0.7303693570451436, - "grad_norm": 1.196343864392962, + "grad_norm": 1.1883220893379616, "learning_rate": 1.6890908708458676e-06, - "loss": 0.1633, + "loss": 0.1662, "step": 5339 }, { "epoch": 0.7305061559507524, - "grad_norm": 1.0357895345401549, + "grad_norm": 1.0298441751533467, "learning_rate": 1.6874809638738754e-06, - "loss": 0.1415, + "loss": 0.1423, "step": 5340 }, { "epoch": 0.7306429548563611, - "grad_norm": 1.240248290175734, + "grad_norm": 1.2299833251825616, "learning_rate": 1.6858716687214598e-06, - "loss": 0.1645, + "loss": 0.1631, "step": 5341 }, { "epoch": 0.7307797537619699, - "grad_norm": 1.203127345882005, + "grad_norm": 1.1962473720070896, "learning_rate": 1.6842629856858517e-06, - "loss": 0.1874, + "loss": 0.1871, "step": 5342 }, { "epoch": 0.7309165526675787, - "grad_norm": 1.1703941523656185, + "grad_norm": 1.1558638663953427, "learning_rate": 1.6826549150641763e-06, - "loss": 0.1624, + "loss": 0.1625, "step": 5343 }, { "epoch": 0.7310533515731874, - "grad_norm": 1.2544905684191816, + "grad_norm": 1.2325702103616436, "learning_rate": 1.681047457153444e-06, - "loss": 0.1713, + "loss": 0.1698, "step": 5344 }, { "epoch": 0.7311901504787962, - "grad_norm": 1.1565717455500923, + "grad_norm": 1.140073205574924, "learning_rate": 1.679440612250548e-06, - "loss": 0.1441, + "loss": 0.145, "step": 5345 }, { "epoch": 0.7313269493844049, - "grad_norm": 1.083993087474635, + "grad_norm": 1.0665152296387534, "learning_rate": 1.677834380652274e-06, - "loss": 0.1592, + "loss": 0.1595, "step": 5346 }, { "epoch": 0.7314637482900137, - "grad_norm": 1.357842303836658, + "grad_norm": 1.340779233580686, "learning_rate": 1.6762287626552891e-06, - "loss": 0.2051, + "loss": 0.203, "step": 5347 }, { "epoch": 0.7316005471956224, - "grad_norm": 0.8644595054667695, + "grad_norm": 0.8568555027313428, "learning_rate": 1.6746237585561525e-06, - "loss": 0.1243, + "loss": 0.1255, "step": 5348 }, { "epoch": 0.7317373461012312, - "grad_norm": 1.0881347709040872, + "grad_norm": 1.0764682779598465, "learning_rate": 1.673019368651304e-06, - "loss": 0.1464, + "loss": 0.1467, "step": 5349 }, { "epoch": 0.7318741450068399, - "grad_norm": 1.3028512512622161, + "grad_norm": 1.308072891516971, "learning_rate": 1.6714155932370745e-06, - "loss": 0.1762, + "loss": 0.1759, "step": 5350 }, { "epoch": 0.7320109439124487, - "grad_norm": 1.4030323351688883, + "grad_norm": 1.3845694401122104, "learning_rate": 1.6698124326096804e-06, - "loss": 0.2433, + "loss": 0.2438, "step": 5351 }, { "epoch": 0.7321477428180575, - "grad_norm": 1.3331265696465049, + "grad_norm": 1.307697758918896, "learning_rate": 1.6682098870652236e-06, - "loss": 0.1816, + "loss": 0.1801, "step": 5352 }, { "epoch": 0.7322845417236662, - "grad_norm": 1.2713735560679136, + "grad_norm": 1.2661107081051257, "learning_rate": 1.6666079568996952e-06, - "loss": 0.1654, + "loss": 0.1659, "step": 5353 }, { "epoch": 0.732421340629275, - "grad_norm": 1.0897321057988707, + "grad_norm": 1.081013365126467, "learning_rate": 1.6650066424089673e-06, - "loss": 0.1497, + "loss": 0.1491, "step": 5354 }, { "epoch": 0.7325581395348837, - "grad_norm": 1.2317646486780953, + "grad_norm": 1.2267484476891077, "learning_rate": 1.6634059438888034e-06, - "loss": 0.1928, + "loss": 0.1938, "step": 5355 }, { "epoch": 0.7326949384404925, - "grad_norm": 1.2446418886070565, + "grad_norm": 1.2425417770935592, "learning_rate": 1.6618058616348492e-06, - "loss": 0.204, + "loss": 0.2036, "step": 5356 }, { "epoch": 0.7328317373461012, - "grad_norm": 1.3427925479245784, + "grad_norm": 1.3287834256151396, "learning_rate": 1.6602063959426418e-06, - "loss": 0.1691, + "loss": 0.169, "step": 5357 }, { "epoch": 0.73296853625171, - "grad_norm": 1.2630888737549932, + "grad_norm": 1.2865176695439982, "learning_rate": 1.6586075471075974e-06, - "loss": 0.141, + "loss": 0.143, "step": 5358 }, { "epoch": 0.7331053351573188, - "grad_norm": 1.0762015580387323, + "grad_norm": 1.071607431622756, "learning_rate": 1.6570093154250238e-06, - "loss": 0.1735, + "loss": 0.1739, "step": 5359 }, { "epoch": 0.7332421340629275, - "grad_norm": 1.5193173308246437, + "grad_norm": 1.4914446681144424, "learning_rate": 1.655411701190115e-06, - "loss": 0.2177, + "loss": 0.2162, "step": 5360 }, { "epoch": 0.7333789329685363, - "grad_norm": 1.1254817322736965, + "grad_norm": 1.1069583648630195, "learning_rate": 1.6538147046979453e-06, - "loss": 0.1402, + "loss": 0.1381, "step": 5361 }, { "epoch": 0.733515731874145, - "grad_norm": 1.4611001830603425, + "grad_norm": 1.4739043676449228, "learning_rate": 1.6522183262434826e-06, - "loss": 0.2004, + "loss": 0.2008, "step": 5362 }, { "epoch": 0.7336525307797538, - "grad_norm": 1.2984510405072724, + "grad_norm": 1.2895847631915571, "learning_rate": 1.650622566121573e-06, - "loss": 0.2072, + "loss": 0.2084, "step": 5363 }, { "epoch": 0.7337893296853625, - "grad_norm": 1.3246052716589118, + "grad_norm": 1.324630929701794, "learning_rate": 1.6490274246269532e-06, - "loss": 0.1763, + "loss": 0.1785, "step": 5364 }, { "epoch": 0.7339261285909713, - "grad_norm": 1.2045345751757894, + "grad_norm": 1.1710174654647572, "learning_rate": 1.6474329020542478e-06, - "loss": 0.1796, + "loss": 0.1762, "step": 5365 }, { "epoch": 0.73406292749658, - "grad_norm": 1.2794467723552811, + "grad_norm": 1.27259037292033, "learning_rate": 1.645838998697959e-06, "loss": 0.1659, "step": 5366 }, { "epoch": 0.7341997264021888, - "grad_norm": 1.4503185114305226, + "grad_norm": 1.4053332264590375, "learning_rate": 1.6442457148524816e-06, - "loss": 0.1971, + "loss": 0.198, "step": 5367 }, { "epoch": 0.7343365253077976, - "grad_norm": 1.169802275743989, + "grad_norm": 1.1522559750160868, "learning_rate": 1.6426530508120942e-06, - "loss": 0.1442, + "loss": 0.1443, "step": 5368 }, { "epoch": 0.7344733242134063, - "grad_norm": 1.370265875127392, + "grad_norm": 1.3482638924396917, "learning_rate": 1.6410610068709615e-06, - "loss": 0.1667, + "loss": 0.1639, "step": 5369 }, { "epoch": 0.7346101231190151, - "grad_norm": 1.4239133033012006, + "grad_norm": 1.3948493652000666, "learning_rate": 1.63946958332313e-06, - "loss": 0.1645, + "loss": 0.1637, "step": 5370 }, { "epoch": 0.7347469220246238, - "grad_norm": 1.1214139672480947, + "grad_norm": 1.1103356908205138, "learning_rate": 1.6378787804625373e-06, - "loss": 0.1559, + "loss": 0.1574, "step": 5371 }, { "epoch": 0.7348837209302326, - "grad_norm": 1.268090817442454, + "grad_norm": 1.2587169594239778, "learning_rate": 1.6362885985830001e-06, - "loss": 0.1901, + "loss": 0.1916, "step": 5372 }, { "epoch": 0.7350205198358413, - "grad_norm": 1.103147881971632, + "grad_norm": 1.0994944476461435, "learning_rate": 1.6346990379782273e-06, - "loss": 0.1867, + "loss": 0.1868, "step": 5373 }, { "epoch": 0.7351573187414501, - "grad_norm": 1.428479296427613, + "grad_norm": 1.4240810412488172, "learning_rate": 1.6331100989418065e-06, - "loss": 0.1803, + "loss": 0.1801, "step": 5374 }, { "epoch": 0.7352941176470589, - "grad_norm": 1.264385959465371, + "grad_norm": 1.250744881848776, "learning_rate": 1.6315217817672142e-06, - "loss": 0.1977, + "loss": 0.1966, "step": 5375 }, { "epoch": 0.7354309165526676, - "grad_norm": 1.4382275215047249, + "grad_norm": 1.4103190675874333, "learning_rate": 1.629934086747813e-06, - "loss": 0.1926, + "loss": 0.1936, "step": 5376 }, { "epoch": 0.7355677154582764, - "grad_norm": 1.557734357553442, + "grad_norm": 1.5434184342352713, "learning_rate": 1.6283470141768459e-06, - "loss": 0.2075, + "loss": 0.2078, "step": 5377 }, { "epoch": 0.7357045143638851, - "grad_norm": 1.375493418277305, + "grad_norm": 1.3719216434831132, "learning_rate": 1.6267605643474478e-06, - "loss": 0.1864, + "loss": 0.1862, "step": 5378 }, { "epoch": 0.7358413132694939, - "grad_norm": 1.1338133335648635, + "grad_norm": 1.1205120474622758, "learning_rate": 1.62517473755263e-06, - "loss": 0.158, + "loss": 0.1582, "step": 5379 }, { "epoch": 0.7359781121751026, - "grad_norm": 1.0902890146277884, + "grad_norm": 1.0832877395955547, "learning_rate": 1.6235895340852964e-06, - "loss": 0.1804, + "loss": 0.1819, "step": 5380 }, { "epoch": 0.7361149110807114, - "grad_norm": 1.2270156542754438, + "grad_norm": 1.2088136452404257, "learning_rate": 1.6220049542382321e-06, "loss": 0.2059, "step": 5381 }, { "epoch": 0.7362517099863201, - "grad_norm": 1.0948161597851238, + "grad_norm": 1.0884450347372574, "learning_rate": 1.6204209983041097e-06, - "loss": 0.1609, + "loss": 0.1629, "step": 5382 }, { "epoch": 0.7363885088919289, - "grad_norm": 1.1999895988243268, + "grad_norm": 1.2095460237239095, "learning_rate": 1.6188376665754813e-06, - "loss": 0.1582, + "loss": 0.1595, "step": 5383 }, { "epoch": 0.7365253077975377, - "grad_norm": 1.286827127362891, + "grad_norm": 1.286702617212767, "learning_rate": 1.6172549593447879e-06, - "loss": 0.1637, + "loss": 0.1646, "step": 5384 }, { "epoch": 0.7366621067031464, - "grad_norm": 1.56738034248072, + "grad_norm": 1.5593420816598698, "learning_rate": 1.6156728769043567e-06, - "loss": 0.2268, + "loss": 0.2296, "step": 5385 }, { "epoch": 0.7367989056087552, - "grad_norm": 1.1813104041690856, + "grad_norm": 1.176602386889726, "learning_rate": 1.6140914195463937e-06, - "loss": 0.1637, + "loss": 0.1641, "step": 5386 }, { "epoch": 0.7369357045143639, - "grad_norm": 1.3202780756786991, + "grad_norm": 1.3233597762320592, "learning_rate": 1.6125105875629955e-06, - "loss": 0.1536, + "loss": 0.1542, "step": 5387 }, { "epoch": 0.7370725034199727, - "grad_norm": 1.3538889720131708, + "grad_norm": 1.3831017523625633, "learning_rate": 1.6109303812461375e-06, - "loss": 0.1875, + "loss": 0.1913, "step": 5388 }, { "epoch": 0.7372093023255814, - "grad_norm": 1.1673231267258533, + "grad_norm": 1.1706356858112223, "learning_rate": 1.6093508008876857e-06, - "loss": 0.1754, + "loss": 0.1764, "step": 5389 }, { "epoch": 0.7373461012311902, - "grad_norm": 1.01175956320201, + "grad_norm": 1.0001685155702233, "learning_rate": 1.6077718467793845e-06, - "loss": 0.1387, + "loss": 0.138, "step": 5390 }, { "epoch": 0.737482900136799, - "grad_norm": 1.4037934675749915, + "grad_norm": 1.390867652425335, "learning_rate": 1.6061935192128669e-06, "loss": 0.2149, "step": 5391 }, { "epoch": 0.7376196990424077, - "grad_norm": 1.0865154732076214, + "grad_norm": 1.0741832007596421, "learning_rate": 1.60461581847965e-06, - "loss": 0.145, + "loss": 0.1454, "step": 5392 }, { "epoch": 0.7377564979480165, - "grad_norm": 1.4072589941366216, + "grad_norm": 1.3990949193208557, "learning_rate": 1.603038744871131e-06, - "loss": 0.2044, + "loss": 0.2042, "step": 5393 }, { "epoch": 0.7378932968536251, - "grad_norm": 1.2666286902445354, + "grad_norm": 1.270109758638412, "learning_rate": 1.601462298678595e-06, - "loss": 0.1779, + "loss": 0.1797, "step": 5394 }, { "epoch": 0.738030095759234, - "grad_norm": 1.4488020547250395, + "grad_norm": 1.4419800192376404, "learning_rate": 1.5998864801932129e-06, "loss": 0.2002, "step": 5395 }, { "epoch": 0.7381668946648426, - "grad_norm": 1.2953438949467435, + "grad_norm": 1.2823299752182673, "learning_rate": 1.5983112897060333e-06, - "loss": 0.2003, + "loss": 0.1988, "step": 5396 }, { "epoch": 0.7383036935704514, - "grad_norm": 1.160672729051515, + "grad_norm": 1.123933967263779, "learning_rate": 1.5967367275079947e-06, - "loss": 0.1627, + "loss": 0.1601, "step": 5397 }, { "epoch": 0.7384404924760601, - "grad_norm": 1.2159187330009122, + "grad_norm": 1.2042746953036882, "learning_rate": 1.5951627938899182e-06, - "loss": 0.165, + "loss": 0.1642, "step": 5398 }, { "epoch": 0.7385772913816689, - "grad_norm": 1.4027815993979627, + "grad_norm": 1.3911865086186537, "learning_rate": 1.5935894891425058e-06, - "loss": 0.1889, + "loss": 0.1883, "step": 5399 }, { "epoch": 0.7387140902872777, - "grad_norm": 1.0670300536506363, + "grad_norm": 1.0514063775297715, "learning_rate": 1.5920168135563468e-06, - "loss": 0.192, + "loss": 0.1917, "step": 5400 }, { "epoch": 0.7387140902872777, - "eval_loss": 0.17596539855003357, - "eval_runtime": 5.9144, - "eval_samples_per_second": 5.072, - "eval_steps_per_second": 1.353, + "eval_loss": 0.17587679624557495, + "eval_runtime": 5.9286, + "eval_samples_per_second": 5.06, + "eval_steps_per_second": 1.349, "step": 5400 }, { "epoch": 0.7388508891928864, - "grad_norm": 1.4440959233789075, + "grad_norm": 1.4362250730103687, "learning_rate": 1.5904447674219149e-06, - "loss": 0.1918, + "loss": 0.1913, "step": 5401 }, { "epoch": 0.7389876880984952, - "grad_norm": 1.1792897672911937, + "grad_norm": 1.1749076007963897, "learning_rate": 1.5888733510295618e-06, - "loss": 0.1569, + "loss": 0.1582, "step": 5402 }, { "epoch": 0.7391244870041039, - "grad_norm": 1.2428003862027874, + "grad_norm": 1.2472082008230687, "learning_rate": 1.5873025646695306e-06, - "loss": 0.1805, + "loss": 0.1795, "step": 5403 }, { "epoch": 0.7392612859097127, - "grad_norm": 1.3549978105666374, + "grad_norm": 1.3350567653255008, "learning_rate": 1.5857324086319414e-06, - "loss": 0.1874, + "loss": 0.1868, "step": 5404 }, { "epoch": 0.7393980848153214, - "grad_norm": 1.0497227026176545, + "grad_norm": 1.0325577723784478, "learning_rate": 1.5841628832068035e-06, - "loss": 0.1527, + "loss": 0.1523, "step": 5405 }, { "epoch": 0.7395348837209302, - "grad_norm": 1.3591330466261533, + "grad_norm": 1.352742936384576, "learning_rate": 1.5825939886840036e-06, - "loss": 0.1983, + "loss": 0.1979, "step": 5406 }, { "epoch": 0.739671682626539, - "grad_norm": 1.3590419528610145, + "grad_norm": 1.3499119624290188, "learning_rate": 1.5810257253533174e-06, - "loss": 0.197, + "loss": 0.1962, "step": 5407 }, { "epoch": 0.7398084815321477, - "grad_norm": 1.468073333293163, + "grad_norm": 1.465179711731547, "learning_rate": 1.579458093504403e-06, - "loss": 0.1763, + "loss": 0.1767, "step": 5408 }, { "epoch": 0.7399452804377565, - "grad_norm": 1.025605219792952, + "grad_norm": 1.0011716387388334, "learning_rate": 1.5778910934267976e-06, - "loss": 0.1409, + "loss": 0.1406, "step": 5409 }, { "epoch": 0.7400820793433652, - "grad_norm": 1.4307650494948279, + "grad_norm": 1.4330637107330622, "learning_rate": 1.576324725409926e-06, - "loss": 0.1959, + "loss": 0.1952, "step": 5410 }, { "epoch": 0.740218878248974, - "grad_norm": 1.2126200991530667, + "grad_norm": 1.1958935698080526, "learning_rate": 1.5747589897430964e-06, - "loss": 0.1554, + "loss": 0.1556, "step": 5411 }, { "epoch": 0.7403556771545827, - "grad_norm": 1.1558666769152484, + "grad_norm": 1.1542772679359214, "learning_rate": 1.5731938867155e-06, - "loss": 0.1684, + "loss": 0.169, "step": 5412 }, { "epoch": 0.7404924760601915, - "grad_norm": 1.2720036925231397, + "grad_norm": 1.25788796238173, "learning_rate": 1.571629416616206e-06, - "loss": 0.1501, + "loss": 0.1507, "step": 5413 }, { "epoch": 0.7406292749658002, - "grad_norm": 1.0936228486089747, + "grad_norm": 1.0790905500997288, "learning_rate": 1.5700655797341747e-06, - "loss": 0.182, + "loss": 0.1829, "step": 5414 }, { "epoch": 0.740766073871409, - "grad_norm": 1.0389986683787065, + "grad_norm": 1.0331765638512886, "learning_rate": 1.5685023763582418e-06, - "loss": 0.1626, + "loss": 0.1623, "step": 5415 }, { "epoch": 0.7409028727770178, - "grad_norm": 1.2863346767219317, + "grad_norm": 1.3000584094775822, "learning_rate": 1.5669398067771324e-06, - "loss": 0.1608, + "loss": 0.1629, "step": 5416 }, { "epoch": 0.7410396716826265, - "grad_norm": 1.1068293406191838, + "grad_norm": 1.111894314098431, "learning_rate": 1.565377871279452e-06, - "loss": 0.147, + "loss": 0.1476, "step": 5417 }, { "epoch": 0.7411764705882353, - "grad_norm": 1.2191058156307772, + "grad_norm": 1.206535766542698, "learning_rate": 1.5638165701536866e-06, - "loss": 0.1687, + "loss": 0.1682, "step": 5418 }, { "epoch": 0.741313269493844, - "grad_norm": 1.3115948715320451, + "grad_norm": 1.3015940172272145, "learning_rate": 1.5622559036882101e-06, - "loss": 0.1849, + "loss": 0.1865, "step": 5419 }, { "epoch": 0.7414500683994528, - "grad_norm": 1.1883320811860973, + "grad_norm": 1.1579175565273505, "learning_rate": 1.5606958721712728e-06, - "loss": 0.1856, + "loss": 0.1849, "step": 5420 }, { "epoch": 0.7415868673050615, - "grad_norm": 1.097760037487925, + "grad_norm": 1.0960740034567178, "learning_rate": 1.5591364758910154e-06, - "loss": 0.1791, + "loss": 0.1811, "step": 5421 }, { "epoch": 0.7417236662106703, - "grad_norm": 1.153779689100006, + "grad_norm": 1.1608365526082993, "learning_rate": 1.5575777151354531e-06, - "loss": 0.171, + "loss": 0.1721, "step": 5422 }, { "epoch": 0.7418604651162791, - "grad_norm": 1.5117275183508267, + "grad_norm": 1.4909925102797021, "learning_rate": 1.5560195901924896e-06, "loss": 0.2148, "step": 5423 }, { "epoch": 0.7419972640218878, - "grad_norm": 1.5671056250052595, + "grad_norm": 1.5547661729268278, "learning_rate": 1.5544621013499095e-06, - "loss": 0.2032, + "loss": 0.2055, "step": 5424 }, { "epoch": 0.7421340629274966, - "grad_norm": 1.4589501202076889, + "grad_norm": 1.4645681015764969, "learning_rate": 1.5529052488953788e-06, "loss": 0.2214, "step": 5425 }, { "epoch": 0.7422708618331053, - "grad_norm": 1.1188788539506327, + "grad_norm": 1.0775622508914373, "learning_rate": 1.5513490331164498e-06, - "loss": 0.179, + "loss": 0.1773, "step": 5426 }, { "epoch": 0.7424076607387141, - "grad_norm": 1.4271309389682572, + "grad_norm": 1.395217214917211, "learning_rate": 1.5497934543005505e-06, - "loss": 0.2023, + "loss": 0.2009, "step": 5427 }, { "epoch": 0.7425444596443228, - "grad_norm": 1.1872352018833003, + "grad_norm": 1.1804306667080071, "learning_rate": 1.548238512734998e-06, - "loss": 0.1648, + "loss": 0.1643, "step": 5428 }, { "epoch": 0.7426812585499316, - "grad_norm": 1.3533354109628444, + "grad_norm": 1.3557214918197682, "learning_rate": 1.546684208706986e-06, - "loss": 0.17, + "loss": 0.1702, "step": 5429 }, { "epoch": 0.7428180574555403, - "grad_norm": 1.3060508202409633, + "grad_norm": 1.301398425241219, "learning_rate": 1.5451305425035961e-06, - "loss": 0.1535, + "loss": 0.1549, "step": 5430 }, { "epoch": 0.7429548563611491, - "grad_norm": 1.2466851880703733, + "grad_norm": 1.2336219048978123, "learning_rate": 1.5435775144117864e-06, - "loss": 0.1363, + "loss": 0.1373, "step": 5431 }, { "epoch": 0.7430916552667579, - "grad_norm": 1.3125237223912194, + "grad_norm": 1.3062074822377936, "learning_rate": 1.542025124718401e-06, - "loss": 0.1996, + "loss": 0.1986, "step": 5432 }, { "epoch": 0.7432284541723666, - "grad_norm": 1.3017199639418162, + "grad_norm": 1.294502944752018, "learning_rate": 1.5404733737101662e-06, - "loss": 0.1934, + "loss": 0.1936, "step": 5433 }, { "epoch": 0.7433652530779754, - "grad_norm": 1.0502540645395204, + "grad_norm": 1.0425929711497601, "learning_rate": 1.538922261673687e-06, - "loss": 0.1808, + "loss": 0.1816, "step": 5434 }, { "epoch": 0.7435020519835841, - "grad_norm": 1.2475070936692414, + "grad_norm": 1.236670428275447, "learning_rate": 1.5373717888954553e-06, - "loss": 0.1887, + "loss": 0.1885, "step": 5435 }, { "epoch": 0.7436388508891929, - "grad_norm": 1.1048901707076695, + "grad_norm": 1.1091484404427752, "learning_rate": 1.5358219556618392e-06, - "loss": 0.1684, + "loss": 0.1694, "step": 5436 }, { "epoch": 0.7437756497948016, - "grad_norm": 1.2325715561559236, + "grad_norm": 1.212433342299315, "learning_rate": 1.5342727622590942e-06, - "loss": 0.1662, + "loss": 0.1657, "step": 5437 }, { "epoch": 0.7439124487004104, - "grad_norm": 1.3226334551708634, + "grad_norm": 1.3086991704502347, "learning_rate": 1.5327242089733523e-06, - "loss": 0.1673, + "loss": 0.1672, "step": 5438 }, { "epoch": 0.7440492476060192, - "grad_norm": 1.1393171193939315, + "grad_norm": 1.1283255938328305, "learning_rate": 1.5311762960906317e-06, - "loss": 0.133, + "loss": 0.1334, "step": 5439 }, { "epoch": 0.7441860465116279, - "grad_norm": 1.1817573984055383, + "grad_norm": 1.1635052638460754, "learning_rate": 1.5296290238968303e-06, - "loss": 0.1559, + "loss": 0.1561, "step": 5440 }, { "epoch": 0.7443228454172367, - "grad_norm": 1.4406373245398987, + "grad_norm": 1.4395361263115258, "learning_rate": 1.5280823926777287e-06, - "loss": 0.1969, + "loss": 0.1991, "step": 5441 }, { "epoch": 0.7444596443228454, - "grad_norm": 1.0617412917297633, + "grad_norm": 1.0451719297241129, "learning_rate": 1.5265364027189893e-06, - "loss": 0.1799, + "loss": 0.1791, "step": 5442 }, { "epoch": 0.7445964432284542, - "grad_norm": 1.365889202993652, + "grad_norm": 1.3362844486566754, "learning_rate": 1.5249910543061524e-06, - "loss": 0.1894, + "loss": 0.1879, "step": 5443 }, { "epoch": 0.7447332421340629, - "grad_norm": 1.4060449217041802, + "grad_norm": 1.379007150946403, "learning_rate": 1.5234463477246454e-06, - "loss": 0.1846, + "loss": 0.182, "step": 5444 }, { "epoch": 0.7448700410396717, - "grad_norm": 1.0870482920993014, + "grad_norm": 1.075738254296704, "learning_rate": 1.5219022832597718e-06, - "loss": 0.1655, + "loss": 0.1643, "step": 5445 }, { "epoch": 0.7450068399452804, - "grad_norm": 1.0461346606170898, + "grad_norm": 1.0343988721340565, "learning_rate": 1.5203588611967213e-06, - "loss": 0.1493, + "loss": 0.1491, "step": 5446 }, { "epoch": 0.7451436388508892, - "grad_norm": 1.144018404945358, + "grad_norm": 1.151674178392758, "learning_rate": 1.5188160818205606e-06, - "loss": 0.1759, + "loss": 0.1769, "step": 5447 }, { "epoch": 0.745280437756498, - "grad_norm": 1.2352345150061184, + "grad_norm": 1.2236648184198722, "learning_rate": 1.5172739454162406e-06, - "loss": 0.1648, + "loss": 0.1653, "step": 5448 }, { "epoch": 0.7454172366621067, - "grad_norm": 1.3299653431718388, + "grad_norm": 1.3343022135189724, "learning_rate": 1.5157324522685945e-06, - "loss": 0.2189, + "loss": 0.2194, "step": 5449 }, { "epoch": 0.7455540355677155, - "grad_norm": 1.2761601094105883, + "grad_norm": 1.2677631765186879, "learning_rate": 1.5141916026623322e-06, - "loss": 0.1563, + "loss": 0.1571, "step": 5450 }, { "epoch": 0.7456908344733242, - "grad_norm": 1.1311289247000693, + "grad_norm": 1.1249502816516903, "learning_rate": 1.5126513968820495e-06, - "loss": 0.1919, + "loss": 0.1932, "step": 5451 }, { "epoch": 0.745827633378933, - "grad_norm": 1.3213312237477521, + "grad_norm": 1.323655604160608, "learning_rate": 1.5111118352122185e-06, - "loss": 0.1771, + "loss": 0.1781, "step": 5452 }, { "epoch": 0.7459644322845417, - "grad_norm": 1.138802387481336, + "grad_norm": 1.1109591098749074, "learning_rate": 1.5095729179371965e-06, "loss": 0.158, "step": 5453 }, { "epoch": 0.7461012311901505, - "grad_norm": 1.350330963128824, + "grad_norm": 1.3275139273856145, "learning_rate": 1.5080346453412204e-06, - "loss": 0.1858, + "loss": 0.1841, "step": 5454 }, { "epoch": 0.7462380300957593, - "grad_norm": 1.1582227936017429, + "grad_norm": 1.15255900244424, "learning_rate": 1.5064970177084075e-06, - "loss": 0.1792, + "loss": 0.1806, "step": 5455 }, { "epoch": 0.746374829001368, - "grad_norm": 1.4126593252228417, + "grad_norm": 1.3925215902934382, "learning_rate": 1.504960035322759e-06, - "loss": 0.2018, + "loss": 0.2009, "step": 5456 }, { "epoch": 0.7465116279069768, - "grad_norm": 1.3189646898780147, + "grad_norm": 1.318746686325122, "learning_rate": 1.5034236984681499e-06, - "loss": 0.221, + "loss": 0.2157, "step": 5457 }, { "epoch": 0.7466484268125855, - "grad_norm": 1.1948533543067101, + "grad_norm": 1.187354407834907, "learning_rate": 1.5018880074283437e-06, - "loss": 0.1747, + "loss": 0.1738, "step": 5458 }, { "epoch": 0.7467852257181943, - "grad_norm": 1.1706424669255908, + "grad_norm": 1.1721125292676189, "learning_rate": 1.5003529624869788e-06, - "loss": 0.171, + "loss": 0.1717, "step": 5459 }, { "epoch": 0.746922024623803, - "grad_norm": 1.3512499856304623, + "grad_norm": 1.3401118694748673, "learning_rate": 1.49881856392758e-06, - "loss": 0.2137, + "loss": 0.2093, "step": 5460 }, { "epoch": 0.7470588235294118, - "grad_norm": 1.3650087392339758, + "grad_norm": 1.3602845863732256, "learning_rate": 1.4972848120335453e-06, - "loss": 0.1966, + "loss": 0.1965, "step": 5461 }, { "epoch": 0.7471956224350205, - "grad_norm": 1.351014412603888, + "grad_norm": 1.338703623001579, "learning_rate": 1.4957517070881617e-06, - "loss": 0.1846, + "loss": 0.1829, "step": 5462 }, { "epoch": 0.7473324213406293, - "grad_norm": 1.3378728819521208, + "grad_norm": 1.3177590640401606, "learning_rate": 1.4942192493745888e-06, - "loss": 0.2151, + "loss": 0.2135, "step": 5463 }, { "epoch": 0.7474692202462381, - "grad_norm": 1.4461889958085177, + "grad_norm": 1.4126024344103352, "learning_rate": 1.4926874391758718e-06, - "loss": 0.2206, + "loss": 0.2175, "step": 5464 }, { "epoch": 0.7476060191518468, - "grad_norm": 1.0979663807485336, + "grad_norm": 1.0801090644128244, "learning_rate": 1.491156276774936e-06, - "loss": 0.1359, + "loss": 0.1368, "step": 5465 }, { "epoch": 0.7477428180574556, - "grad_norm": 1.4229507208035201, + "grad_norm": 1.4161303822535733, "learning_rate": 1.4896257624545828e-06, - "loss": 0.2078, + "loss": 0.2083, "step": 5466 }, { "epoch": 0.7478796169630643, - "grad_norm": 1.1506757064827495, + "grad_norm": 1.1375840954516845, "learning_rate": 1.4880958964974994e-06, - "loss": 0.1983, + "loss": 0.1977, "step": 5467 }, { "epoch": 0.7480164158686731, - "grad_norm": 1.0738574651871373, + "grad_norm": 1.0652092483818854, "learning_rate": 1.486566679186252e-06, - "loss": 0.1691, + "loss": 0.1684, "step": 5468 }, { "epoch": 0.7481532147742818, - "grad_norm": 1.187867160895494, + "grad_norm": 1.178815494455981, "learning_rate": 1.485038110803282e-06, "loss": 0.1876, "step": 5469 }, { "epoch": 0.7482900136798906, - "grad_norm": 1.358463149508234, + "grad_norm": 1.2995345461204542, "learning_rate": 1.483510191630917e-06, - "loss": 0.2083, + "loss": 0.2056, "step": 5470 }, { "epoch": 0.7484268125854994, - "grad_norm": 1.1371603302247515, + "grad_norm": 1.1171081338724749, "learning_rate": 1.4819829219513621e-06, - "loss": 0.1456, + "loss": 0.1454, "step": 5471 }, { "epoch": 0.7485636114911081, - "grad_norm": 1.1111877400375676, + "grad_norm": 1.1153310913528225, "learning_rate": 1.4804563020467045e-06, - "loss": 0.1301, + "loss": 0.1302, "step": 5472 }, { "epoch": 0.7487004103967169, - "grad_norm": 0.9934693711936037, + "grad_norm": 0.9879162361754331, "learning_rate": 1.4789303321989063e-06, - "loss": 0.1626, + "loss": 0.164, "step": 5473 }, { "epoch": 0.7488372093023256, - "grad_norm": 1.2768988619881634, + "grad_norm": 1.2714624414486795, "learning_rate": 1.4774050126898164e-06, - "loss": 0.1985, + "loss": 0.1973, "step": 5474 }, { "epoch": 0.7489740082079344, - "grad_norm": 1.4627321122928685, + "grad_norm": 1.4548295944462108, "learning_rate": 1.4758803438011565e-06, - "loss": 0.1876, + "loss": 0.1882, "step": 5475 }, { "epoch": 0.749110807113543, - "grad_norm": 1.1814547195493668, + "grad_norm": 1.1693138753780485, "learning_rate": 1.4743563258145356e-06, - "loss": 0.1856, + "loss": 0.1849, "step": 5476 }, { "epoch": 0.7492476060191519, - "grad_norm": 1.2564028494598984, + "grad_norm": 1.2913780566963735, "learning_rate": 1.4728329590114342e-06, - "loss": 0.1564, + "loss": 0.1555, "step": 5477 }, { "epoch": 0.7493844049247605, - "grad_norm": 1.089705316278541, + "grad_norm": 1.0882618566935311, "learning_rate": 1.4713102436732207e-06, - "loss": 0.1943, + "loss": 0.196, "step": 5478 }, { "epoch": 0.7495212038303694, - "grad_norm": 1.0998192006354612, + "grad_norm": 1.084145684646221, "learning_rate": 1.469788180081137e-06, - "loss": 0.148, + "loss": 0.1474, "step": 5479 }, { "epoch": 0.7496580027359782, - "grad_norm": 1.3850680884805842, + "grad_norm": 1.3570976487849655, "learning_rate": 1.4682667685163072e-06, - "loss": 0.2107, + "loss": 0.2078, "step": 5480 }, { "epoch": 0.7497948016415868, - "grad_norm": 1.3891365489851932, + "grad_norm": 1.3705255465011124, "learning_rate": 1.4667460092597374e-06, - "loss": 0.2033, + "loss": 0.2042, "step": 5481 }, { "epoch": 0.7499316005471957, - "grad_norm": 0.923361648832035, + "grad_norm": 0.9241010813856934, "learning_rate": 1.4652259025923072e-06, - "loss": 0.162, + "loss": 0.1619, "step": 5482 }, { "epoch": 0.7500683994528043, - "grad_norm": 1.519102518198911, + "grad_norm": 1.494982547007142, "learning_rate": 1.4637064487947805e-06, - "loss": 0.2074, + "loss": 0.2076, "step": 5483 }, { "epoch": 0.7502051983584132, - "grad_norm": 1.0886421702607862, + "grad_norm": 1.0659102025445026, "learning_rate": 1.4621876481477986e-06, - "loss": 0.1424, + "loss": 0.1403, "step": 5484 }, { "epoch": 0.7503419972640218, - "grad_norm": 1.341600169600282, + "grad_norm": 1.3305931136545102, "learning_rate": 1.4606695009318855e-06, - "loss": 0.1896, + "loss": 0.1886, "step": 5485 }, { "epoch": 0.7504787961696306, - "grad_norm": 1.2089067451674125, + "grad_norm": 1.2046063841748336, "learning_rate": 1.4591520074274378e-06, - "loss": 0.1596, + "loss": 0.1595, "step": 5486 }, { "epoch": 0.7506155950752395, - "grad_norm": 1.0521017949189797, + "grad_norm": 1.028415079503842, "learning_rate": 1.4576351679147378e-06, - "loss": 0.1721, + "loss": 0.1713, "step": 5487 }, { "epoch": 0.7507523939808481, - "grad_norm": 1.3697529025497772, + "grad_norm": 1.3603320419769056, "learning_rate": 1.4561189826739447e-06, - "loss": 0.1822, + "loss": 0.1818, "step": 5488 }, { "epoch": 0.750889192886457, - "grad_norm": 1.335348215051181, + "grad_norm": 1.3152289973121134, "learning_rate": 1.4546034519850944e-06, - "loss": 0.1751, + "loss": 0.1753, "step": 5489 }, { "epoch": 0.7510259917920656, - "grad_norm": 1.1763788902813663, + "grad_norm": 1.1623960525515031, "learning_rate": 1.4530885761281071e-06, - "loss": 0.1704, + "loss": 0.1707, "step": 5490 }, { "epoch": 0.7511627906976744, - "grad_norm": 1.110118107533913, + "grad_norm": 1.0890590376956828, "learning_rate": 1.451574355382776e-06, - "loss": 0.1483, + "loss": 0.148, "step": 5491 }, { "epoch": 0.7512995896032831, - "grad_norm": 1.3343785606535998, + "grad_norm": 1.3025499374735765, "learning_rate": 1.45006079002878e-06, - "loss": 0.1749, + "loss": 0.1729, "step": 5492 }, { "epoch": 0.7514363885088919, - "grad_norm": 1.4486514564095812, + "grad_norm": 1.4457273344300072, "learning_rate": 1.44854788034567e-06, - "loss": 0.1708, + "loss": 0.1728, "step": 5493 }, { "epoch": 0.7515731874145006, - "grad_norm": 1.0759472406031994, + "grad_norm": 1.0765669279448382, "learning_rate": 1.4470356266128816e-06, - "loss": 0.1573, + "loss": 0.1597, "step": 5494 }, { "epoch": 0.7517099863201094, - "grad_norm": 1.3746002710628564, + "grad_norm": 1.3366671791497775, "learning_rate": 1.445524029109725e-06, - "loss": 0.2142, + "loss": 0.2148, "step": 5495 }, { "epoch": 0.7518467852257182, - "grad_norm": 1.2621502757365997, + "grad_norm": 1.2405477340631552, "learning_rate": 1.4440130881153918e-06, - "loss": 0.1928, + "loss": 0.1937, "step": 5496 }, { "epoch": 0.7519835841313269, - "grad_norm": 1.28041583128588, + "grad_norm": 1.2551866777480494, "learning_rate": 1.4425028039089518e-06, - "loss": 0.1648, + "loss": 0.1644, "step": 5497 }, { "epoch": 0.7521203830369357, - "grad_norm": 1.2322551727362954, + "grad_norm": 1.2253752722774331, "learning_rate": 1.4409931767693542e-06, - "loss": 0.1842, + "loss": 0.1835, "step": 5498 }, { "epoch": 0.7522571819425444, - "grad_norm": 1.4596119716890301, + "grad_norm": 1.4434231350031077, "learning_rate": 1.439484206975424e-06, - "loss": 0.1594, + "loss": 0.1599, "step": 5499 }, { "epoch": 0.7523939808481532, - "grad_norm": 1.2086896182437523, + "grad_norm": 1.199417844375086, "learning_rate": 1.437975894805867e-06, - "loss": 0.1713, + "loss": 0.1715, "step": 5500 }, { "epoch": 0.7523939808481532, - "eval_loss": 0.17522889375686646, - "eval_runtime": 5.9123, - "eval_samples_per_second": 5.074, - "eval_steps_per_second": 1.353, + "eval_loss": 0.17534872889518738, + "eval_runtime": 5.9181, + "eval_samples_per_second": 5.069, + "eval_steps_per_second": 1.352, "step": 5500 }, { "epoch": 0.7525307797537619, - "grad_norm": 1.0813302398353115, + "grad_norm": 1.0677407970041843, "learning_rate": 1.43646824053927e-06, - "loss": 0.1741, + "loss": 0.174, "step": 5501 }, { "epoch": 0.7526675786593707, - "grad_norm": 1.3919098376541053, + "grad_norm": 1.3977402846225566, "learning_rate": 1.4349612444540912e-06, - "loss": 0.1739, + "loss": 0.1781, "step": 5502 }, { "epoch": 0.7528043775649795, - "grad_norm": 1.4440277826871744, + "grad_norm": 1.4506794630459854, "learning_rate": 1.433454906828674e-06, - "loss": 0.1967, + "loss": 0.1964, "step": 5503 }, { "epoch": 0.7529411764705882, - "grad_norm": 1.270841945431253, + "grad_norm": 1.241237948990492, "learning_rate": 1.4319492279412388e-06, - "loss": 0.1944, + "loss": 0.1911, "step": 5504 }, { "epoch": 0.753077975376197, - "grad_norm": 1.4180693051337994, + "grad_norm": 1.4506793867052417, "learning_rate": 1.4304442080698806e-06, - "loss": 0.2107, + "loss": 0.2152, "step": 5505 }, { "epoch": 0.7532147742818057, - "grad_norm": 1.406364283551131, + "grad_norm": 1.4150445196135573, "learning_rate": 1.428939847492578e-06, - "loss": 0.2218, + "loss": 0.2202, "step": 5506 }, { "epoch": 0.7533515731874145, - "grad_norm": 1.2745918239895062, + "grad_norm": 1.2695574596150399, "learning_rate": 1.4274361464871828e-06, - "loss": 0.1616, + "loss": 0.1623, "step": 5507 }, { "epoch": 0.7534883720930232, - "grad_norm": 1.3490506857902804, + "grad_norm": 1.2977835484672493, "learning_rate": 1.425933105331429e-06, - "loss": 0.194, + "loss": 0.1918, "step": 5508 }, { "epoch": 0.753625170998632, - "grad_norm": 1.2402378647747478, + "grad_norm": 1.225494705569465, "learning_rate": 1.4244307243029254e-06, - "loss": 0.1682, + "loss": 0.1685, "step": 5509 }, { "epoch": 0.7537619699042407, - "grad_norm": 1.1956933132822536, + "grad_norm": 1.1793192436268964, "learning_rate": 1.4229290036791631e-06, - "loss": 0.181, + "loss": 0.1788, "step": 5510 }, { "epoch": 0.7538987688098495, - "grad_norm": 1.3315348323568355, + "grad_norm": 1.3208978096017507, "learning_rate": 1.4214279437375055e-06, - "loss": 0.2002, + "loss": 0.1988, "step": 5511 }, { "epoch": 0.7540355677154583, - "grad_norm": 1.2594293201608509, + "grad_norm": 1.2488110069024514, "learning_rate": 1.419927544755199e-06, - "loss": 0.1904, + "loss": 0.1915, "step": 5512 }, { "epoch": 0.754172366621067, - "grad_norm": 1.538624379616864, + "grad_norm": 1.5260514618508378, "learning_rate": 1.418427807009366e-06, - "loss": 0.2621, + "loss": 0.2645, "step": 5513 }, { "epoch": 0.7543091655266758, - "grad_norm": 1.456994707808155, + "grad_norm": 1.4475171098212014, "learning_rate": 1.4169287307770068e-06, - "loss": 0.2022, + "loss": 0.2029, "step": 5514 }, { "epoch": 0.7544459644322845, - "grad_norm": 1.2725841639502666, + "grad_norm": 1.2676484704193705, "learning_rate": 1.4154303163350008e-06, - "loss": 0.2011, + "loss": 0.1996, "step": 5515 }, { "epoch": 0.7545827633378933, - "grad_norm": 1.2303793824371365, + "grad_norm": 1.2260200862551287, "learning_rate": 1.4139325639601015e-06, - "loss": 0.1731, + "loss": 0.1741, "step": 5516 }, { "epoch": 0.754719562243502, - "grad_norm": 1.2941795431461927, + "grad_norm": 1.2887116677896497, "learning_rate": 1.4124354739289447e-06, - "loss": 0.1883, + "loss": 0.1887, "step": 5517 }, { "epoch": 0.7548563611491108, - "grad_norm": 1.2966330837896896, + "grad_norm": 1.286978688920385, "learning_rate": 1.4109390465180395e-06, - "loss": 0.1878, + "loss": 0.1887, "step": 5518 }, { "epoch": 0.7549931600547196, - "grad_norm": 1.2392876140254634, + "grad_norm": 1.221850371865418, "learning_rate": 1.4094432820037757e-06, - "loss": 0.1858, + "loss": 0.1848, "step": 5519 }, { "epoch": 0.7551299589603283, - "grad_norm": 1.0614616725551949, + "grad_norm": 1.0291159011682067, "learning_rate": 1.4079481806624219e-06, - "loss": 0.1479, + "loss": 0.1476, "step": 5520 }, { "epoch": 0.7552667578659371, - "grad_norm": 1.3331881476011973, + "grad_norm": 1.3243232215179797, "learning_rate": 1.4064537427701187e-06, - "loss": 0.2045, + "loss": 0.2039, "step": 5521 }, { "epoch": 0.7554035567715458, - "grad_norm": 1.317065432830326, + "grad_norm": 1.3114444119139974, "learning_rate": 1.4049599686028909e-06, - "loss": 0.2242, + "loss": 0.2233, "step": 5522 }, { "epoch": 0.7555403556771546, - "grad_norm": 1.375768559109671, + "grad_norm": 1.3591164313762312, "learning_rate": 1.403466858436634e-06, - "loss": 0.2164, + "loss": 0.2172, "step": 5523 }, { "epoch": 0.7556771545827633, - "grad_norm": 1.0414711160361168, + "grad_norm": 1.0540356310102503, "learning_rate": 1.4019744125471274e-06, - "loss": 0.1742, + "loss": 0.1755, "step": 5524 }, { "epoch": 0.7558139534883721, - "grad_norm": 1.187676744840848, + "grad_norm": 1.1579250750887065, "learning_rate": 1.4004826312100218e-06, - "loss": 0.1699, + "loss": 0.1703, "step": 5525 }, { "epoch": 0.7559507523939808, - "grad_norm": 1.571364667370273, + "grad_norm": 1.5471226894010845, "learning_rate": 1.3989915147008492e-06, "loss": 0.2401, "step": 5526 }, { "epoch": 0.7560875512995896, - "grad_norm": 1.3090671031590067, + "grad_norm": 1.3026529617522153, "learning_rate": 1.3975010632950175e-06, - "loss": 0.1836, + "loss": 0.1833, "step": 5527 }, { "epoch": 0.7562243502051984, - "grad_norm": 1.0884215671639899, + "grad_norm": 1.0894223768398335, "learning_rate": 1.3960112772678125e-06, - "loss": 0.1421, + "loss": 0.1423, "step": 5528 }, { "epoch": 0.7563611491108071, - "grad_norm": 1.504085117626106, + "grad_norm": 1.5045616934893693, "learning_rate": 1.3945221568943974e-06, - "loss": 0.2135, + "loss": 0.2131, "step": 5529 }, { "epoch": 0.7564979480164159, - "grad_norm": 1.394575508665998, + "grad_norm": 1.392174035129322, "learning_rate": 1.3930337024498087e-06, - "loss": 0.2051, + "loss": 0.2052, "step": 5530 }, { "epoch": 0.7566347469220246, - "grad_norm": 1.3066092115403467, + "grad_norm": 1.2798079903887827, "learning_rate": 1.3915459142089654e-06, - "loss": 0.195, + "loss": 0.196, "step": 5531 }, { "epoch": 0.7567715458276334, - "grad_norm": 1.0917901150878928, + "grad_norm": 1.0998574437919104, "learning_rate": 1.3900587924466585e-06, - "loss": 0.1725, + "loss": 0.1732, "step": 5532 }, { "epoch": 0.7569083447332421, - "grad_norm": 1.2645240596868468, + "grad_norm": 1.2462341252977012, "learning_rate": 1.388572337437561e-06, - "loss": 0.1812, + "loss": 0.1797, "step": 5533 }, { "epoch": 0.7570451436388509, - "grad_norm": 1.2941240859736611, + "grad_norm": 1.2869442876708748, "learning_rate": 1.3870865494562164e-06, - "loss": 0.1828, + "loss": 0.1834, "step": 5534 }, { "epoch": 0.7571819425444597, - "grad_norm": 1.079959494893173, + "grad_norm": 1.074440742119568, "learning_rate": 1.3856014287770502e-06, - "loss": 0.1773, + "loss": 0.1767, "step": 5535 }, { "epoch": 0.7573187414500684, - "grad_norm": 1.1461008606092975, + "grad_norm": 1.1317713851097775, "learning_rate": 1.384116975674365e-06, "loss": 0.1934, "step": 5536 }, { "epoch": 0.7574555403556772, - "grad_norm": 1.1049005915283536, + "grad_norm": 1.0931274658210932, "learning_rate": 1.3826331904223345e-06, - "loss": 0.1426, + "loss": 0.1424, "step": 5537 }, { "epoch": 0.7575923392612859, - "grad_norm": 1.286278867275359, + "grad_norm": 1.2733604349652567, "learning_rate": 1.3811500732950161e-06, - "loss": 0.1639, + "loss": 0.1645, "step": 5538 }, { "epoch": 0.7577291381668947, - "grad_norm": 1.471485283841714, + "grad_norm": 1.453307830860475, "learning_rate": 1.379667624566337e-06, - "loss": 0.2049, + "loss": 0.2037, "step": 5539 }, { "epoch": 0.7578659370725034, - "grad_norm": 1.1822894875014203, + "grad_norm": 1.1725861796071526, "learning_rate": 1.378185844510107e-06, "loss": 0.1703, "step": 5540 }, { "epoch": 0.7580027359781122, - "grad_norm": 1.106501856842549, + "grad_norm": 1.0876765845930796, "learning_rate": 1.3767047334000073e-06, - "loss": 0.1728, + "loss": 0.1719, "step": 5541 }, { "epoch": 0.7581395348837209, - "grad_norm": 1.3707052087230733, + "grad_norm": 1.363489409905163, "learning_rate": 1.3752242915095993e-06, - "loss": 0.2148, + "loss": 0.2131, "step": 5542 }, { "epoch": 0.7582763337893297, - "grad_norm": 1.1234498767976244, + "grad_norm": 1.120056280927872, "learning_rate": 1.3737445191123188e-06, - "loss": 0.1872, + "loss": 0.1877, "step": 5543 }, { "epoch": 0.7584131326949385, - "grad_norm": 1.261824510468728, + "grad_norm": 1.120394902448227, "learning_rate": 1.3722654164814797e-06, - "loss": 0.1845, + "loss": 0.1843, "step": 5544 }, { "epoch": 0.7585499316005472, - "grad_norm": 1.4696504415225446, + "grad_norm": 1.4623604078097505, "learning_rate": 1.3707869838902716e-06, - "loss": 0.2191, + "loss": 0.2202, "step": 5545 }, { "epoch": 0.758686730506156, - "grad_norm": 1.0934325841819408, + "grad_norm": 1.0836559715545038, "learning_rate": 1.3693092216117565e-06, - "loss": 0.1429, + "loss": 0.144, "step": 5546 }, { "epoch": 0.7588235294117647, - "grad_norm": 1.4237752697023152, + "grad_norm": 1.398981615558789, "learning_rate": 1.3678321299188802e-06, - "loss": 0.1891, + "loss": 0.1881, "step": 5547 }, { "epoch": 0.7589603283173735, - "grad_norm": 1.2814195032385716, + "grad_norm": 1.2706913320406157, "learning_rate": 1.3663557090844559e-06, - "loss": 0.1471, + "loss": 0.1483, "step": 5548 }, { "epoch": 0.7590971272229822, - "grad_norm": 1.4835575316653873, + "grad_norm": 1.496228322809542, "learning_rate": 1.3648799593811819e-06, - "loss": 0.2201, + "loss": 0.2229, "step": 5549 }, { "epoch": 0.759233926128591, - "grad_norm": 1.3417515386922927, + "grad_norm": 1.351706641539309, "learning_rate": 1.3634048810816237e-06, - "loss": 0.1984, + "loss": 0.2011, "step": 5550 }, { "epoch": 0.7593707250341998, - "grad_norm": 1.1591071715310617, + "grad_norm": 1.1528593159875138, "learning_rate": 1.3619304744582284e-06, - "loss": 0.2041, + "loss": 0.2035, "step": 5551 }, { "epoch": 0.7595075239398085, - "grad_norm": 1.0925668806161943, + "grad_norm": 1.1141836649894088, "learning_rate": 1.3604567397833202e-06, - "loss": 0.1501, + "loss": 0.1522, "step": 5552 }, { "epoch": 0.7596443228454173, - "grad_norm": 1.3775730679059688, + "grad_norm": 1.3561581794890751, "learning_rate": 1.3589836773290937e-06, - "loss": 0.1888, + "loss": 0.1877, "step": 5553 }, { "epoch": 0.759781121751026, - "grad_norm": 1.2703942536496071, + "grad_norm": 1.2793071261867155, "learning_rate": 1.3575112873676245e-06, - "loss": 0.1882, + "loss": 0.1878, "step": 5554 }, { "epoch": 0.7599179206566348, - "grad_norm": 1.1370460096459856, + "grad_norm": 1.1282670128767107, "learning_rate": 1.356039570170859e-06, - "loss": 0.1482, + "loss": 0.1475, "step": 5555 }, { "epoch": 0.7600547195622435, - "grad_norm": 1.5247992134102684, + "grad_norm": 1.5450392496742544, "learning_rate": 1.354568526010624e-06, - "loss": 0.186, + "loss": 0.1874, "step": 5556 }, { "epoch": 0.7601915184678523, - "grad_norm": 1.1781102617742263, + "grad_norm": 1.1775645424418542, "learning_rate": 1.3530981551586203e-06, "loss": 0.1844, "step": 5557 }, { "epoch": 0.760328317373461, - "grad_norm": 1.278659190320028, + "grad_norm": 1.278282817545295, "learning_rate": 1.3516284578864252e-06, "loss": 0.1913, "step": 5558 }, { "epoch": 0.7604651162790698, - "grad_norm": 1.1434718822484244, + "grad_norm": 1.1379521107767052, "learning_rate": 1.3501594344654885e-06, - "loss": 0.1687, + "loss": 0.1693, "step": 5559 }, { "epoch": 0.7606019151846786, - "grad_norm": 1.1080020773653219, + "grad_norm": 1.08963787489899, "learning_rate": 1.3486910851671374e-06, - "loss": 0.1789, + "loss": 0.179, "step": 5560 }, { "epoch": 0.7607387140902873, - "grad_norm": 1.2991672623691424, + "grad_norm": 1.302076408443079, "learning_rate": 1.347223410262578e-06, - "loss": 0.193, + "loss": 0.191, "step": 5561 }, { "epoch": 0.7608755129958961, - "grad_norm": 1.186901079336294, + "grad_norm": 1.1945005364027026, "learning_rate": 1.3457564100228853e-06, - "loss": 0.1788, + "loss": 0.1803, "step": 5562 }, { "epoch": 0.7610123119015048, - "grad_norm": 1.4060886118247393, + "grad_norm": 1.413121682709164, "learning_rate": 1.3442900847190154e-06, - "loss": 0.2205, + "loss": 0.221, "step": 5563 }, { "epoch": 0.7611491108071136, - "grad_norm": 1.384340365226586, + "grad_norm": 1.3751020981435214, "learning_rate": 1.342824434621795e-06, - "loss": 0.1584, + "loss": 0.1594, "step": 5564 }, { "epoch": 0.7612859097127223, - "grad_norm": 1.2218672829522594, + "grad_norm": 1.2388624041863587, "learning_rate": 1.3413594600019313e-06, - "loss": 0.1539, + "loss": 0.1565, "step": 5565 }, { "epoch": 0.7614227086183311, - "grad_norm": 1.3357324849434233, + "grad_norm": 1.3347189604147855, "learning_rate": 1.339895161130001e-06, - "loss": 0.1867, + "loss": 0.187, "step": 5566 }, { "epoch": 0.7615595075239399, - "grad_norm": 1.2242537573180887, + "grad_norm": 1.223822504541511, "learning_rate": 1.3384315382764606e-06, - "loss": 0.1634, + "loss": 0.1645, "step": 5567 }, { "epoch": 0.7616963064295486, - "grad_norm": 1.153889851828115, + "grad_norm": 1.1544638043067177, "learning_rate": 1.336968591711641e-06, - "loss": 0.1566, + "loss": 0.1574, "step": 5568 }, { "epoch": 0.7618331053351574, - "grad_norm": 1.2846735674174854, + "grad_norm": 1.2729509701843633, "learning_rate": 1.335506321705744e-06, - "loss": 0.1721, + "loss": 0.1719, "step": 5569 }, { "epoch": 0.761969904240766, - "grad_norm": 1.4007183843155484, + "grad_norm": 1.3841813078905665, "learning_rate": 1.3340447285288521e-06, - "loss": 0.1951, + "loss": 0.1955, "step": 5570 }, { "epoch": 0.7621067031463749, - "grad_norm": 1.2140629962836003, + "grad_norm": 1.2061636040125143, "learning_rate": 1.3325838124509217e-06, - "loss": 0.1759, + "loss": 0.1757, "step": 5571 }, { "epoch": 0.7622435020519835, - "grad_norm": 1.3420544396528997, + "grad_norm": 1.2374588816197776, "learning_rate": 1.3311235737417793e-06, - "loss": 0.1846, + "loss": 0.1759, "step": 5572 }, { "epoch": 0.7623803009575923, - "grad_norm": 1.2561170424592767, + "grad_norm": 1.2409602579928132, "learning_rate": 1.3296640126711318e-06, "loss": 0.158, "step": 5573 }, { "epoch": 0.762517099863201, - "grad_norm": 1.3062352492352847, + "grad_norm": 1.284702766927063, "learning_rate": 1.3282051295085602e-06, - "loss": 0.1703, + "loss": 0.1685, "step": 5574 }, { "epoch": 0.7626538987688098, - "grad_norm": 1.1113308079328053, + "grad_norm": 1.1138019450577907, "learning_rate": 1.326746924523516e-06, - "loss": 0.1704, + "loss": 0.1721, "step": 5575 }, { "epoch": 0.7627906976744186, - "grad_norm": 1.2902468858471186, + "grad_norm": 1.2873217371879873, "learning_rate": 1.3252893979853304e-06, - "loss": 0.2, + "loss": 0.1996, "step": 5576 }, { "epoch": 0.7629274965800273, - "grad_norm": 1.1046157816573623, + "grad_norm": 1.0660334334736883, "learning_rate": 1.3238325501632083e-06, - "loss": 0.1623, + "loss": 0.1603, "step": 5577 }, { "epoch": 0.7630642954856361, - "grad_norm": 1.173373905555827, + "grad_norm": 1.1566936258216503, "learning_rate": 1.3223763813262252e-06, - "loss": 0.1761, + "loss": 0.1749, "step": 5578 }, { "epoch": 0.7632010943912448, - "grad_norm": 1.2357367688218002, + "grad_norm": 1.2320698907270098, "learning_rate": 1.3209208917433381e-06, - "loss": 0.1881, + "loss": 0.1878, "step": 5579 }, { "epoch": 0.7633378932968536, - "grad_norm": 1.1958164552447887, + "grad_norm": 1.1793180535035623, "learning_rate": 1.319466081683371e-06, - "loss": 0.1597, + "loss": 0.1584, "step": 5580 }, { "epoch": 0.7634746922024623, - "grad_norm": 1.3182619396101667, + "grad_norm": 1.3084463898820013, "learning_rate": 1.3180119514150292e-06, - "loss": 0.1823, + "loss": 0.1803, "step": 5581 }, { "epoch": 0.7636114911080711, - "grad_norm": 1.4743844499090324, + "grad_norm": 1.4544040272351575, "learning_rate": 1.316558501206887e-06, - "loss": 0.1923, + "loss": 0.1939, "step": 5582 }, { "epoch": 0.7637482900136799, - "grad_norm": 1.281936894783123, + "grad_norm": 1.268870815882105, "learning_rate": 1.3151057313273963e-06, - "loss": 0.1888, + "loss": 0.1879, "step": 5583 }, { "epoch": 0.7638850889192886, - "grad_norm": 1.048679410269113, + "grad_norm": 1.0401079238092787, "learning_rate": 1.3136536420448843e-06, - "loss": 0.1466, + "loss": 0.1465, "step": 5584 }, { "epoch": 0.7640218878248974, - "grad_norm": 1.3421357348922445, + "grad_norm": 1.3400163033277823, "learning_rate": 1.3122022336275475e-06, - "loss": 0.1958, + "loss": 0.1963, "step": 5585 }, { "epoch": 0.7641586867305061, - "grad_norm": 1.4767442672458817, + "grad_norm": 1.4614849237958938, "learning_rate": 1.3107515063434618e-06, - "loss": 0.2103, + "loss": 0.2114, "step": 5586 }, { "epoch": 0.7642954856361149, - "grad_norm": 1.0171715210548815, + "grad_norm": 1.0026995740651328, "learning_rate": 1.3093014604605753e-06, "loss": 0.1636, "step": 5587 }, { "epoch": 0.7644322845417236, - "grad_norm": 1.2134152351840082, + "grad_norm": 1.1962569167337964, "learning_rate": 1.3078520962467111e-06, - "loss": 0.1595, + "loss": 0.159, "step": 5588 }, { "epoch": 0.7645690834473324, - "grad_norm": 1.1217024634344264, + "grad_norm": 1.128162379327805, "learning_rate": 1.3064034139695636e-06, - "loss": 0.1712, + "loss": 0.1722, "step": 5589 }, { "epoch": 0.7647058823529411, - "grad_norm": 1.060136709646856, + "grad_norm": 1.0494890545546158, "learning_rate": 1.3049554138967052e-06, - "loss": 0.1462, + "loss": 0.1477, "step": 5590 }, { "epoch": 0.7648426812585499, - "grad_norm": 1.2444490653772395, + "grad_norm": 1.2346702292566658, "learning_rate": 1.3035080962955781e-06, - "loss": 0.1566, + "loss": 0.1546, "step": 5591 }, { "epoch": 0.7649794801641587, - "grad_norm": 1.2413842175290102, + "grad_norm": 1.2216258456259375, "learning_rate": 1.302061461433502e-06, - "loss": 0.1798, + "loss": 0.1804, "step": 5592 }, { "epoch": 0.7651162790697674, - "grad_norm": 1.307315470287024, + "grad_norm": 1.2993072364448552, "learning_rate": 1.3006155095776707e-06, - "loss": 0.1811, + "loss": 0.1824, "step": 5593 }, { "epoch": 0.7652530779753762, - "grad_norm": 1.2015011643059146, + "grad_norm": 1.2052846905457695, "learning_rate": 1.2991702409951469e-06, - "loss": 0.1829, + "loss": 0.1841, "step": 5594 }, { "epoch": 0.7653898768809849, - "grad_norm": 1.2139393253620694, + "grad_norm": 1.2046853276767862, "learning_rate": 1.2977256559528738e-06, - "loss": 0.1804, + "loss": 0.1805, "step": 5595 }, { "epoch": 0.7655266757865937, - "grad_norm": 1.24780570320481, + "grad_norm": 1.2145571718620778, "learning_rate": 1.2962817547176625e-06, - "loss": 0.1753, + "loss": 0.1747, "step": 5596 }, { "epoch": 0.7656634746922024, - "grad_norm": 1.2717601838047574, + "grad_norm": 1.2278855498370125, "learning_rate": 1.2948385375562033e-06, - "loss": 0.2121, + "loss": 0.2089, "step": 5597 }, { "epoch": 0.7658002735978112, - "grad_norm": 1.193602188888458, + "grad_norm": 1.2065069007629414, "learning_rate": 1.2933960047350536e-06, - "loss": 0.1861, + "loss": 0.1889, "step": 5598 }, { "epoch": 0.76593707250342, - "grad_norm": 1.4890215790618642, + "grad_norm": 1.4802405599883726, "learning_rate": 1.29195415652065e-06, - "loss": 0.2266, + "loss": 0.2275, "step": 5599 }, { "epoch": 0.7660738714090287, - "grad_norm": 1.2033786623423037, + "grad_norm": 1.1939766207916858, "learning_rate": 1.290512993179301e-06, - "loss": 0.1733, + "loss": 0.173, "step": 5600 }, { "epoch": 0.7660738714090287, - "eval_loss": 0.1734624207019806, - "eval_runtime": 5.9092, - "eval_samples_per_second": 5.077, - "eval_steps_per_second": 1.354, + "eval_loss": 0.17376884818077087, + "eval_runtime": 5.9181, + "eval_samples_per_second": 5.069, + "eval_steps_per_second": 1.352, "step": 5600 }, { "epoch": 0.7662106703146375, - "grad_norm": 1.2193659096108838, + "grad_norm": 1.2114916786011032, "learning_rate": 1.2890725149771888e-06, - "loss": 0.1777, + "loss": 0.179, "step": 5601 }, { "epoch": 0.7663474692202462, - "grad_norm": 1.5682410075330868, + "grad_norm": 1.5624914436115935, "learning_rate": 1.2876327221803664e-06, - "loss": 0.2241, + "loss": 0.2253, "step": 5602 }, { "epoch": 0.766484268125855, - "grad_norm": 1.1990823321464277, + "grad_norm": 1.1975108552072058, "learning_rate": 1.2861936150547637e-06, - "loss": 0.1553, + "loss": 0.1563, "step": 5603 }, { "epoch": 0.7666210670314637, - "grad_norm": 1.163558653822356, + "grad_norm": 1.1469034302243917, "learning_rate": 1.2847551938661839e-06, - "loss": 0.1746, + "loss": 0.1745, "step": 5604 }, { "epoch": 0.7667578659370725, - "grad_norm": 1.0326586168594696, + "grad_norm": 1.027352473289547, "learning_rate": 1.2833174588802998e-06, "loss": 0.1665, "step": 5605 }, { "epoch": 0.7668946648426812, - "grad_norm": 1.4651827949527123, + "grad_norm": 1.4326593990288317, "learning_rate": 1.2818804103626625e-06, - "loss": 0.2049, + "loss": 0.201, "step": 5606 }, { "epoch": 0.76703146374829, - "grad_norm": 1.333856015458948, + "grad_norm": 1.3220348950436092, "learning_rate": 1.2804440485786901e-06, - "loss": 0.1765, + "loss": 0.1758, "step": 5607 }, { "epoch": 0.7671682626538988, - "grad_norm": 1.2839986418155922, + "grad_norm": 1.2811498517921096, "learning_rate": 1.2790083737936798e-06, - "loss": 0.1711, + "loss": 0.17, "step": 5608 }, { "epoch": 0.7673050615595075, - "grad_norm": 1.1153259047380777, + "grad_norm": 1.0999293623600246, "learning_rate": 1.2775733862728008e-06, - "loss": 0.137, + "loss": 0.1359, "step": 5609 }, { "epoch": 0.7674418604651163, - "grad_norm": 1.3394455238354717, + "grad_norm": 1.3241022944238925, "learning_rate": 1.2761390862810907e-06, - "loss": 0.1978, + "loss": 0.198, "step": 5610 }, { "epoch": 0.767578659370725, - "grad_norm": 1.1602213182597116, + "grad_norm": 1.1381697613460546, "learning_rate": 1.2747054740834675e-06, - "loss": 0.18, + "loss": 0.1803, "step": 5611 }, { "epoch": 0.7677154582763338, - "grad_norm": 1.0302145836897076, + "grad_norm": 1.022809170234718, "learning_rate": 1.2732725499447147e-06, - "loss": 0.1735, + "loss": 0.1752, "step": 5612 }, { "epoch": 0.7678522571819425, - "grad_norm": 1.2401314782029018, + "grad_norm": 1.2415516417635457, "learning_rate": 1.271840314129495e-06, - "loss": 0.1925, + "loss": 0.1933, "step": 5613 }, { "epoch": 0.7679890560875513, - "grad_norm": 1.2540492735760562, + "grad_norm": 1.2726320786750978, "learning_rate": 1.270408766902338e-06, - "loss": 0.1745, + "loss": 0.1761, "step": 5614 }, { "epoch": 0.7681258549931601, - "grad_norm": 1.2956788951623512, + "grad_norm": 1.2893208313291953, "learning_rate": 1.2689779085276515e-06, - "loss": 0.1916, + "loss": 0.1917, "step": 5615 }, { "epoch": 0.7682626538987688, - "grad_norm": 1.3239732361711913, + "grad_norm": 1.3267555440206986, "learning_rate": 1.267547739269714e-06, - "loss": 0.1798, + "loss": 0.1795, "step": 5616 }, { "epoch": 0.7683994528043776, - "grad_norm": 1.1276489959236435, + "grad_norm": 1.127251938588134, "learning_rate": 1.2661182593926753e-06, - "loss": 0.1697, + "loss": 0.1707, "step": 5617 }, { "epoch": 0.7685362517099863, - "grad_norm": 1.31708587192414, + "grad_norm": 1.3211982014020491, "learning_rate": 1.264689469160562e-06, - "loss": 0.1563, + "loss": 0.155, "step": 5618 }, { "epoch": 0.7686730506155951, - "grad_norm": 1.3180442565829706, + "grad_norm": 1.3154643007895068, "learning_rate": 1.263261368837267e-06, - "loss": 0.1701, + "loss": 0.1712, "step": 5619 }, { "epoch": 0.7688098495212038, - "grad_norm": 1.2712882117935305, + "grad_norm": 1.2623249205930815, "learning_rate": 1.2618339586865624e-06, - "loss": 0.1621, + "loss": 0.1625, "step": 5620 }, { "epoch": 0.7689466484268126, - "grad_norm": 1.4570733695083142, + "grad_norm": 1.4569173801217428, "learning_rate": 1.2604072389720862e-06, - "loss": 0.2325, + "loss": 0.2333, "step": 5621 }, { "epoch": 0.7690834473324213, - "grad_norm": 1.4624932508619033, + "grad_norm": 1.4343542673366896, "learning_rate": 1.258981209957356e-06, - "loss": 0.2135, + "loss": 0.2111, "step": 5622 }, { "epoch": 0.7692202462380301, - "grad_norm": 1.4515029423345318, + "grad_norm": 1.447912499536399, "learning_rate": 1.2575558719057551e-06, - "loss": 0.1632, + "loss": 0.1642, "step": 5623 }, { "epoch": 0.7693570451436389, - "grad_norm": 1.4352284273098603, + "grad_norm": 1.4227988689044055, "learning_rate": 1.2561312250805435e-06, - "loss": 0.1722, + "loss": 0.1728, "step": 5624 }, { "epoch": 0.7694938440492476, - "grad_norm": 1.3372345876789007, + "grad_norm": 1.3186418520231693, "learning_rate": 1.2547072697448543e-06, - "loss": 0.1814, + "loss": 0.1811, "step": 5625 }, { "epoch": 0.7696306429548564, - "grad_norm": 1.2289333051428244, + "grad_norm": 1.2584921713486623, "learning_rate": 1.2532840061616869e-06, - "loss": 0.176, + "loss": 0.1762, "step": 5626 }, { "epoch": 0.7697674418604651, - "grad_norm": 1.199457884188229, + "grad_norm": 1.194599935931612, "learning_rate": 1.2518614345939212e-06, - "loss": 0.1599, + "loss": 0.1595, "step": 5627 }, { "epoch": 0.7699042407660739, - "grad_norm": 1.1454108896965816, + "grad_norm": 1.1401162391003192, "learning_rate": 1.2504395553043008e-06, - "loss": 0.1574, + "loss": 0.1586, "step": 5628 }, { "epoch": 0.7700410396716826, - "grad_norm": 1.2655279661031564, + "grad_norm": 1.2465236931541785, "learning_rate": 1.249018368555448e-06, - "loss": 0.1914, + "loss": 0.189, "step": 5629 }, { "epoch": 0.7701778385772914, - "grad_norm": 1.2890979177247286, + "grad_norm": 1.2684694452050147, "learning_rate": 1.2475978746098549e-06, "loss": 0.1756, "step": 5630 }, { "epoch": 0.7703146374829002, - "grad_norm": 1.3126514506632943, + "grad_norm": 1.2958455823567627, "learning_rate": 1.2461780737298851e-06, - "loss": 0.2036, + "loss": 0.2046, "step": 5631 }, { "epoch": 0.7704514363885089, - "grad_norm": 1.2267072518189865, + "grad_norm": 1.2309179923303932, "learning_rate": 1.2447589661777758e-06, - "loss": 0.173, + "loss": 0.1737, "step": 5632 }, { "epoch": 0.7705882352941177, - "grad_norm": 1.4376151320355672, + "grad_norm": 1.4261033074620455, "learning_rate": 1.2433405522156334e-06, - "loss": 0.2117, + "loss": 0.2126, "step": 5633 }, { "epoch": 0.7707250341997264, - "grad_norm": 1.3014813968983998, + "grad_norm": 1.3475193710201376, "learning_rate": 1.2419228321054395e-06, - "loss": 0.1908, + "loss": 0.1937, "step": 5634 }, { "epoch": 0.7708618331053352, - "grad_norm": 1.1860299039208642, + "grad_norm": 1.1616505303286524, "learning_rate": 1.2405058061090431e-06, - "loss": 0.1763, + "loss": 0.1762, "step": 5635 }, { "epoch": 0.7709986320109439, - "grad_norm": 1.1597485603463176, + "grad_norm": 1.1381429362452384, "learning_rate": 1.239089474488171e-06, - "loss": 0.1425, + "loss": 0.1423, "step": 5636 }, { "epoch": 0.7711354309165527, - "grad_norm": 1.4034670539033136, + "grad_norm": 1.3916881145711406, "learning_rate": 1.2376738375044156e-06, - "loss": 0.2078, + "loss": 0.2061, "step": 5637 }, { "epoch": 0.7712722298221614, - "grad_norm": 1.6559518362485401, + "grad_norm": 1.606409642555358, "learning_rate": 1.2362588954192468e-06, - "loss": 0.2408, + "loss": 0.244, "step": 5638 }, { "epoch": 0.7714090287277702, - "grad_norm": 1.0470391023912395, + "grad_norm": 1.148363798848283, "learning_rate": 1.2348446484939996e-06, - "loss": 0.1686, + "loss": 0.1697, "step": 5639 }, { "epoch": 0.771545827633379, - "grad_norm": 1.351187761377191, + "grad_norm": 1.329521367701714, "learning_rate": 1.2334310969898872e-06, - "loss": 0.1862, + "loss": 0.1841, "step": 5640 }, { "epoch": 0.7716826265389877, - "grad_norm": 1.2936283245955322, + "grad_norm": 1.2777703830402027, "learning_rate": 1.2320182411679914e-06, - "loss": 0.1797, + "loss": 0.1782, "step": 5641 }, { "epoch": 0.7718194254445965, - "grad_norm": 1.0850648855629421, + "grad_norm": 1.0770106112859665, "learning_rate": 1.2306060812892635e-06, - "loss": 0.1563, + "loss": 0.1558, "step": 5642 }, { "epoch": 0.7719562243502052, - "grad_norm": 1.111265951492563, + "grad_norm": 1.115573236634839, "learning_rate": 1.2291946176145307e-06, - "loss": 0.156, + "loss": 0.1559, "step": 5643 }, { "epoch": 0.772093023255814, - "grad_norm": 1.3652058412443353, + "grad_norm": 1.3525801628690886, "learning_rate": 1.227783850404487e-06, - "loss": 0.1988, + "loss": 0.1983, "step": 5644 }, { "epoch": 0.7722298221614227, - "grad_norm": 1.2984996632528025, + "grad_norm": 1.3055565874810793, "learning_rate": 1.2263737799197006e-06, - "loss": 0.1803, + "loss": 0.1825, "step": 5645 }, { "epoch": 0.7723666210670315, - "grad_norm": 1.3748117452850133, + "grad_norm": 1.3761805359727195, "learning_rate": 1.2249644064206107e-06, - "loss": 0.1802, + "loss": 0.1828, "step": 5646 }, { "epoch": 0.7725034199726403, - "grad_norm": 1.0697881686781856, + "grad_norm": 1.068336653530424, "learning_rate": 1.2235557301675276e-06, - "loss": 0.1499, + "loss": 0.1487, "step": 5647 }, { "epoch": 0.772640218878249, - "grad_norm": 1.5048133254766463, + "grad_norm": 1.490381058193766, "learning_rate": 1.2221477514206336e-06, - "loss": 0.2067, + "loss": 0.2072, "step": 5648 }, { "epoch": 0.7727770177838578, - "grad_norm": 1.1738436274265875, + "grad_norm": 1.1820364831296524, "learning_rate": 1.2207404704399795e-06, - "loss": 0.1771, + "loss": 0.1782, "step": 5649 }, { "epoch": 0.7729138166894665, - "grad_norm": 1.0052952605148897, + "grad_norm": 0.9908330919645505, "learning_rate": 1.2193338874854904e-06, - "loss": 0.1405, + "loss": 0.141, "step": 5650 }, { "epoch": 0.7730506155950753, - "grad_norm": 1.1808989597926975, + "grad_norm": 1.18200909680159, "learning_rate": 1.2179280028169588e-06, - "loss": 0.1691, + "loss": 0.1679, "step": 5651 }, { "epoch": 0.773187414500684, - "grad_norm": 1.314283941321332, + "grad_norm": 1.2910825761306957, "learning_rate": 1.216522816694053e-06, - "loss": 0.2128, + "loss": 0.215, "step": 5652 }, { "epoch": 0.7733242134062928, - "grad_norm": 1.243859733614085, + "grad_norm": 1.2323289641372352, "learning_rate": 1.2151183293763074e-06, - "loss": 0.186, + "loss": 0.1873, "step": 5653 }, { "epoch": 0.7734610123119015, - "grad_norm": 1.2591539823798041, + "grad_norm": 1.2531849592870865, "learning_rate": 1.2137145411231328e-06, - "loss": 0.1749, + "loss": 0.1751, "step": 5654 }, { "epoch": 0.7735978112175103, - "grad_norm": 1.4458198402261373, + "grad_norm": 1.4479368722858728, "learning_rate": 1.212311452193804e-06, - "loss": 0.2184, + "loss": 0.2188, "step": 5655 }, { "epoch": 0.7737346101231191, - "grad_norm": 1.1516837810706606, + "grad_norm": 1.153344745939682, "learning_rate": 1.2109090628474717e-06, - "loss": 0.1705, + "loss": 0.1741, "step": 5656 }, { "epoch": 0.7738714090287278, - "grad_norm": 1.2639967223745439, + "grad_norm": 1.2364420482053666, "learning_rate": 1.2095073733431589e-06, - "loss": 0.2036, + "loss": 0.2011, "step": 5657 }, { "epoch": 0.7740082079343366, - "grad_norm": 0.9866401496032646, + "grad_norm": 0.9892563773636147, "learning_rate": 1.2081063839397522e-06, - "loss": 0.1272, + "loss": 0.1278, "step": 5658 }, { "epoch": 0.7741450068399452, - "grad_norm": 1.1721569993695495, + "grad_norm": 1.1555644202287223, "learning_rate": 1.2067060948960153e-06, - "loss": 0.1777, + "loss": 0.1775, "step": 5659 }, { "epoch": 0.774281805745554, - "grad_norm": 1.2789085034182492, + "grad_norm": 1.2615620495382112, "learning_rate": 1.2053065064705804e-06, - "loss": 0.1648, + "loss": 0.1657, "step": 5660 }, { "epoch": 0.7744186046511627, - "grad_norm": 1.2064237252906174, + "grad_norm": 1.1656383229537195, "learning_rate": 1.2039076189219517e-06, - "loss": 0.1917, + "loss": 0.1874, "step": 5661 }, { "epoch": 0.7745554035567715, - "grad_norm": 1.25422248313288, + "grad_norm": 1.2472860477328913, "learning_rate": 1.2025094325084995e-06, - "loss": 0.1512, + "loss": 0.1514, "step": 5662 }, { "epoch": 0.7746922024623804, - "grad_norm": 1.0860548371811023, + "grad_norm": 1.0866913735897337, "learning_rate": 1.2011119474884698e-06, - "loss": 0.1663, + "loss": 0.1656, "step": 5663 }, { "epoch": 0.774829001367989, - "grad_norm": 1.0905130188488703, + "grad_norm": 1.077365099148018, "learning_rate": 1.1997151641199772e-06, - "loss": 0.1445, + "loss": 0.1435, "step": 5664 }, { "epoch": 0.7749658002735978, - "grad_norm": 1.0426636034011199, + "grad_norm": 1.0095161124242946, "learning_rate": 1.1983190826610052e-06, - "loss": 0.158, + "loss": 0.1552, "step": 5665 }, { "epoch": 0.7751025991792065, - "grad_norm": 1.4181712950828533, + "grad_norm": 1.4225618207929505, "learning_rate": 1.1969237033694104e-06, - "loss": 0.1846, + "loss": 0.1844, "step": 5666 }, { "epoch": 0.7752393980848153, - "grad_norm": 1.2023373559842945, + "grad_norm": 1.2227026930434766, "learning_rate": 1.1955290265029162e-06, - "loss": 0.1719, + "loss": 0.1726, "step": 5667 }, { "epoch": 0.775376196990424, - "grad_norm": 1.2899079647593943, + "grad_norm": 1.28566452998815, "learning_rate": 1.194135052319121e-06, - "loss": 0.19, + "loss": 0.1902, "step": 5668 }, { "epoch": 0.7755129958960328, - "grad_norm": 1.1486182842639119, + "grad_norm": 1.139051960078, "learning_rate": 1.192741781075487e-06, - "loss": 0.1965, + "loss": 0.1961, "step": 5669 }, { "epoch": 0.7756497948016415, - "grad_norm": 1.1153262503126211, + "grad_norm": 1.1067347920867276, "learning_rate": 1.1913492130293542e-06, - "loss": 0.133, + "loss": 0.1336, "step": 5670 }, { "epoch": 0.7757865937072503, - "grad_norm": 1.1717381270507925, + "grad_norm": 1.1641618790263877, "learning_rate": 1.189957348437925e-06, - "loss": 0.1525, + "loss": 0.1522, "step": 5671 }, { "epoch": 0.7759233926128591, - "grad_norm": 1.4089541250623931, + "grad_norm": 1.3793531368187129, "learning_rate": 1.1885661875582781e-06, - "loss": 0.1768, + "loss": 0.177, "step": 5672 }, { "epoch": 0.7760601915184678, - "grad_norm": 1.5342831034270967, + "grad_norm": 1.5306803603093007, "learning_rate": 1.1871757306473596e-06, - "loss": 0.1971, + "loss": 0.197, "step": 5673 }, { "epoch": 0.7761969904240766, - "grad_norm": 1.2584280768459224, + "grad_norm": 1.286950526449913, "learning_rate": 1.1857859779619863e-06, - "loss": 0.169, + "loss": 0.1732, "step": 5674 }, { "epoch": 0.7763337893296853, - "grad_norm": 1.2868257642883612, + "grad_norm": 1.2863709600395818, "learning_rate": 1.1843969297588427e-06, - "loss": 0.2132, + "loss": 0.214, "step": 5675 }, { "epoch": 0.7764705882352941, - "grad_norm": 1.1995094039049983, + "grad_norm": 1.200496189519223, "learning_rate": 1.1830085862944851e-06, - "loss": 0.1488, + "loss": 0.1501, "step": 5676 }, { "epoch": 0.7766073871409028, - "grad_norm": 1.2425736678115156, + "grad_norm": 1.2254426119849762, "learning_rate": 1.1816209478253416e-06, - "loss": 0.1557, + "loss": 0.1569, "step": 5677 }, { "epoch": 0.7767441860465116, - "grad_norm": 1.28876756274005, + "grad_norm": 1.2801722115609724, "learning_rate": 1.1802340146077045e-06, - "loss": 0.1835, + "loss": 0.1846, "step": 5678 }, { "epoch": 0.7768809849521204, - "grad_norm": 1.123001730506387, + "grad_norm": 1.1100196268240454, "learning_rate": 1.1788477868977416e-06, - "loss": 0.1915, + "loss": 0.1923, "step": 5679 }, { "epoch": 0.7770177838577291, - "grad_norm": 1.2843935243989366, + "grad_norm": 1.256462112547854, "learning_rate": 1.177462264951489e-06, - "loss": 0.1731, + "loss": 0.1735, "step": 5680 }, { "epoch": 0.7771545827633379, - "grad_norm": 1.1495648711554856, + "grad_norm": 1.1580041427866792, "learning_rate": 1.1760774490248482e-06, - "loss": 0.1857, + "loss": 0.1895, "step": 5681 }, { "epoch": 0.7772913816689466, - "grad_norm": 1.2204280847352666, + "grad_norm": 1.209592083323075, "learning_rate": 1.174693339373597e-06, "loss": 0.1775, "step": 5682 }, { "epoch": 0.7774281805745554, - "grad_norm": 1.2190630937780844, + "grad_norm": 1.2074411989659406, "learning_rate": 1.1733099362533762e-06, - "loss": 0.1643, + "loss": 0.166, "step": 5683 }, { "epoch": 0.7775649794801641, - "grad_norm": 1.3298795998386603, + "grad_norm": 1.3373604942141801, "learning_rate": 1.1719272399197024e-06, - "loss": 0.2009, + "loss": 0.2017, "step": 5684 }, { "epoch": 0.7777017783857729, - "grad_norm": 1.4623743923697734, + "grad_norm": 1.4010510551622775, "learning_rate": 1.1705452506279547e-06, - "loss": 0.2125, + "loss": 0.2092, "step": 5685 }, { "epoch": 0.7778385772913816, - "grad_norm": 1.408220522257209, + "grad_norm": 1.3823758614878296, "learning_rate": 1.1691639686333893e-06, - "loss": 0.1972, + "loss": 0.1976, "step": 5686 }, { "epoch": 0.7779753761969904, - "grad_norm": 1.3920704609538697, + "grad_norm": 1.4058755367423668, "learning_rate": 1.167783394191125e-06, - "loss": 0.1599, + "loss": 0.1619, "step": 5687 }, { "epoch": 0.7781121751025992, - "grad_norm": 1.2607727570092466, + "grad_norm": 1.1685615222293702, "learning_rate": 1.166403527556153e-06, - "loss": 0.1797, + "loss": 0.1777, "step": 5688 }, { "epoch": 0.7782489740082079, - "grad_norm": 1.2965424331060438, + "grad_norm": 1.2810590361299894, "learning_rate": 1.1650243689833351e-06, - "loss": 0.193, + "loss": 0.1921, "step": 5689 }, { "epoch": 0.7783857729138167, - "grad_norm": 1.3575564261045003, + "grad_norm": 1.355117116516516, "learning_rate": 1.1636459187273996e-06, - "loss": 0.1802, + "loss": 0.1819, "step": 5690 }, { "epoch": 0.7785225718194254, - "grad_norm": 1.1087208957081571, + "grad_norm": 1.0933524178581036, "learning_rate": 1.1622681770429473e-06, - "loss": 0.158, + "loss": 0.159, "step": 5691 }, { "epoch": 0.7786593707250342, - "grad_norm": 1.2984638355800462, + "grad_norm": 1.292606991347369, "learning_rate": 1.1608911441844427e-06, - "loss": 0.1981, + "loss": 0.1977, "step": 5692 }, { "epoch": 0.7787961696306429, - "grad_norm": 1.1472272927280076, + "grad_norm": 1.1442212418234046, "learning_rate": 1.1595148204062256e-06, - "loss": 0.1546, + "loss": 0.1552, "step": 5693 }, { "epoch": 0.7789329685362517, - "grad_norm": 1.0573703059405897, + "grad_norm": 1.0169843491682353, "learning_rate": 1.158139205962499e-06, - "loss": 0.1497, + "loss": 0.1494, "step": 5694 }, { "epoch": 0.7790697674418605, - "grad_norm": 1.487420308976388, + "grad_norm": 1.4717624553945063, "learning_rate": 1.1567643011073393e-06, - "loss": 0.2048, + "loss": 0.2072, "step": 5695 }, { "epoch": 0.7792065663474692, - "grad_norm": 1.2272698099680879, + "grad_norm": 1.221886580442853, "learning_rate": 1.155390106094692e-06, - "loss": 0.1528, + "loss": 0.1534, "step": 5696 }, { "epoch": 0.779343365253078, - "grad_norm": 1.2354265552180057, + "grad_norm": 1.2231988442440365, "learning_rate": 1.154016621178366e-06, - "loss": 0.1693, + "loss": 0.1692, "step": 5697 }, { "epoch": 0.7794801641586867, - "grad_norm": 1.1084590306142923, + "grad_norm": 1.1187236144013208, "learning_rate": 1.1526438466120472e-06, - "loss": 0.1402, + "loss": 0.1412, "step": 5698 }, { "epoch": 0.7796169630642955, - "grad_norm": 1.235761925076978, + "grad_norm": 1.2031591158931125, "learning_rate": 1.1512717826492815e-06, - "loss": 0.1797, + "loss": 0.18, "step": 5699 }, { "epoch": 0.7797537619699042, - "grad_norm": 1.1991105573691783, + "grad_norm": 1.1862602069488914, "learning_rate": 1.1499004295434919e-06, - "loss": 0.1405, + "loss": 0.1404, "step": 5700 }, { "epoch": 0.7797537619699042, - "eval_loss": 0.1736205816268921, - "eval_runtime": 5.924, - "eval_samples_per_second": 5.064, - "eval_steps_per_second": 1.35, + "eval_loss": 0.1741250604391098, + "eval_runtime": 5.9079, + "eval_samples_per_second": 5.078, + "eval_steps_per_second": 1.354, "step": 5700 }, { "epoch": 0.779890560875513, - "grad_norm": 1.2338681228808925, + "grad_norm": 1.2378184630599067, "learning_rate": 1.1485297875479628e-06, - "loss": 0.1683, + "loss": 0.1678, "step": 5701 }, { "epoch": 0.7800273597811217, - "grad_norm": 1.0143688628085898, + "grad_norm": 1.0083631668899173, "learning_rate": 1.1471598569158525e-06, - "loss": 0.1697, + "loss": 0.1704, "step": 5702 }, { "epoch": 0.7801641586867305, - "grad_norm": 1.3535127912507305, + "grad_norm": 1.3476488215437328, "learning_rate": 1.1457906379001865e-06, - "loss": 0.2133, + "loss": 0.2145, "step": 5703 }, { "epoch": 0.7803009575923393, - "grad_norm": 1.40023766835393, + "grad_norm": 1.38660761851928, "learning_rate": 1.1444221307538572e-06, - "loss": 0.1919, + "loss": 0.1933, "step": 5704 }, { "epoch": 0.780437756497948, - "grad_norm": 1.1041502969710693, + "grad_norm": 1.1096307825427536, "learning_rate": 1.1430543357296286e-06, - "loss": 0.157, + "loss": 0.1569, "step": 5705 }, { "epoch": 0.7805745554035568, - "grad_norm": 1.0731300675646658, + "grad_norm": 1.0620131270944666, "learning_rate": 1.1416872530801294e-06, "loss": 0.1659, "step": 5706 }, { "epoch": 0.7807113543091655, - "grad_norm": 1.2331426312461387, + "grad_norm": 1.2321063231452405, "learning_rate": 1.1403208830578606e-06, - "loss": 0.1588, + "loss": 0.1609, "step": 5707 }, { "epoch": 0.7808481532147743, - "grad_norm": 1.095987993246331, + "grad_norm": 1.0920019406584986, "learning_rate": 1.1389552259151864e-06, - "loss": 0.167, + "loss": 0.1682, "step": 5708 }, { "epoch": 0.780984952120383, - "grad_norm": 1.2323585839676277, + "grad_norm": 1.219610004426305, "learning_rate": 1.1375902819043467e-06, - "loss": 0.1714, + "loss": 0.1698, "step": 5709 }, { "epoch": 0.7811217510259918, - "grad_norm": 1.2785539711947622, + "grad_norm": 1.280212580755052, "learning_rate": 1.1362260512774414e-06, - "loss": 0.1734, + "loss": 0.1711, "step": 5710 }, { "epoch": 0.7812585499316006, - "grad_norm": 1.4061951043866212, + "grad_norm": 1.3855758287904185, "learning_rate": 1.1348625342864455e-06, - "loss": 0.2043, + "loss": 0.2021, "step": 5711 }, { "epoch": 0.7813953488372093, - "grad_norm": 1.0405205222009524, + "grad_norm": 1.0317651902999274, "learning_rate": 1.1334997311832003e-06, - "loss": 0.1684, + "loss": 0.1678, "step": 5712 }, { "epoch": 0.7815321477428181, - "grad_norm": 1.1668320058865695, + "grad_norm": 1.144932685328499, "learning_rate": 1.1321376422194109e-06, - "loss": 0.188, + "loss": 0.1875, "step": 5713 }, { "epoch": 0.7816689466484268, - "grad_norm": 1.052993805320677, + "grad_norm": 1.0652488943815652, "learning_rate": 1.1307762676466578e-06, - "loss": 0.1598, + "loss": 0.1612, "step": 5714 }, { "epoch": 0.7818057455540356, - "grad_norm": 1.1404156695093692, + "grad_norm": 1.1351058352504415, "learning_rate": 1.1294156077163827e-06, - "loss": 0.1495, + "loss": 0.151, "step": 5715 }, { "epoch": 0.7819425444596443, - "grad_norm": 1.1580484919565548, + "grad_norm": 1.1501917601432596, "learning_rate": 1.1280556626799006e-06, - "loss": 0.1519, + "loss": 0.1516, "step": 5716 }, { "epoch": 0.7820793433652531, - "grad_norm": 1.147004838650765, + "grad_norm": 1.1194373627573502, "learning_rate": 1.1266964327883907e-06, - "loss": 0.1494, + "loss": 0.1474, "step": 5717 }, { "epoch": 0.7822161422708618, - "grad_norm": 1.1684261364026283, + "grad_norm": 1.1729065617248025, "learning_rate": 1.1253379182929014e-06, - "loss": 0.1844, + "loss": 0.1854, "step": 5718 }, { "epoch": 0.7823529411764706, - "grad_norm": 1.4311470679233123, + "grad_norm": 1.2705704777687696, "learning_rate": 1.1239801194443507e-06, - "loss": 0.1637, + "loss": 0.165, "step": 5719 }, { "epoch": 0.7824897400820794, - "grad_norm": 1.2997717561978275, + "grad_norm": 1.2855074839256448, "learning_rate": 1.1226230364935225e-06, - "loss": 0.1992, + "loss": 0.1998, "step": 5720 }, { "epoch": 0.7826265389876881, - "grad_norm": 1.0309491586985517, + "grad_norm": 1.0320090412510774, "learning_rate": 1.1212666696910701e-06, - "loss": 0.1606, + "loss": 0.162, "step": 5721 }, { "epoch": 0.7827633378932969, - "grad_norm": 1.3495884315136086, + "grad_norm": 1.35762458771863, "learning_rate": 1.1199110192875106e-06, - "loss": 0.1745, + "loss": 0.1729, "step": 5722 }, { "epoch": 0.7829001367989056, - "grad_norm": 1.3744557862185052, + "grad_norm": 1.3513431266608882, "learning_rate": 1.1185560855332346e-06, - "loss": 0.2084, + "loss": 0.2066, "step": 5723 }, { "epoch": 0.7830369357045144, - "grad_norm": 1.3596973540765753, + "grad_norm": 1.3224051991362253, "learning_rate": 1.1172018686784936e-06, - "loss": 0.1967, + "loss": 0.1951, "step": 5724 }, { "epoch": 0.7831737346101231, - "grad_norm": 1.2129346947127997, + "grad_norm": 1.231900566002706, "learning_rate": 1.1158483689734146e-06, - "loss": 0.1962, + "loss": 0.1961, "step": 5725 }, { "epoch": 0.7833105335157319, - "grad_norm": 1.4270672329437246, + "grad_norm": 1.3831355525377773, "learning_rate": 1.1144955866679835e-06, - "loss": 0.1882, + "loss": 0.1896, "step": 5726 }, { "epoch": 0.7834473324213407, - "grad_norm": 1.191637043007029, + "grad_norm": 1.1845255670347916, "learning_rate": 1.1131435220120606e-06, - "loss": 0.1563, + "loss": 0.1559, "step": 5727 }, { "epoch": 0.7835841313269494, - "grad_norm": 1.0353782473290838, + "grad_norm": 1.0285063686243623, "learning_rate": 1.1117921752553724e-06, - "loss": 0.1552, + "loss": 0.1575, "step": 5728 }, { "epoch": 0.7837209302325582, - "grad_norm": 1.3675362405823908, + "grad_norm": 1.3582469672493072, "learning_rate": 1.1104415466475088e-06, - "loss": 0.1747, + "loss": 0.175, "step": 5729 }, { "epoch": 0.7838577291381669, - "grad_norm": 1.185437120497967, + "grad_norm": 1.1792162676754645, "learning_rate": 1.1090916364379317e-06, - "loss": 0.1621, + "loss": 0.1643, "step": 5730 }, { "epoch": 0.7839945280437757, - "grad_norm": 1.188327194415033, + "grad_norm": 1.1846906245761764, "learning_rate": 1.1077424448759666e-06, - "loss": 0.1848, + "loss": 0.1855, "step": 5731 }, { "epoch": 0.7841313269493844, - "grad_norm": 1.432888007491133, + "grad_norm": 1.3977423574070602, "learning_rate": 1.1063939722108092e-06, - "loss": 0.1855, + "loss": 0.1858, "step": 5732 }, { "epoch": 0.7842681258549932, - "grad_norm": 1.2047812751616085, + "grad_norm": 1.1960283047765088, "learning_rate": 1.1050462186915217e-06, - "loss": 0.1764, + "loss": 0.1766, "step": 5733 }, { "epoch": 0.7844049247606019, - "grad_norm": 1.4615029656696117, + "grad_norm": 1.4564511772213267, "learning_rate": 1.1036991845670336e-06, - "loss": 0.2015, + "loss": 0.2016, "step": 5734 }, { "epoch": 0.7845417236662107, - "grad_norm": 1.4869990834508862, + "grad_norm": 1.4735845254191688, "learning_rate": 1.1023528700861385e-06, - "loss": 0.2081, + "loss": 0.2077, "step": 5735 }, { "epoch": 0.7846785225718195, - "grad_norm": 1.5648155197386728, + "grad_norm": 1.5285969193007347, "learning_rate": 1.1010072754975016e-06, - "loss": 0.2678, + "loss": 0.2658, "step": 5736 }, { "epoch": 0.7848153214774282, - "grad_norm": 1.0177700444225029, + "grad_norm": 1.0148492254791608, "learning_rate": 1.0996624010496536e-06, - "loss": 0.1545, + "loss": 0.1563, "step": 5737 }, { "epoch": 0.784952120383037, - "grad_norm": 1.0225432738556857, + "grad_norm": 1.013856787462601, "learning_rate": 1.0983182469909898e-06, - "loss": 0.1495, + "loss": 0.1503, "step": 5738 }, { "epoch": 0.7850889192886457, - "grad_norm": 1.3915763357158284, + "grad_norm": 1.3688404833420538, "learning_rate": 1.0969748135697767e-06, - "loss": 0.2002, + "loss": 0.1974, "step": 5739 }, { "epoch": 0.7852257181942545, - "grad_norm": 1.2759163496111012, + "grad_norm": 1.261755825942336, "learning_rate": 1.095632101034143e-06, - "loss": 0.2057, + "loss": 0.2085, "step": 5740 }, { "epoch": 0.7853625170998632, - "grad_norm": 1.2034936858650673, + "grad_norm": 1.2025112240701643, "learning_rate": 1.0942901096320884e-06, - "loss": 0.1878, + "loss": 0.1879, "step": 5741 }, { "epoch": 0.785499316005472, - "grad_norm": 1.5115362058503699, + "grad_norm": 1.5156555549815587, "learning_rate": 1.0929488396114756e-06, - "loss": 0.179, + "loss": 0.1783, "step": 5742 }, { "epoch": 0.7856361149110808, - "grad_norm": 1.2978224550327584, + "grad_norm": 1.298516418188128, "learning_rate": 1.0916082912200377e-06, - "loss": 0.1589, + "loss": 0.1594, "step": 5743 }, { "epoch": 0.7857729138166895, - "grad_norm": 1.2340326256400604, + "grad_norm": 1.223104140118789, "learning_rate": 1.090268464705374e-06, - "loss": 0.1668, + "loss": 0.1662, "step": 5744 }, { "epoch": 0.7859097127222983, - "grad_norm": 1.2762834897925839, + "grad_norm": 1.2980136782215668, "learning_rate": 1.088929360314946e-06, "loss": 0.1668, "step": 5745 }, { "epoch": 0.786046511627907, - "grad_norm": 1.6389637794987504, + "grad_norm": 1.6326775210266309, "learning_rate": 1.0875909782960887e-06, - "loss": 0.2279, + "loss": 0.2249, "step": 5746 }, { "epoch": 0.7861833105335158, - "grad_norm": 1.0712146226403632, + "grad_norm": 1.0518280304530079, "learning_rate": 1.0862533188959973e-06, - "loss": 0.1496, + "loss": 0.149, "step": 5747 }, { "epoch": 0.7863201094391244, - "grad_norm": 1.2139191601282822, + "grad_norm": 1.1744651460429167, "learning_rate": 1.0849163823617376e-06, - "loss": 0.1581, + "loss": 0.1564, "step": 5748 }, { "epoch": 0.7864569083447333, - "grad_norm": 1.4869509586865655, + "grad_norm": 1.4671576262720993, "learning_rate": 1.0835801689402408e-06, - "loss": 0.2266, + "loss": 0.227, "step": 5749 }, { "epoch": 0.786593707250342, - "grad_norm": 1.1042247109289896, + "grad_norm": 1.0896676157895024, "learning_rate": 1.0822446788783059e-06, - "loss": 0.1606, + "loss": 0.1596, "step": 5750 }, { "epoch": 0.7867305061559507, - "grad_norm": 1.3386121688480854, + "grad_norm": 1.3636682313743276, "learning_rate": 1.0809099124225941e-06, - "loss": 0.1842, + "loss": 0.1836, "step": 5751 }, { "epoch": 0.7868673050615596, - "grad_norm": 1.1169475685899908, + "grad_norm": 1.0723034064254806, "learning_rate": 1.0795758698196368e-06, - "loss": 0.1666, + "loss": 0.1656, "step": 5752 }, { "epoch": 0.7870041039671682, - "grad_norm": 1.3719760684305742, + "grad_norm": 1.350970911016713, "learning_rate": 1.0782425513158317e-06, - "loss": 0.1833, + "loss": 0.1837, "step": 5753 }, { "epoch": 0.787140902872777, - "grad_norm": 1.1668510180952694, + "grad_norm": 1.1593057514239216, "learning_rate": 1.07690995715744e-06, - "loss": 0.1674, + "loss": 0.1675, "step": 5754 }, { "epoch": 0.7872777017783857, - "grad_norm": 1.088906110120661, + "grad_norm": 1.060749419214049, "learning_rate": 1.0755780875905924e-06, - "loss": 0.1613, + "loss": 0.1607, "step": 5755 }, { "epoch": 0.7874145006839945, - "grad_norm": 1.0937529579571366, + "grad_norm": 1.07336118548735, "learning_rate": 1.0742469428612818e-06, - "loss": 0.1333, + "loss": 0.1329, "step": 5756 }, { "epoch": 0.7875512995896032, - "grad_norm": 1.2575546565992972, + "grad_norm": 1.2380916850628136, "learning_rate": 1.0729165232153721e-06, - "loss": 0.1969, + "loss": 0.1962, "step": 5757 }, { "epoch": 0.787688098495212, - "grad_norm": 1.1744592790355848, + "grad_norm": 1.159639540387967, "learning_rate": 1.0715868288985881e-06, - "loss": 0.1813, + "loss": 0.1805, "step": 5758 }, { "epoch": 0.7878248974008208, - "grad_norm": 1.146800053602285, + "grad_norm": 1.1352554458023383, "learning_rate": 1.070257860156525e-06, - "loss": 0.1655, + "loss": 0.1658, "step": 5759 }, { "epoch": 0.7879616963064295, - "grad_norm": 1.1805849618074093, + "grad_norm": 1.1921850649797976, "learning_rate": 1.068929617234643e-06, - "loss": 0.1806, + "loss": 0.1834, "step": 5760 }, { "epoch": 0.7880984952120383, - "grad_norm": 1.0472640528458301, + "grad_norm": 1.0299323689349433, "learning_rate": 1.067602100378265e-06, - "loss": 0.1703, + "loss": 0.1698, "step": 5761 }, { "epoch": 0.788235294117647, - "grad_norm": 1.0807439973204969, + "grad_norm": 1.0674427456020874, "learning_rate": 1.066275309832584e-06, - "loss": 0.15, + "loss": 0.1506, "step": 5762 }, { "epoch": 0.7883720930232558, - "grad_norm": 1.297863194872636, + "grad_norm": 1.284089623375108, "learning_rate": 1.0649492458426563e-06, - "loss": 0.1704, + "loss": 0.1703, "step": 5763 }, { "epoch": 0.7885088919288645, - "grad_norm": 1.1976559365867385, + "grad_norm": 1.195773934738301, "learning_rate": 1.0636239086534073e-06, - "loss": 0.181, + "loss": 0.1834, "step": 5764 }, { "epoch": 0.7886456908344733, - "grad_norm": 1.1028573691533912, + "grad_norm": 1.0448042220379552, "learning_rate": 1.062299298509622e-06, - "loss": 0.1649, + "loss": 0.1694, "step": 5765 }, { "epoch": 0.788782489740082, - "grad_norm": 1.062986030424303, + "grad_norm": 1.0633450243640286, "learning_rate": 1.0609754156559581e-06, - "loss": 0.1591, + "loss": 0.1617, "step": 5766 }, { "epoch": 0.7889192886456908, - "grad_norm": 1.500032269018467, + "grad_norm": 1.4611716359327183, "learning_rate": 1.059652260336933e-06, - "loss": 0.2207, + "loss": 0.2194, "step": 5767 }, { "epoch": 0.7890560875512996, - "grad_norm": 1.3055975762137757, + "grad_norm": 1.3079647273082304, "learning_rate": 1.0583298327969338e-06, - "loss": 0.1686, + "loss": 0.1702, "step": 5768 }, { "epoch": 0.7891928864569083, - "grad_norm": 1.5496363648581197, + "grad_norm": 1.5299594293120136, "learning_rate": 1.0570081332802124e-06, - "loss": 0.2316, + "loss": 0.234, "step": 5769 }, { "epoch": 0.7893296853625171, - "grad_norm": 1.2018997894222492, + "grad_norm": 1.2034070760572515, "learning_rate": 1.0556871620308834e-06, - "loss": 0.1852, + "loss": 0.1879, "step": 5770 }, { "epoch": 0.7894664842681258, - "grad_norm": 1.2782902369600198, + "grad_norm": 1.2553538987627193, "learning_rate": 1.0543669192929323e-06, - "loss": 0.2217, + "loss": 0.2222, "step": 5771 }, { "epoch": 0.7896032831737346, - "grad_norm": 1.0476705273864928, + "grad_norm": 1.04359158096775, "learning_rate": 1.0530474053102036e-06, - "loss": 0.1587, + "loss": 0.1581, "step": 5772 }, { "epoch": 0.7897400820793433, - "grad_norm": 1.447533769240206, + "grad_norm": 1.4425794202899562, "learning_rate": 1.0517286203264131e-06, - "loss": 0.173, + "loss": 0.1728, "step": 5773 }, { "epoch": 0.7898768809849521, - "grad_norm": 1.3066470467593578, + "grad_norm": 1.2655712740203766, "learning_rate": 1.050410564585137e-06, - "loss": 0.1796, + "loss": 0.1785, "step": 5774 }, { "epoch": 0.7900136798905609, - "grad_norm": 1.5013914968340356, + "grad_norm": 1.47889224707471, "learning_rate": 1.0490932383298203e-06, - "loss": 0.1737, + "loss": 0.1733, "step": 5775 }, { "epoch": 0.7901504787961696, - "grad_norm": 1.4932464777673256, + "grad_norm": 1.4887376378791475, "learning_rate": 1.0477766418037722e-06, - "loss": 0.2054, + "loss": 0.2047, "step": 5776 }, { "epoch": 0.7902872777017784, - "grad_norm": 1.022434258623563, + "grad_norm": 1.0082074385934434, "learning_rate": 1.0464607752501682e-06, - "loss": 0.1519, + "loss": 0.1527, "step": 5777 }, { "epoch": 0.7904240766073871, - "grad_norm": 1.1046544790230148, + "grad_norm": 1.0966050339768112, "learning_rate": 1.0451456389120445e-06, - "loss": 0.1453, + "loss": 0.1459, "step": 5778 }, { "epoch": 0.7905608755129959, - "grad_norm": 1.349657156670974, + "grad_norm": 1.348530047907173, "learning_rate": 1.0438312330323086e-06, - "loss": 0.1757, + "loss": 0.1771, "step": 5779 }, { "epoch": 0.7906976744186046, - "grad_norm": 1.3443649410801852, + "grad_norm": 1.3224057683275798, "learning_rate": 1.04251755785373e-06, - "loss": 0.1884, + "loss": 0.187, "step": 5780 }, { "epoch": 0.7908344733242134, - "grad_norm": 1.1174172800182147, + "grad_norm": 1.1229811236670282, "learning_rate": 1.0412046136189414e-06, - "loss": 0.137, + "loss": 0.135, "step": 5781 }, { "epoch": 0.7909712722298221, - "grad_norm": 1.4244837953454597, + "grad_norm": 1.4275231196341722, "learning_rate": 1.0398924005704453e-06, - "loss": 0.1863, + "loss": 0.1866, "step": 5782 }, { "epoch": 0.7911080711354309, - "grad_norm": 1.058756529360784, + "grad_norm": 1.030124399287971, "learning_rate": 1.038580918950604e-06, - "loss": 0.133, + "loss": 0.1311, "step": 5783 }, { "epoch": 0.7912448700410397, - "grad_norm": 1.284485122575856, + "grad_norm": 1.2705305067810828, "learning_rate": 1.0372701690016474e-06, - "loss": 0.1961, + "loss": 0.1974, "step": 5784 }, { "epoch": 0.7913816689466484, - "grad_norm": 1.2364812755905223, + "grad_norm": 1.2281418310996814, "learning_rate": 1.0359601509656724e-06, - "loss": 0.2023, + "loss": 0.1993, "step": 5785 }, { "epoch": 0.7915184678522572, - "grad_norm": 1.3407641090683644, + "grad_norm": 1.3396378524552202, "learning_rate": 1.0346508650846349e-06, - "loss": 0.1542, + "loss": 0.1543, "step": 5786 }, { "epoch": 0.7916552667578659, - "grad_norm": 1.275162970557446, + "grad_norm": 1.2520151107695603, "learning_rate": 1.0333423116003617e-06, - "loss": 0.1554, + "loss": 0.1542, "step": 5787 }, { "epoch": 0.7917920656634747, - "grad_norm": 1.2586822406423626, + "grad_norm": 1.253444217252233, "learning_rate": 1.0320344907545388e-06, - "loss": 0.1903, + "loss": 0.1913, "step": 5788 }, { "epoch": 0.7919288645690834, - "grad_norm": 0.9828403205865767, + "grad_norm": 0.9827006850095013, "learning_rate": 1.0307274027887232e-06, - "loss": 0.141, + "loss": 0.1411, "step": 5789 }, { "epoch": 0.7920656634746922, - "grad_norm": 1.385143213442881, + "grad_norm": 1.37134381675515, "learning_rate": 1.029421047944329e-06, - "loss": 0.2084, + "loss": 0.2065, "step": 5790 }, { "epoch": 0.792202462380301, - "grad_norm": 1.4537243882157478, + "grad_norm": 1.442818155960312, "learning_rate": 1.0281154264626414e-06, - "loss": 0.2267, + "loss": 0.2272, "step": 5791 }, { "epoch": 0.7923392612859097, - "grad_norm": 1.3452787483706818, + "grad_norm": 1.336158581906642, "learning_rate": 1.0268105385848065e-06, - "loss": 0.1858, + "loss": 0.1866, "step": 5792 }, { "epoch": 0.7924760601915185, - "grad_norm": 1.22868851254295, + "grad_norm": 1.2068842143531184, "learning_rate": 1.0255063845518371e-06, - "loss": 0.1695, + "loss": 0.1689, "step": 5793 }, { "epoch": 0.7926128590971272, - "grad_norm": 1.026901851268176, + "grad_norm": 1.0220675578419363, "learning_rate": 1.0242029646046098e-06, - "loss": 0.1564, + "loss": 0.1555, "step": 5794 }, { "epoch": 0.792749658002736, - "grad_norm": 1.1244887107906796, + "grad_norm": 1.119038192671314, "learning_rate": 1.0229002789838632e-06, - "loss": 0.1403, + "loss": 0.142, "step": 5795 }, { "epoch": 0.7928864569083447, - "grad_norm": 0.9913302818701589, + "grad_norm": 0.9852686054184366, "learning_rate": 1.0215983279302049e-06, - "loss": 0.1692, + "loss": 0.1696, "step": 5796 }, { "epoch": 0.7930232558139535, - "grad_norm": 1.1908302722075756, + "grad_norm": 1.1838388122888264, "learning_rate": 1.0202971116841009e-06, - "loss": 0.165, + "loss": 0.1644, "step": 5797 }, { "epoch": 0.7931600547195622, - "grad_norm": 1.300355465408109, + "grad_norm": 1.2874496856367805, "learning_rate": 1.0189966304858883e-06, - "loss": 0.1949, + "loss": 0.1964, "step": 5798 }, { "epoch": 0.793296853625171, - "grad_norm": 1.2109738589851504, + "grad_norm": 1.1850857757538438, "learning_rate": 1.0176968845757617e-06, - "loss": 0.1724, + "loss": 0.1717, "step": 5799 }, { "epoch": 0.7934336525307798, - "grad_norm": 1.248055133375562, + "grad_norm": 1.2030991408089828, "learning_rate": 1.0163978741937847e-06, - "loss": 0.1841, + "loss": 0.1819, "step": 5800 }, { "epoch": 0.7934336525307798, - "eval_loss": 0.1730540245771408, - "eval_runtime": 5.9105, + "eval_loss": 0.17350810766220093, + "eval_runtime": 5.9102, "eval_samples_per_second": 5.076, "eval_steps_per_second": 1.354, "step": 5800 }, { "epoch": 0.7935704514363885, - "grad_norm": 1.3027723456728093, + "grad_norm": 1.294462213558511, "learning_rate": 1.0150995995798846e-06, - "loss": 0.1529, + "loss": 0.1526, "step": 5801 }, { "epoch": 0.7937072503419973, - "grad_norm": 1.2883117092901049, + "grad_norm": 1.2977570734393855, "learning_rate": 1.0138020609738492e-06, - "loss": 0.1499, + "loss": 0.15, "step": 5802 }, { "epoch": 0.793844049247606, - "grad_norm": 1.2749606432066676, + "grad_norm": 1.256463622230523, "learning_rate": 1.012505258615335e-06, - "loss": 0.1679, + "loss": 0.1678, "step": 5803 }, { "epoch": 0.7939808481532148, - "grad_norm": 1.390233005579573, + "grad_norm": 1.375523404960598, "learning_rate": 1.0112091927438582e-06, - "loss": 0.1952, + "loss": 0.1946, "step": 5804 }, { "epoch": 0.7941176470588235, - "grad_norm": 1.3066520762337872, + "grad_norm": 1.3197010186532752, "learning_rate": 1.0099138635988026e-06, - "loss": 0.1654, + "loss": 0.1671, "step": 5805 }, { "epoch": 0.7942544459644323, - "grad_norm": 1.145956700173891, + "grad_norm": 1.1419049921615783, "learning_rate": 1.0086192714194144e-06, - "loss": 0.1769, + "loss": 0.1784, "step": 5806 }, { "epoch": 0.7943912448700411, - "grad_norm": 1.0654867304391076, + "grad_norm": 1.0617891126625467, "learning_rate": 1.0073254164448026e-06, - "loss": 0.1816, + "loss": 0.1801, "step": 5807 }, { "epoch": 0.7945280437756498, - "grad_norm": 1.2922950406563958, + "grad_norm": 1.2904873957177796, "learning_rate": 1.0060322989139444e-06, - "loss": 0.1904, + "loss": 0.189, "step": 5808 }, { "epoch": 0.7946648426812586, - "grad_norm": 1.251142792169125, + "grad_norm": 1.240106294381596, "learning_rate": 1.0047399190656732e-06, - "loss": 0.1757, + "loss": 0.1736, "step": 5809 }, { "epoch": 0.7948016415868673, - "grad_norm": 1.3497312492675937, + "grad_norm": 1.3256356131198008, "learning_rate": 1.003448277138694e-06, - "loss": 0.1815, + "loss": 0.181, "step": 5810 }, { "epoch": 0.7949384404924761, - "grad_norm": 1.0063404537248677, + "grad_norm": 1.0144677358624083, "learning_rate": 1.002157373371569e-06, - "loss": 0.1191, + "loss": 0.1195, "step": 5811 }, { "epoch": 0.7950752393980848, - "grad_norm": 1.2453379875417532, + "grad_norm": 1.23582113835388, "learning_rate": 1.0008672080027299e-06, - "loss": 0.2003, + "loss": 0.2005, "step": 5812 }, { "epoch": 0.7952120383036936, - "grad_norm": 1.209395760071714, + "grad_norm": 1.2120690140483776, "learning_rate": 9.995777812704666e-07, - "loss": 0.1484, + "loss": 0.1505, "step": 5813 }, { "epoch": 0.7953488372093023, - "grad_norm": 1.1724540154283873, + "grad_norm": 1.1663401848945816, "learning_rate": 9.98289093412938e-07, - "loss": 0.1584, + "loss": 0.1573, "step": 5814 }, { "epoch": 0.7954856361149111, - "grad_norm": 0.9613840948041026, + "grad_norm": 0.9620804789572337, "learning_rate": 9.97001144668161e-07, - "loss": 0.1446, + "loss": 0.1439, "step": 5815 }, { "epoch": 0.7956224350205199, - "grad_norm": 1.357491416672963, + "grad_norm": 1.3656890667555768, "learning_rate": 9.95713935274019e-07, - "loss": 0.1814, + "loss": 0.1802, "step": 5816 }, { "epoch": 0.7957592339261286, - "grad_norm": 1.2313920791511486, + "grad_norm": 1.2533775796321265, "learning_rate": 9.944274654682618e-07, - "loss": 0.1703, + "loss": 0.1721, "step": 5817 }, { "epoch": 0.7958960328317374, - "grad_norm": 1.226353276407476, + "grad_norm": 1.2333142748729553, "learning_rate": 9.931417354884949e-07, - "loss": 0.1868, + "loss": 0.1884, "step": 5818 }, { "epoch": 0.7960328317373461, - "grad_norm": 1.287294075878259, + "grad_norm": 1.2702440068913496, "learning_rate": 9.91856745572195e-07, - "loss": 0.1532, + "loss": 0.1536, "step": 5819 }, { "epoch": 0.7961696306429549, - "grad_norm": 1.450202138257752, + "grad_norm": 1.4326996320635697, "learning_rate": 9.90572495956696e-07, - "loss": 0.2077, + "loss": 0.2042, "step": 5820 }, { "epoch": 0.7963064295485636, - "grad_norm": 0.9921607135875667, + "grad_norm": 0.9902000228258169, "learning_rate": 9.892889868791993e-07, - "loss": 0.1538, + "loss": 0.1544, "step": 5821 }, { "epoch": 0.7964432284541724, - "grad_norm": 1.0646642933346928, + "grad_norm": 1.073847750217936, "learning_rate": 9.880062185767674e-07, - "loss": 0.1414, + "loss": 0.1429, "step": 5822 }, { "epoch": 0.7965800273597812, - "grad_norm": 0.9849092157041673, + "grad_norm": 0.9800180423312063, "learning_rate": 9.867241912863268e-07, - "loss": 0.1701, + "loss": 0.1693, "step": 5823 }, { "epoch": 0.7967168262653899, - "grad_norm": 1.3291367387707367, + "grad_norm": 1.293541558415902, "learning_rate": 9.854429052446684e-07, - "loss": 0.1905, + "loss": 0.1854, "step": 5824 }, { "epoch": 0.7968536251709987, - "grad_norm": 1.7283360865910815, + "grad_norm": 1.6440801737014434, "learning_rate": 9.84162360688442e-07, - "loss": 0.2403, + "loss": 0.2392, "step": 5825 }, { "epoch": 0.7969904240766074, - "grad_norm": 1.2010832256096151, + "grad_norm": 1.2018918432540997, "learning_rate": 9.828825578541661e-07, - "loss": 0.1726, + "loss": 0.1721, "step": 5826 }, { "epoch": 0.7971272229822162, - "grad_norm": 1.138665580852214, + "grad_norm": 1.1190426818389507, "learning_rate": 9.816034969782157e-07, - "loss": 0.1634, + "loss": 0.1639, "step": 5827 }, { "epoch": 0.7972640218878249, - "grad_norm": 1.3540450207443788, + "grad_norm": 1.359153156785524, "learning_rate": 9.80325178296836e-07, - "loss": 0.2126, + "loss": 0.2149, "step": 5828 }, { "epoch": 0.7974008207934337, - "grad_norm": 1.2760550665932189, + "grad_norm": 1.2622439834288193, "learning_rate": 9.790476020461277e-07, - "loss": 0.1871, + "loss": 0.1863, "step": 5829 }, { "epoch": 0.7975376196990424, - "grad_norm": 1.0298636423818506, + "grad_norm": 1.0043753720506785, "learning_rate": 9.777707684620614e-07, - "loss": 0.1412, + "loss": 0.139, "step": 5830 }, { "epoch": 0.7976744186046512, - "grad_norm": 1.1251436175561733, + "grad_norm": 1.1212821224629188, "learning_rate": 9.764946777804646e-07, - "loss": 0.1595, + "loss": 0.1617, "step": 5831 }, { "epoch": 0.79781121751026, - "grad_norm": 1.2859649023748279, + "grad_norm": 1.3013084935108201, "learning_rate": 9.752193302370316e-07, "loss": 0.2052, "step": 5832 }, { "epoch": 0.7979480164158687, - "grad_norm": 1.1715226678083768, + "grad_norm": 1.160709892314237, "learning_rate": 9.739447260673192e-07, - "loss": 0.1856, + "loss": 0.1863, "step": 5833 }, { "epoch": 0.7980848153214775, - "grad_norm": 1.1161942337192936, + "grad_norm": 1.1033196881325027, "learning_rate": 9.726708655067429e-07, - "loss": 0.1475, + "loss": 0.1463, "step": 5834 }, { "epoch": 0.7982216142270862, - "grad_norm": 1.17414721781631, + "grad_norm": 1.1691439647951591, "learning_rate": 9.713977487905852e-07, - "loss": 0.1572, + "loss": 0.1585, "step": 5835 }, { "epoch": 0.798358413132695, - "grad_norm": 1.1693273184765158, + "grad_norm": 1.1656906156139464, "learning_rate": 9.701253761539897e-07, - "loss": 0.1583, + "loss": 0.159, "step": 5836 }, { "epoch": 0.7984952120383036, - "grad_norm": 1.2495621476769843, + "grad_norm": 1.2378668197095557, "learning_rate": 9.688537478319642e-07, - "loss": 0.1801, + "loss": 0.179, "step": 5837 }, { "epoch": 0.7986320109439125, - "grad_norm": 1.2017668677754838, + "grad_norm": 1.1850768603892492, "learning_rate": 9.675828640593742e-07, - "loss": 0.1819, + "loss": 0.182, "step": 5838 }, { "epoch": 0.7987688098495213, - "grad_norm": 1.0849651352335439, + "grad_norm": 1.0816960419669, "learning_rate": 9.66312725070953e-07, - "loss": 0.1772, + "loss": 0.179, "step": 5839 }, { "epoch": 0.79890560875513, - "grad_norm": 1.340751513527037, + "grad_norm": 1.3309836819893128, "learning_rate": 9.650433311012946e-07, - "loss": 0.1983, + "loss": 0.1985, "step": 5840 }, { "epoch": 0.7990424076607388, - "grad_norm": 1.2388798749193128, + "grad_norm": 1.2376871452865617, "learning_rate": 9.637746823848537e-07, - "loss": 0.1828, + "loss": 0.1838, "step": 5841 }, { "epoch": 0.7991792065663474, - "grad_norm": 1.16884781794116, + "grad_norm": 1.1594594104756113, "learning_rate": 9.6250677915595e-07, - "loss": 0.1469, + "loss": 0.1464, "step": 5842 }, { "epoch": 0.7993160054719562, - "grad_norm": 1.2193508582611268, + "grad_norm": 1.2080419805323555, "learning_rate": 9.612396216487625e-07, - "loss": 0.2007, + "loss": 0.2017, "step": 5843 }, { "epoch": 0.7994528043775649, - "grad_norm": 1.1671703960439777, + "grad_norm": 1.1458189771462315, "learning_rate": 9.599732100973359e-07, - "loss": 0.1665, + "loss": 0.1654, "step": 5844 }, { "epoch": 0.7995896032831737, - "grad_norm": 1.0548453980633785, + "grad_norm": 1.0540496009179317, "learning_rate": 9.587075447355732e-07, - "loss": 0.1568, + "loss": 0.1565, "step": 5845 }, { "epoch": 0.7997264021887824, - "grad_norm": 1.351836525465363, + "grad_norm": 1.338871341512896, "learning_rate": 9.574426257972446e-07, - "loss": 0.188, + "loss": 0.1898, "step": 5846 }, { "epoch": 0.7998632010943912, - "grad_norm": 1.1221280413038917, + "grad_norm": 1.1130073569886527, "learning_rate": 9.561784535159775e-07, - "loss": 0.1616, + "loss": 0.1622, "step": 5847 }, { "epoch": 0.8, - "grad_norm": 1.230215119339713, + "grad_norm": 1.2326993625195712, "learning_rate": 9.549150281252633e-07, - "loss": 0.1827, + "loss": 0.1834, "step": 5848 }, { "epoch": 0.8001367989056087, - "grad_norm": 1.2156500870374651, + "grad_norm": 1.2071095224485986, "learning_rate": 9.536523498584566e-07, - "loss": 0.1655, + "loss": 0.166, "step": 5849 }, { "epoch": 0.8002735978112175, - "grad_norm": 1.243285982396463, + "grad_norm": 1.2452665982737983, "learning_rate": 9.523904189487743e-07, - "loss": 0.1669, + "loss": 0.168, "step": 5850 }, { "epoch": 0.8004103967168262, - "grad_norm": 1.2645293450653052, + "grad_norm": 1.2720090274829676, "learning_rate": 9.511292356292917e-07, - "loss": 0.1898, + "loss": 0.1914, "step": 5851 }, { "epoch": 0.800547195622435, - "grad_norm": 0.8148030153802125, + "grad_norm": 0.7923465107012779, "learning_rate": 9.498688001329487e-07, - "loss": 0.1409, + "loss": 0.1394, "step": 5852 }, { "epoch": 0.8006839945280437, - "grad_norm": 1.1566286533490908, + "grad_norm": 1.1447510466484807, "learning_rate": 9.48609112692549e-07, - "loss": 0.1704, + "loss": 0.1692, "step": 5853 }, { "epoch": 0.8008207934336525, - "grad_norm": 1.565642890817469, + "grad_norm": 1.4516269380763516, "learning_rate": 9.473501735407525e-07, - "loss": 0.2278, + "loss": 0.228, "step": 5854 }, { "epoch": 0.8009575923392613, - "grad_norm": 1.2690107980840117, + "grad_norm": 1.2516600762219496, "learning_rate": 9.460919829100861e-07, - "loss": 0.1904, + "loss": 0.1907, "step": 5855 }, { "epoch": 0.80109439124487, - "grad_norm": 1.2767640882332938, + "grad_norm": 1.27593534465952, "learning_rate": 9.448345410329379e-07, - "loss": 0.1718, + "loss": 0.1695, "step": 5856 }, { "epoch": 0.8012311901504788, - "grad_norm": 1.3224409810350617, + "grad_norm": 1.2854436578329425, "learning_rate": 9.43577848141553e-07, - "loss": 0.1835, + "loss": 0.1801, "step": 5857 }, { "epoch": 0.8013679890560875, - "grad_norm": 1.4535875535879086, + "grad_norm": 1.433930240629195, "learning_rate": 9.423219044680448e-07, - "loss": 0.1877, + "loss": 0.1861, "step": 5858 }, { "epoch": 0.8015047879616963, - "grad_norm": 1.3963539380099412, + "grad_norm": 1.3880331208941048, "learning_rate": 9.410667102443827e-07, - "loss": 0.2029, + "loss": 0.2039, "step": 5859 }, { "epoch": 0.801641586867305, - "grad_norm": 1.37513022052378, + "grad_norm": 1.4201268590859277, "learning_rate": 9.398122657024023e-07, - "loss": 0.1568, + "loss": 0.1588, "step": 5860 }, { "epoch": 0.8017783857729138, - "grad_norm": 1.2048794754155578, + "grad_norm": 1.1959619405912945, "learning_rate": 9.38558571073796e-07, - "loss": 0.1793, + "loss": 0.1786, "step": 5861 }, { "epoch": 0.8019151846785225, - "grad_norm": 1.2625641391536353, + "grad_norm": 1.306376793972727, "learning_rate": 9.37305626590122e-07, - "loss": 0.1987, + "loss": 0.199, "step": 5862 }, { "epoch": 0.8020519835841313, - "grad_norm": 1.5119626995829463, + "grad_norm": 1.4857643436629162, "learning_rate": 9.360534324827969e-07, - "loss": 0.2617, + "loss": 0.2636, "step": 5863 }, { "epoch": 0.8021887824897401, - "grad_norm": 1.3472463594637551, + "grad_norm": 1.3423096190010335, "learning_rate": 9.348019889831006e-07, - "loss": 0.186, + "loss": 0.1854, "step": 5864 }, { "epoch": 0.8023255813953488, - "grad_norm": 1.3310108204443474, + "grad_norm": 1.329303687758528, "learning_rate": 9.335512963221732e-07, - "loss": 0.171, + "loss": 0.1705, "step": 5865 }, { "epoch": 0.8024623803009576, - "grad_norm": 1.0515583410515155, + "grad_norm": 1.0273545831998758, "learning_rate": 9.323013547310173e-07, - "loss": 0.1655, + "loss": 0.1638, "step": 5866 }, { "epoch": 0.8025991792065663, - "grad_norm": 1.2145986488334384, + "grad_norm": 1.2095703676967455, "learning_rate": 9.310521644404973e-07, - "loss": 0.1891, + "loss": 0.1906, "step": 5867 }, { "epoch": 0.8027359781121751, - "grad_norm": 1.4066585905480042, + "grad_norm": 1.3969395506567526, "learning_rate": 9.298037256813347e-07, - "loss": 0.2202, + "loss": 0.2195, "step": 5868 }, { "epoch": 0.8028727770177838, - "grad_norm": 1.202609703198832, + "grad_norm": 1.1865490433968962, "learning_rate": 9.28556038684118e-07, - "loss": 0.1632, + "loss": 0.1605, "step": 5869 }, { "epoch": 0.8030095759233926, - "grad_norm": 1.2488826054006348, + "grad_norm": 1.2515769746650127, "learning_rate": 9.273091036792914e-07, - "loss": 0.1922, + "loss": 0.1913, "step": 5870 }, { "epoch": 0.8031463748290014, - "grad_norm": 1.2512606266258344, + "grad_norm": 1.253503088715545, "learning_rate": 9.260629208971638e-07, - "loss": 0.1815, + "loss": 0.1832, "step": 5871 }, { "epoch": 0.8032831737346101, - "grad_norm": 1.348756884464498, + "grad_norm": 1.3351166718698657, "learning_rate": 9.248174905679058e-07, - "loss": 0.2062, + "loss": 0.2055, "step": 5872 }, { "epoch": 0.8034199726402189, - "grad_norm": 1.0675065293311061, + "grad_norm": 1.0508616138500555, "learning_rate": 9.235728129215443e-07, - "loss": 0.1558, + "loss": 0.1543, "step": 5873 }, { "epoch": 0.8035567715458276, - "grad_norm": 1.197253494729427, + "grad_norm": 1.1851436407870666, "learning_rate": 9.223288881879739e-07, - "loss": 0.1727, + "loss": 0.1736, "step": 5874 }, { "epoch": 0.8036935704514364, - "grad_norm": 1.1879214003254943, + "grad_norm": 1.1640726038495226, "learning_rate": 9.210857165969423e-07, - "loss": 0.1707, + "loss": 0.1702, "step": 5875 }, { "epoch": 0.8038303693570451, - "grad_norm": 1.1906770009239482, + "grad_norm": 1.1930174116069858, "learning_rate": 9.198432983780659e-07, - "loss": 0.1598, + "loss": 0.1605, "step": 5876 }, { "epoch": 0.8039671682626539, - "grad_norm": 1.4518844069183563, + "grad_norm": 1.4303405952196575, "learning_rate": 9.186016337608156e-07, - "loss": 0.1921, + "loss": 0.1892, "step": 5877 }, { "epoch": 0.8041039671682626, - "grad_norm": 1.329706221337106, + "grad_norm": 1.3437240177198413, "learning_rate": 9.173607229745274e-07, - "loss": 0.2043, + "loss": 0.2064, "step": 5878 }, { "epoch": 0.8042407660738714, - "grad_norm": 1.165944039638437, + "grad_norm": 1.15445531270459, "learning_rate": 9.161205662483963e-07, - "loss": 0.1629, + "loss": 0.162, "step": 5879 }, { "epoch": 0.8043775649794802, - "grad_norm": 1.4153289156965514, + "grad_norm": 1.4163863955917901, "learning_rate": 9.14881163811479e-07, - "loss": 0.1794, + "loss": 0.1804, "step": 5880 }, { "epoch": 0.8045143638850889, - "grad_norm": 1.4159306725999354, + "grad_norm": 1.4228209996000216, "learning_rate": 9.136425158926903e-07, - "loss": 0.2284, + "loss": 0.2272, "step": 5881 }, { "epoch": 0.8046511627906977, - "grad_norm": 1.2491993897118643, + "grad_norm": 1.2500277459636442, "learning_rate": 9.124046227208083e-07, - "loss": 0.1935, + "loss": 0.1934, "step": 5882 }, { "epoch": 0.8047879616963064, - "grad_norm": 1.084825425760423, + "grad_norm": 1.0869173240360461, "learning_rate": 9.111674845244723e-07, - "loss": 0.168, + "loss": 0.1666, "step": 5883 }, { "epoch": 0.8049247606019152, - "grad_norm": 1.399862332424834, + "grad_norm": 1.392152009833468, "learning_rate": 9.099311015321782e-07, - "loss": 0.1813, + "loss": 0.1818, "step": 5884 }, { "epoch": 0.8050615595075239, - "grad_norm": 1.0311208562012297, + "grad_norm": 1.0263304740823547, "learning_rate": 9.08695473972287e-07, - "loss": 0.1602, + "loss": 0.1594, "step": 5885 }, { "epoch": 0.8051983584131327, - "grad_norm": 1.2924998384718138, + "grad_norm": 1.2927080984775199, "learning_rate": 9.074606020730165e-07, - "loss": 0.2105, + "loss": 0.2122, "step": 5886 }, { "epoch": 0.8053351573187415, - "grad_norm": 1.263023000533686, + "grad_norm": 1.2490875962237764, "learning_rate": 9.06226486062447e-07, - "loss": 0.1496, + "loss": 0.1492, "step": 5887 }, { "epoch": 0.8054719562243502, - "grad_norm": 0.9932939111883858, + "grad_norm": 0.98071229468637, "learning_rate": 9.049931261685208e-07, - "loss": 0.1451, + "loss": 0.1443, "step": 5888 }, { "epoch": 0.805608755129959, - "grad_norm": 1.2276322115059035, + "grad_norm": 1.2185794833623513, "learning_rate": 9.037605226190349e-07, - "loss": 0.1637, + "loss": 0.1651, "step": 5889 }, { "epoch": 0.8057455540355677, - "grad_norm": 1.2873317079264863, + "grad_norm": 1.2672084717839915, "learning_rate": 9.025286756416535e-07, - "loss": 0.1916, + "loss": 0.1902, "step": 5890 }, { "epoch": 0.8058823529411765, - "grad_norm": 1.2323989746021333, + "grad_norm": 1.236720093418014, "learning_rate": 9.01297585463895e-07, - "loss": 0.1818, + "loss": 0.1832, "step": 5891 }, { "epoch": 0.8060191518467852, - "grad_norm": 1.1601553280462837, + "grad_norm": 1.1538153606840322, "learning_rate": 9.000672523131432e-07, - "loss": 0.1775, + "loss": 0.178, "step": 5892 }, { "epoch": 0.806155950752394, - "grad_norm": 1.4302617822157837, + "grad_norm": 1.43499259581096, "learning_rate": 8.988376764166367e-07, - "loss": 0.2326, + "loss": 0.2387, "step": 5893 }, { "epoch": 0.8062927496580027, - "grad_norm": 1.5889569862567763, + "grad_norm": 1.5940582070366713, "learning_rate": 8.976088580014797e-07, - "loss": 0.2239, + "loss": 0.2254, "step": 5894 }, { "epoch": 0.8064295485636115, - "grad_norm": 1.3705481165431168, + "grad_norm": 1.349409497186019, "learning_rate": 8.963807972946326e-07, - "loss": 0.2041, + "loss": 0.2043, "step": 5895 }, { "epoch": 0.8065663474692203, - "grad_norm": 1.3160833299963741, + "grad_norm": 1.3075867285071903, "learning_rate": 8.951534945229174e-07, - "loss": 0.205, + "loss": 0.2055, "step": 5896 }, { "epoch": 0.806703146374829, - "grad_norm": 1.176411246009999, + "grad_norm": 1.1938143172156988, "learning_rate": 8.939269499130183e-07, - "loss": 0.1554, + "loss": 0.1574, "step": 5897 }, { "epoch": 0.8068399452804378, - "grad_norm": 1.3030433684185272, + "grad_norm": 1.2864418658476104, "learning_rate": 8.927011636914734e-07, "loss": 0.1823, "step": 5898 }, { "epoch": 0.8069767441860465, - "grad_norm": 1.1836627394535504, + "grad_norm": 1.1589525493681974, "learning_rate": 8.914761360846869e-07, - "loss": 0.1775, + "loss": 0.1768, "step": 5899 }, { "epoch": 0.8071135430916553, - "grad_norm": 1.025450222018945, + "grad_norm": 1.0291035334295575, "learning_rate": 8.902518673189192e-07, - "loss": 0.1441, + "loss": 0.144, "step": 5900 }, { "epoch": 0.8071135430916553, - "eval_loss": 0.17270927131175995, - "eval_runtime": 5.9212, - "eval_samples_per_second": 5.067, - "eval_steps_per_second": 1.351, + "eval_loss": 0.1730515956878662, + "eval_runtime": 5.9186, + "eval_samples_per_second": 5.069, + "eval_steps_per_second": 1.352, "step": 5900 }, { "epoch": 0.807250341997264, - "grad_norm": 1.341179507693022, + "grad_norm": 1.3276500781385052, "learning_rate": 8.89028357620293e-07, - "loss": 0.2163, + "loss": 0.2124, "step": 5901 }, { "epoch": 0.8073871409028728, - "grad_norm": 1.1711273263781081, + "grad_norm": 1.1701400924381797, "learning_rate": 8.878056072147872e-07, - "loss": 0.1808, + "loss": 0.182, "step": 5902 }, { "epoch": 0.8075239398084816, - "grad_norm": 1.2662088784918781, + "grad_norm": 1.2411316687978902, "learning_rate": 8.865836163282443e-07, - "loss": 0.187, + "loss": 0.185, "step": 5903 }, { "epoch": 0.8076607387140903, - "grad_norm": 1.0717677816193918, + "grad_norm": 1.0569430462635796, "learning_rate": 8.853623851863663e-07, - "loss": 0.1386, + "loss": 0.1385, "step": 5904 }, { "epoch": 0.8077975376196991, - "grad_norm": 1.051482515831241, + "grad_norm": 1.0535682195854754, "learning_rate": 8.841419140147112e-07, - "loss": 0.1643, + "loss": 0.1647, "step": 5905 }, { "epoch": 0.8079343365253078, - "grad_norm": 1.186147860186294, + "grad_norm": 1.1765540294963663, "learning_rate": 8.829222030387014e-07, - "loss": 0.1721, + "loss": 0.1693, "step": 5906 }, { "epoch": 0.8080711354309166, - "grad_norm": 1.0533329201610466, + "grad_norm": 1.0435535593646634, "learning_rate": 8.817032524836144e-07, - "loss": 0.1427, + "loss": 0.1426, "step": 5907 }, { "epoch": 0.8082079343365253, - "grad_norm": 1.2108909988220715, + "grad_norm": 1.3172216861464756, "learning_rate": 8.804850625745898e-07, - "loss": 0.1895, + "loss": 0.1931, "step": 5908 }, { "epoch": 0.8083447332421341, - "grad_norm": 1.2182991924500541, + "grad_norm": 1.2069469583313837, "learning_rate": 8.792676335366268e-07, - "loss": 0.1553, + "loss": 0.1559, "step": 5909 }, { "epoch": 0.8084815321477428, - "grad_norm": 1.4129431930618996, + "grad_norm": 1.4003362477999761, "learning_rate": 8.780509655945845e-07, - "loss": 0.1616, + "loss": 0.1613, "step": 5910 }, { "epoch": 0.8086183310533516, - "grad_norm": 1.3668361271964558, + "grad_norm": 1.3646759687520382, "learning_rate": 8.768350589731783e-07, - "loss": 0.214, + "loss": 0.2126, "step": 5911 }, { "epoch": 0.8087551299589604, - "grad_norm": 1.324554367067173, + "grad_norm": 1.296656503994842, "learning_rate": 8.756199138969868e-07, - "loss": 0.1764, + "loss": 0.176, "step": 5912 }, { "epoch": 0.8088919288645691, - "grad_norm": 1.1287190012523654, + "grad_norm": 1.1255395236408192, "learning_rate": 8.744055305904464e-07, - "loss": 0.1665, + "loss": 0.1651, "step": 5913 }, { "epoch": 0.8090287277701779, - "grad_norm": 1.4985678021433546, + "grad_norm": 1.4919392207243247, "learning_rate": 8.731919092778512e-07, - "loss": 0.211, + "loss": 0.2108, "step": 5914 }, { "epoch": 0.8091655266757866, - "grad_norm": 1.2535466499310748, + "grad_norm": 1.2328468238673986, "learning_rate": 8.719790501833581e-07, - "loss": 0.1747, + "loss": 0.1739, "step": 5915 }, { "epoch": 0.8093023255813954, - "grad_norm": 1.1321629243727973, + "grad_norm": 1.1240519972305743, "learning_rate": 8.707669535309793e-07, "loss": 0.1907, "step": 5916 }, { "epoch": 0.8094391244870041, - "grad_norm": 1.255552127791234, + "grad_norm": 1.2401443551248006, "learning_rate": 8.695556195445898e-07, - "loss": 0.1982, + "loss": 0.1975, "step": 5917 }, { "epoch": 0.8095759233926129, - "grad_norm": 1.3830797851345324, + "grad_norm": 1.3537952980550279, "learning_rate": 8.6834504844792e-07, - "loss": 0.2033, + "loss": 0.2014, "step": 5918 }, { "epoch": 0.8097127222982217, - "grad_norm": 1.6486394325188585, + "grad_norm": 1.6424999234231505, "learning_rate": 8.671352404645622e-07, - "loss": 0.2337, + "loss": 0.233, "step": 5919 }, { "epoch": 0.8098495212038304, - "grad_norm": 1.1520867522927623, + "grad_norm": 1.1537764821279133, "learning_rate": 8.659261958179688e-07, - "loss": 0.1627, + "loss": 0.1626, "step": 5920 }, { "epoch": 0.8099863201094392, - "grad_norm": 1.0489271071034234, + "grad_norm": 1.0375068050427558, "learning_rate": 8.64717914731446e-07, - "loss": 0.1393, + "loss": 0.1394, "step": 5921 }, { "epoch": 0.8101231190150479, - "grad_norm": 1.14771661510587, + "grad_norm": 1.137822429665726, "learning_rate": 8.635103974281661e-07, - "loss": 0.1661, + "loss": 0.1652, "step": 5922 }, { "epoch": 0.8102599179206567, - "grad_norm": 1.256890411275232, + "grad_norm": 1.2889953934606393, "learning_rate": 8.623036441311528e-07, - "loss": 0.1639, + "loss": 0.167, "step": 5923 }, { "epoch": 0.8103967168262654, - "grad_norm": 1.3089814782934797, + "grad_norm": 1.3043713033950144, "learning_rate": 8.610976550632944e-07, - "loss": 0.1912, + "loss": 0.1892, "step": 5924 }, { "epoch": 0.8105335157318742, - "grad_norm": 1.1609302053105994, + "grad_norm": 1.145785002239704, "learning_rate": 8.598924304473361e-07, - "loss": 0.1884, + "loss": 0.1873, "step": 5925 }, { "epoch": 0.8106703146374828, - "grad_norm": 1.352082907889746, + "grad_norm": 1.3355958247433666, "learning_rate": 8.586879705058831e-07, - "loss": 0.1572, + "loss": 0.1585, "step": 5926 }, { "epoch": 0.8108071135430917, - "grad_norm": 1.3687577582765997, + "grad_norm": 1.3646151026759148, "learning_rate": 8.574842754613949e-07, - "loss": 0.2079, + "loss": 0.2078, "step": 5927 }, { "epoch": 0.8109439124487005, - "grad_norm": 1.3524608939849003, + "grad_norm": 1.3585645761799474, "learning_rate": 8.562813455361957e-07, - "loss": 0.182, + "loss": 0.1834, "step": 5928 }, { "epoch": 0.8110807113543091, - "grad_norm": 1.2325594538944264, + "grad_norm": 1.2199171828028967, "learning_rate": 8.550791809524655e-07, - "loss": 0.1708, + "loss": 0.1702, "step": 5929 }, { "epoch": 0.811217510259918, - "grad_norm": 1.2294145937458993, + "grad_norm": 1.2252856832737822, "learning_rate": 8.538777819322419e-07, - "loss": 0.1671, + "loss": 0.1668, "step": 5930 }, { "epoch": 0.8113543091655266, - "grad_norm": 1.5710143409146988, + "grad_norm": 1.562961570364678, "learning_rate": 8.526771486974239e-07, - "loss": 0.1888, + "loss": 0.1887, "step": 5931 }, { "epoch": 0.8114911080711354, - "grad_norm": 1.368371271278919, + "grad_norm": 1.3797782195339836, "learning_rate": 8.514772814697653e-07, - "loss": 0.1884, + "loss": 0.1877, "step": 5932 }, { "epoch": 0.8116279069767441, - "grad_norm": 1.3506352306163831, + "grad_norm": 1.3322968407234554, "learning_rate": 8.502781804708826e-07, - "loss": 0.1845, + "loss": 0.1834, "step": 5933 }, { "epoch": 0.8117647058823529, - "grad_norm": 1.3060334634456998, + "grad_norm": 1.2820741849588626, "learning_rate": 8.490798459222477e-07, - "loss": 0.187, + "loss": 0.1872, "step": 5934 }, { "epoch": 0.8119015047879617, - "grad_norm": 1.1962763232084845, + "grad_norm": 1.184288614992429, "learning_rate": 8.478822780451917e-07, - "loss": 0.1794, + "loss": 0.1788, "step": 5935 }, { "epoch": 0.8120383036935704, - "grad_norm": 1.2665607340783067, + "grad_norm": 1.2655114088435169, "learning_rate": 8.466854770609062e-07, - "loss": 0.1995, + "loss": 0.1992, "step": 5936 }, { "epoch": 0.8121751025991792, - "grad_norm": 1.2391608036744945, + "grad_norm": 1.2283061209616806, "learning_rate": 8.454894431904371e-07, - "loss": 0.167, + "loss": 0.1674, "step": 5937 }, { "epoch": 0.8123119015047879, - "grad_norm": 1.1157658591260413, + "grad_norm": 1.1226294506465546, "learning_rate": 8.442941766546919e-07, - "loss": 0.1491, + "loss": 0.1504, "step": 5938 }, { "epoch": 0.8124487004103967, - "grad_norm": 1.3908336998602364, + "grad_norm": 1.3893418343435167, "learning_rate": 8.43099677674436e-07, - "loss": 0.1681, + "loss": 0.1668, "step": 5939 }, { "epoch": 0.8125854993160054, - "grad_norm": 1.380075238366052, + "grad_norm": 1.3664959062134978, "learning_rate": 8.419059464702928e-07, - "loss": 0.1663, + "loss": 0.1679, "step": 5940 }, { "epoch": 0.8127222982216142, - "grad_norm": 1.2821323852779543, + "grad_norm": 1.2790500228484725, "learning_rate": 8.407129832627409e-07, - "loss": 0.1921, + "loss": 0.1932, "step": 5941 }, { "epoch": 0.8128590971272229, - "grad_norm": 1.3028993474403396, + "grad_norm": 1.2844857344377298, "learning_rate": 8.395207882721229e-07, - "loss": 0.194, + "loss": 0.1934, "step": 5942 }, { "epoch": 0.8129958960328317, - "grad_norm": 1.326192077655648, + "grad_norm": 1.3139847769040096, "learning_rate": 8.383293617186333e-07, - "loss": 0.1931, + "loss": 0.1917, "step": 5943 }, { "epoch": 0.8131326949384405, - "grad_norm": 1.3034489216672314, + "grad_norm": 1.271340875567643, "learning_rate": 8.371387038223289e-07, - "loss": 0.2008, + "loss": 0.1985, "step": 5944 }, { "epoch": 0.8132694938440492, - "grad_norm": 1.1827409488310414, + "grad_norm": 1.17001986175929, "learning_rate": 8.359488148031242e-07, - "loss": 0.1705, + "loss": 0.1697, "step": 5945 }, { "epoch": 0.813406292749658, - "grad_norm": 1.3711195082679135, + "grad_norm": 1.6396673697631403, "learning_rate": 8.347596948807891e-07, - "loss": 0.1683, + "loss": 0.1779, "step": 5946 }, { "epoch": 0.8135430916552667, - "grad_norm": 1.1941561690498788, + "grad_norm": 1.1858762512863632, "learning_rate": 8.335713442749544e-07, - "loss": 0.1404, + "loss": 0.1407, "step": 5947 }, { "epoch": 0.8136798905608755, - "grad_norm": 1.383026289795845, + "grad_norm": 1.3968712348075856, "learning_rate": 8.323837632051063e-07, - "loss": 0.1894, + "loss": 0.19, "step": 5948 }, { "epoch": 0.8138166894664842, - "grad_norm": 1.1505978927211475, + "grad_norm": 1.1469533566475127, "learning_rate": 8.311969518905915e-07, - "loss": 0.1531, + "loss": 0.1538, "step": 5949 }, { "epoch": 0.813953488372093, - "grad_norm": 1.234601323665652, + "grad_norm": 1.2321684306393084, "learning_rate": 8.30010910550611e-07, - "loss": 0.1713, + "loss": 0.1709, "step": 5950 }, { "epoch": 0.8140902872777018, - "grad_norm": 1.054935011083616, + "grad_norm": 1.0610759637841967, "learning_rate": 8.288256394042272e-07, "loss": 0.1381, "step": 5951 }, { "epoch": 0.8142270861833105, - "grad_norm": 1.0416441022540754, + "grad_norm": 0.9697060527275896, "learning_rate": 8.276411386703581e-07, - "loss": 0.1525, + "loss": 0.1497, "step": 5952 }, { "epoch": 0.8143638850889193, - "grad_norm": 1.4962399946943752, + "grad_norm": 1.4903178777638442, "learning_rate": 8.264574085677818e-07, - "loss": 0.1638, + "loss": 0.1637, "step": 5953 }, { "epoch": 0.814500683994528, - "grad_norm": 1.1801095651714735, + "grad_norm": 1.1711409150775252, "learning_rate": 8.252744493151293e-07, - "loss": 0.1675, + "loss": 0.1667, "step": 5954 }, { "epoch": 0.8146374829001368, - "grad_norm": 1.1013635600430551, + "grad_norm": 1.1054244698413778, "learning_rate": 8.240922611308938e-07, - "loss": 0.1628, + "loss": 0.1631, "step": 5955 }, { "epoch": 0.8147742818057455, - "grad_norm": 1.2826418080185613, + "grad_norm": 1.2817449600886348, "learning_rate": 8.229108442334255e-07, - "loss": 0.1848, + "loss": 0.1868, "step": 5956 }, { "epoch": 0.8149110807113543, - "grad_norm": 1.248312368054949, + "grad_norm": 1.2387182043239675, "learning_rate": 8.217301988409293e-07, - "loss": 0.1818, + "loss": 0.1823, "step": 5957 }, { "epoch": 0.815047879616963, - "grad_norm": 1.219928313446224, + "grad_norm": 1.1955075257487393, "learning_rate": 8.205503251714714e-07, - "loss": 0.1769, + "loss": 0.1747, "step": 5958 }, { "epoch": 0.8151846785225718, - "grad_norm": 1.133000362530699, + "grad_norm": 1.1253239353326463, "learning_rate": 8.193712234429708e-07, - "loss": 0.1573, + "loss": 0.1563, "step": 5959 }, { "epoch": 0.8153214774281806, - "grad_norm": 1.5683300993327265, + "grad_norm": 1.5742949917003164, "learning_rate": 8.181928938732081e-07, - "loss": 0.188, + "loss": 0.1905, "step": 5960 }, { "epoch": 0.8154582763337893, - "grad_norm": 1.458092242757033, + "grad_norm": 1.461614329445475, "learning_rate": 8.170153366798211e-07, - "loss": 0.222, + "loss": 0.2243, "step": 5961 }, { "epoch": 0.8155950752393981, - "grad_norm": 1.0023492832469247, + "grad_norm": 0.9865137358221097, "learning_rate": 8.158385520803013e-07, - "loss": 0.1508, + "loss": 0.15, "step": 5962 }, { "epoch": 0.8157318741450068, - "grad_norm": 1.2758874885808977, + "grad_norm": 1.2716352165350058, "learning_rate": 8.146625402920027e-07, - "loss": 0.187, + "loss": 0.1907, "step": 5963 }, { "epoch": 0.8158686730506156, - "grad_norm": 1.219486232490004, + "grad_norm": 1.201442051568582, "learning_rate": 8.134873015321304e-07, - "loss": 0.1547, + "loss": 0.1552, "step": 5964 }, { "epoch": 0.8160054719562243, - "grad_norm": 1.3444544470628088, + "grad_norm": 1.310496479245629, "learning_rate": 8.123128360177523e-07, - "loss": 0.1868, + "loss": 0.1862, "step": 5965 }, { "epoch": 0.8161422708618331, - "grad_norm": 1.2053282585536162, + "grad_norm": 1.195970341478226, "learning_rate": 8.111391439657901e-07, - "loss": 0.1835, + "loss": 0.183, "step": 5966 }, { "epoch": 0.8162790697674419, - "grad_norm": 1.1195556218982572, + "grad_norm": 1.1218350112472404, "learning_rate": 8.09966225593024e-07, - "loss": 0.1592, + "loss": 0.1607, "step": 5967 }, { "epoch": 0.8164158686730506, - "grad_norm": 1.1870679854620698, + "grad_norm": 1.1727004343419132, "learning_rate": 8.087940811160916e-07, - "loss": 0.1622, + "loss": 0.1618, "step": 5968 }, { "epoch": 0.8165526675786594, - "grad_norm": 1.5607990188832463, + "grad_norm": 1.547122836604084, "learning_rate": 8.076227107514867e-07, - "loss": 0.2362, + "loss": 0.2342, "step": 5969 }, { "epoch": 0.8166894664842681, - "grad_norm": 1.2766570636309664, + "grad_norm": 1.2832531958295226, "learning_rate": 8.064521147155613e-07, - "loss": 0.1797, + "loss": 0.1795, "step": 5970 }, { "epoch": 0.8168262653898769, - "grad_norm": 1.3073448194419042, + "grad_norm": 1.3018517518157853, "learning_rate": 8.052822932245219e-07, - "loss": 0.1786, + "loss": 0.178, "step": 5971 }, { "epoch": 0.8169630642954856, - "grad_norm": 1.1866409985378243, + "grad_norm": 1.180572055133492, "learning_rate": 8.041132464944351e-07, - "loss": 0.1862, + "loss": 0.1882, "step": 5972 }, { "epoch": 0.8170998632010944, - "grad_norm": 1.5307374757548773, + "grad_norm": 1.4864651256257155, "learning_rate": 8.029449747412215e-07, - "loss": 0.2101, + "loss": 0.2115, "step": 5973 }, { "epoch": 0.8172366621067031, - "grad_norm": 1.0884551203048995, + "grad_norm": 1.0846717185223018, "learning_rate": 8.01777478180662e-07, - "loss": 0.1605, + "loss": 0.1609, "step": 5974 }, { "epoch": 0.8173734610123119, - "grad_norm": 1.160472782330282, + "grad_norm": 1.1526233943796294, "learning_rate": 8.006107570283894e-07, "loss": 0.1727, "step": 5975 }, { "epoch": 0.8175102599179207, - "grad_norm": 1.2879093566364457, + "grad_norm": 1.2830211401967058, "learning_rate": 7.994448114998976e-07, - "loss": 0.2018, + "loss": 0.2023, "step": 5976 }, { "epoch": 0.8176470588235294, - "grad_norm": 1.559797735582304, + "grad_norm": 1.5384487713814892, "learning_rate": 7.98279641810537e-07, - "loss": 0.2012, + "loss": 0.2004, "step": 5977 }, { "epoch": 0.8177838577291382, - "grad_norm": 1.21611482619112, + "grad_norm": 1.2011619703709584, "learning_rate": 7.971152481755118e-07, - "loss": 0.1716, + "loss": 0.1735, "step": 5978 }, { "epoch": 0.8179206566347469, - "grad_norm": 1.3092787334139186, + "grad_norm": 1.2902291231486802, "learning_rate": 7.959516308098857e-07, - "loss": 0.1723, + "loss": 0.1708, "step": 5979 }, { "epoch": 0.8180574555403557, - "grad_norm": 1.3016400114670938, + "grad_norm": 1.3003820872043985, "learning_rate": 7.947887899285762e-07, - "loss": 0.21, + "loss": 0.2112, "step": 5980 }, { "epoch": 0.8181942544459644, - "grad_norm": 1.2722979278607676, + "grad_norm": 1.2615262420556586, "learning_rate": 7.936267257463598e-07, - "loss": 0.199, + "loss": 0.1984, "step": 5981 }, { "epoch": 0.8183310533515732, - "grad_norm": 1.1104781680691465, + "grad_norm": 1.1083655881590149, "learning_rate": 7.924654384778696e-07, - "loss": 0.1631, + "loss": 0.1633, "step": 5982 }, { "epoch": 0.818467852257182, - "grad_norm": 1.344502401786279, + "grad_norm": 1.3216292676574095, "learning_rate": 7.913049283375951e-07, - "loss": 0.1793, + "loss": 0.1776, "step": 5983 }, { "epoch": 0.8186046511627907, - "grad_norm": 1.1649224999328054, + "grad_norm": 1.146515626054225, "learning_rate": 7.901451955398792e-07, - "loss": 0.1732, + "loss": 0.1738, "step": 5984 }, { "epoch": 0.8187414500683995, - "grad_norm": 1.3598456988433427, + "grad_norm": 1.3166475700404097, "learning_rate": 7.889862402989252e-07, - "loss": 0.1852, + "loss": 0.1859, "step": 5985 }, { "epoch": 0.8188782489740082, - "grad_norm": 1.475102618912656, + "grad_norm": 1.4696344361410465, "learning_rate": 7.878280628287915e-07, - "loss": 0.2349, + "loss": 0.237, "step": 5986 }, { "epoch": 0.819015047879617, - "grad_norm": 1.2910409875965523, + "grad_norm": 1.272483873154567, "learning_rate": 7.866706633433913e-07, - "loss": 0.2057, + "loss": 0.2061, "step": 5987 }, { "epoch": 0.8191518467852257, - "grad_norm": 1.1861447882229055, + "grad_norm": 1.7389465554773276, "learning_rate": 7.855140420564966e-07, - "loss": 0.2122, + "loss": 0.2123, "step": 5988 }, { "epoch": 0.8192886456908345, - "grad_norm": 1.42473392135676, + "grad_norm": 1.3944616531207568, "learning_rate": 7.843581991817329e-07, - "loss": 0.2132, + "loss": 0.2099, "step": 5989 }, { "epoch": 0.8194254445964432, - "grad_norm": 1.242522669549841, + "grad_norm": 1.255312649059927, "learning_rate": 7.832031349325853e-07, - "loss": 0.1559, + "loss": 0.1573, "step": 5990 }, { "epoch": 0.819562243502052, - "grad_norm": 1.0876574406567716, + "grad_norm": 1.0832347026477058, "learning_rate": 7.82048849522391e-07, - "loss": 0.1649, + "loss": 0.165, "step": 5991 }, { "epoch": 0.8196990424076608, - "grad_norm": 1.218846223768647, + "grad_norm": 1.2098798268273097, "learning_rate": 7.808953431643468e-07, - "loss": 0.1682, + "loss": 0.1678, "step": 5992 }, { "epoch": 0.8198358413132695, - "grad_norm": 1.4963720764369213, + "grad_norm": 1.489232871312825, "learning_rate": 7.797426160715055e-07, - "loss": 0.181, + "loss": 0.1805, "step": 5993 }, { "epoch": 0.8199726402188783, - "grad_norm": 1.4431831944909432, + "grad_norm": 1.4308097305776333, "learning_rate": 7.785906684567729e-07, - "loss": 0.2032, + "loss": 0.2036, "step": 5994 }, { "epoch": 0.820109439124487, - "grad_norm": 1.1779217983596653, + "grad_norm": 1.1863228300447426, "learning_rate": 7.774395005329144e-07, - "loss": 0.1483, + "loss": 0.1486, "step": 5995 }, { "epoch": 0.8202462380300958, - "grad_norm": 1.228122093250193, + "grad_norm": 1.2092261352297673, "learning_rate": 7.762891125125477e-07, - "loss": 0.1612, + "loss": 0.1618, "step": 5996 }, { "epoch": 0.8203830369357045, - "grad_norm": 1.2388674612767336, + "grad_norm": 1.226246866726787, "learning_rate": 7.751395046081505e-07, - "loss": 0.1547, + "loss": 0.1565, "step": 5997 }, { "epoch": 0.8205198358413133, - "grad_norm": 1.1521599517293841, + "grad_norm": 1.14493202111646, "learning_rate": 7.739906770320532e-07, - "loss": 0.1936, + "loss": 0.1932, "step": 5998 }, { "epoch": 0.8206566347469221, - "grad_norm": 1.2585757701551528, + "grad_norm": 1.2567941765456032, "learning_rate": 7.728426299964443e-07, - "loss": 0.1466, + "loss": 0.1456, "step": 5999 }, { "epoch": 0.8207934336525308, - "grad_norm": 1.3445356374074895, + "grad_norm": 1.3296277582702103, "learning_rate": 7.716953637133678e-07, - "loss": 0.1628, + "loss": 0.1597, "step": 6000 }, { "epoch": 0.8207934336525308, - "eval_loss": 0.17283496260643005, - "eval_runtime": 5.9378, - "eval_samples_per_second": 5.052, - "eval_steps_per_second": 1.347, + "eval_loss": 0.1729241907596588, + "eval_runtime": 5.9171, + "eval_samples_per_second": 5.07, + "eval_steps_per_second": 1.352, "step": 6000 }, { "epoch": 0.8209302325581396, - "grad_norm": 1.5603045821424641, + "grad_norm": 1.5350376143806486, "learning_rate": 7.705488783947201e-07, - "loss": 0.1995, + "loss": 0.1983, "step": 6001 }, { "epoch": 0.8210670314637483, - "grad_norm": 1.2486840017403562, + "grad_norm": 1.2309984126483198, "learning_rate": 7.694031742522595e-07, - "loss": 0.172, + "loss": 0.1709, "step": 6002 }, { "epoch": 0.8212038303693571, - "grad_norm": 1.2290645430400553, + "grad_norm": 1.2457696346100247, "learning_rate": 7.682582514975939e-07, - "loss": 0.1805, + "loss": 0.1826, "step": 6003 }, { "epoch": 0.8213406292749658, - "grad_norm": 1.3783318981241377, + "grad_norm": 1.3474031796224082, "learning_rate": 7.67114110342192e-07, - "loss": 0.1913, + "loss": 0.1907, "step": 6004 }, { "epoch": 0.8214774281805746, - "grad_norm": 1.316006217962568, + "grad_norm": 1.3144798891912919, "learning_rate": 7.659707509973724e-07, "loss": 0.1841, "step": 6005 }, { "epoch": 0.8216142270861833, - "grad_norm": 1.499409125453729, + "grad_norm": 1.4672157721231738, "learning_rate": 7.648281736743162e-07, "loss": 0.1886, "step": 6006 }, { "epoch": 0.8217510259917921, - "grad_norm": 1.1852067920943885, + "grad_norm": 1.1771324786928523, "learning_rate": 7.636863785840542e-07, - "loss": 0.1841, + "loss": 0.1859, "step": 6007 }, { "epoch": 0.8218878248974009, - "grad_norm": 1.2948581963685597, + "grad_norm": 1.2962989135281164, "learning_rate": 7.625453659374754e-07, - "loss": 0.1824, + "loss": 0.1825, "step": 6008 }, { "epoch": 0.8220246238030096, - "grad_norm": 1.1842917107111102, + "grad_norm": 1.1647633491085356, "learning_rate": 7.614051359453251e-07, - "loss": 0.1769, + "loss": 0.1757, "step": 6009 }, { "epoch": 0.8221614227086184, - "grad_norm": 1.1933544138571155, + "grad_norm": 1.180317568465936, "learning_rate": 7.602656888182014e-07, - "loss": 0.1818, + "loss": 0.1801, "step": 6010 }, { "epoch": 0.822298221614227, - "grad_norm": 1.4271350862928516, + "grad_norm": 1.4072584173963956, "learning_rate": 7.591270247665595e-07, - "loss": 0.2172, + "loss": 0.2143, "step": 6011 }, { "epoch": 0.8224350205198359, - "grad_norm": 1.2821076046798754, + "grad_norm": 1.272135607676794, "learning_rate": 7.579891440007103e-07, - "loss": 0.1919, + "loss": 0.1924, "step": 6012 }, { "epoch": 0.8225718194254446, - "grad_norm": 1.25046324456197, + "grad_norm": 1.2370056233760631, "learning_rate": 7.568520467308205e-07, - "loss": 0.1673, + "loss": 0.1676, "step": 6013 }, { "epoch": 0.8227086183310534, - "grad_norm": 1.2005440590246386, + "grad_norm": 1.2004975686148693, "learning_rate": 7.557157331669085e-07, - "loss": 0.1455, + "loss": 0.1447, "step": 6014 }, { "epoch": 0.8228454172366622, - "grad_norm": 1.1714910745454998, + "grad_norm": 1.1644558356429344, "learning_rate": 7.545802035188515e-07, - "loss": 0.152, + "loss": 0.1512, "step": 6015 }, { "epoch": 0.8229822161422709, - "grad_norm": 1.2645132185109291, + "grad_norm": 1.2642266545522152, "learning_rate": 7.53445457996383e-07, - "loss": 0.1806, + "loss": 0.1809, "step": 6016 }, { "epoch": 0.8231190150478797, - "grad_norm": 0.9965650711092898, + "grad_norm": 0.9821484028621559, "learning_rate": 7.523114968090862e-07, - "loss": 0.1807, + "loss": 0.1798, "step": 6017 }, { "epoch": 0.8232558139534883, - "grad_norm": 1.53888707925514, + "grad_norm": 1.5007920318900234, "learning_rate": 7.511783201664053e-07, - "loss": 0.189, + "loss": 0.1865, "step": 6018 }, { "epoch": 0.8233926128590972, - "grad_norm": 1.4598074202063658, + "grad_norm": 1.4396282330142212, "learning_rate": 7.500459282776351e-07, - "loss": 0.189, + "loss": 0.1894, "step": 6019 }, { "epoch": 0.8235294117647058, - "grad_norm": 1.4320782333843265, + "grad_norm": 1.4260942352304093, "learning_rate": 7.489143213519301e-07, - "loss": 0.2022, + "loss": 0.2025, "step": 6020 }, { "epoch": 0.8236662106703146, - "grad_norm": 1.2934678496406162, + "grad_norm": 1.2811292315253249, "learning_rate": 7.477834995982941e-07, - "loss": 0.1611, + "loss": 0.1618, "step": 6021 }, { "epoch": 0.8238030095759233, - "grad_norm": 1.2593028631583791, + "grad_norm": 1.2563966059549465, "learning_rate": 7.466534632255917e-07, - "loss": 0.1641, + "loss": 0.1657, "step": 6022 }, { "epoch": 0.8239398084815321, - "grad_norm": 1.1355027967411706, + "grad_norm": 1.1333158608035607, "learning_rate": 7.455242124425377e-07, - "loss": 0.1721, + "loss": 0.1722, "step": 6023 }, { "epoch": 0.824076607387141, - "grad_norm": 1.220612436629235, + "grad_norm": 1.220478848031318, "learning_rate": 7.44395747457704e-07, - "loss": 0.1914, + "loss": 0.1898, "step": 6024 }, { "epoch": 0.8242134062927496, - "grad_norm": 1.1074355328873051, + "grad_norm": 1.1104013107797381, "learning_rate": 7.432680684795196e-07, - "loss": 0.1459, + "loss": 0.147, "step": 6025 }, { "epoch": 0.8243502051983584, - "grad_norm": 1.1620824489123345, + "grad_norm": 1.170291844431054, "learning_rate": 7.421411757162622e-07, - "loss": 0.1778, + "loss": 0.1785, "step": 6026 }, { "epoch": 0.8244870041039671, - "grad_norm": 1.2880772637091504, + "grad_norm": 1.290489526657506, "learning_rate": 7.410150693760704e-07, - "loss": 0.1946, + "loss": 0.1964, "step": 6027 }, { "epoch": 0.8246238030095759, - "grad_norm": 1.4603403735656637, + "grad_norm": 1.4573293808532106, "learning_rate": 7.398897496669338e-07, - "loss": 0.1872, + "loss": 0.1874, "step": 6028 }, { "epoch": 0.8247606019151846, - "grad_norm": 1.105305298518825, + "grad_norm": 1.1163097096056418, "learning_rate": 7.387652167967008e-07, - "loss": 0.1423, + "loss": 0.1434, "step": 6029 }, { "epoch": 0.8248974008207934, - "grad_norm": 1.2586820215301995, + "grad_norm": 1.2448278236909058, "learning_rate": 7.376414709730679e-07, - "loss": 0.1411, + "loss": 0.1409, "step": 6030 }, { "epoch": 0.8250341997264022, - "grad_norm": 1.218983946170611, + "grad_norm": 1.2233504848866317, "learning_rate": 7.365185124035923e-07, - "loss": 0.1918, + "loss": 0.1945, "step": 6031 }, { "epoch": 0.8251709986320109, - "grad_norm": 1.3133742782865474, + "grad_norm": 1.2965240114675038, "learning_rate": 7.353963412956838e-07, - "loss": 0.1909, + "loss": 0.195, "step": 6032 }, { "epoch": 0.8253077975376197, - "grad_norm": 1.1519873364768776, + "grad_norm": 1.1276254474624647, "learning_rate": 7.342749578566044e-07, - "loss": 0.1735, + "loss": 0.1726, "step": 6033 }, { "epoch": 0.8254445964432284, - "grad_norm": 1.3389225679568386, + "grad_norm": 1.3216094064518393, "learning_rate": 7.331543622934756e-07, - "loss": 0.1594, + "loss": 0.1574, "step": 6034 }, { "epoch": 0.8255813953488372, - "grad_norm": 1.3514937320221994, + "grad_norm": 1.332080477306039, "learning_rate": 7.320345548132679e-07, - "loss": 0.1794, + "loss": 0.1784, "step": 6035 }, { "epoch": 0.8257181942544459, - "grad_norm": 1.131713797878076, + "grad_norm": 1.1299009393124992, "learning_rate": 7.309155356228109e-07, - "loss": 0.1757, + "loss": 0.1763, "step": 6036 }, { "epoch": 0.8258549931600547, - "grad_norm": 1.432514136003273, + "grad_norm": 1.4317472512867915, "learning_rate": 7.297973049287843e-07, - "loss": 0.2182, + "loss": 0.2203, "step": 6037 }, { "epoch": 0.8259917920656634, - "grad_norm": 1.266923094313876, + "grad_norm": 1.2507334107789507, "learning_rate": 7.286798629377273e-07, - "loss": 0.1667, + "loss": 0.1666, "step": 6038 }, { "epoch": 0.8261285909712722, - "grad_norm": 1.3193075917065993, + "grad_norm": 1.3145817821604018, "learning_rate": 7.27563209856027e-07, - "loss": 0.2049, + "loss": 0.2069, "step": 6039 }, { "epoch": 0.826265389876881, - "grad_norm": 1.3080486362131387, + "grad_norm": 1.3057698516206078, "learning_rate": 7.264473458899301e-07, - "loss": 0.1517, + "loss": 0.1526, "step": 6040 }, { "epoch": 0.8264021887824897, - "grad_norm": 1.1307274707000021, + "grad_norm": 1.1283805459564649, "learning_rate": 7.253322712455362e-07, - "loss": 0.1509, + "loss": 0.1502, "step": 6041 }, { "epoch": 0.8265389876880985, - "grad_norm": 1.1449121562288573, + "grad_norm": 1.1617199105989664, "learning_rate": 7.242179861287985e-07, - "loss": 0.1541, + "loss": 0.1567, "step": 6042 }, { "epoch": 0.8266757865937072, - "grad_norm": 1.2105915000541325, + "grad_norm": 1.1999852761359102, "learning_rate": 7.231044907455254e-07, - "loss": 0.1813, + "loss": 0.181, "step": 6043 }, { "epoch": 0.826812585499316, - "grad_norm": 0.9736961606035016, + "grad_norm": 0.9760899248722147, "learning_rate": 7.219917853013763e-07, - "loss": 0.1627, + "loss": 0.1635, "step": 6044 }, { "epoch": 0.8269493844049247, - "grad_norm": 0.9820549985829575, + "grad_norm": 0.9789591099692336, "learning_rate": 7.208798700018693e-07, - "loss": 0.1515, + "loss": 0.1526, "step": 6045 }, { "epoch": 0.8270861833105335, - "grad_norm": 1.206418601548227, + "grad_norm": 1.1709129446560198, "learning_rate": 7.197687450523721e-07, - "loss": 0.154, + "loss": 0.1542, "step": 6046 }, { "epoch": 0.8272229822161423, - "grad_norm": 1.4498313513031924, + "grad_norm": 1.4146279111303024, "learning_rate": 7.186584106581096e-07, - "loss": 0.2062, + "loss": 0.2047, "step": 6047 }, { "epoch": 0.827359781121751, - "grad_norm": 1.002588799269683, + "grad_norm": 0.9943387572936082, "learning_rate": 7.175488670241609e-07, - "loss": 0.1411, + "loss": 0.1424, "step": 6048 }, { "epoch": 0.8274965800273598, - "grad_norm": 1.2875948605404024, + "grad_norm": 1.2897863900755002, "learning_rate": 7.164401143554562e-07, - "loss": 0.1597, + "loss": 0.1594, "step": 6049 }, { "epoch": 0.8276333789329685, - "grad_norm": 1.562471553454701, + "grad_norm": 1.563395265731998, "learning_rate": 7.153321528567819e-07, - "loss": 0.2273, + "loss": 0.2274, "step": 6050 }, { "epoch": 0.8277701778385773, - "grad_norm": 1.1644074197019236, + "grad_norm": 1.1501603996519472, "learning_rate": 7.142249827327763e-07, - "loss": 0.1609, + "loss": 0.1606, "step": 6051 }, { "epoch": 0.827906976744186, - "grad_norm": 1.151399896127959, + "grad_norm": 1.15056529623743, "learning_rate": 7.131186041879357e-07, - "loss": 0.1525, + "loss": 0.1523, "step": 6052 }, { "epoch": 0.8280437756497948, - "grad_norm": 1.2600322770773638, + "grad_norm": 1.2489323472441458, "learning_rate": 7.120130174266038e-07, - "loss": 0.1902, + "loss": 0.1905, "step": 6053 }, { "epoch": 0.8281805745554035, - "grad_norm": 1.4651960101138057, + "grad_norm": 1.4633383291461703, "learning_rate": 7.109082226529834e-07, "loss": 0.2438, "step": 6054 }, { "epoch": 0.8283173734610123, - "grad_norm": 1.2299550796370313, + "grad_norm": 1.2317371479123322, "learning_rate": 7.098042200711292e-07, - "loss": 0.1833, + "loss": 0.1851, "step": 6055 }, { "epoch": 0.8284541723666211, - "grad_norm": 1.269041091563579, + "grad_norm": 1.2667392183437907, "learning_rate": 7.0870100988495e-07, - "loss": 0.1561, + "loss": 0.1571, "step": 6056 }, { "epoch": 0.8285909712722298, - "grad_norm": 1.235484654399356, + "grad_norm": 1.2252819736026463, "learning_rate": 7.07598592298207e-07, - "loss": 0.1924, + "loss": 0.1921, "step": 6057 }, { "epoch": 0.8287277701778386, - "grad_norm": 1.3447246879824841, + "grad_norm": 1.3581017580350636, "learning_rate": 7.064969675145156e-07, - "loss": 0.1969, + "loss": 0.197, "step": 6058 }, { "epoch": 0.8288645690834473, - "grad_norm": 1.3351923283674278, + "grad_norm": 1.3373863548589193, "learning_rate": 7.053961357373467e-07, - "loss": 0.1709, + "loss": 0.1712, "step": 6059 }, { "epoch": 0.8290013679890561, - "grad_norm": 1.3695937361790569, + "grad_norm": 1.3693626381163035, "learning_rate": 7.042960971700208e-07, - "loss": 0.2073, + "loss": 0.2068, "step": 6060 }, { "epoch": 0.8291381668946648, - "grad_norm": 1.439601811183791, + "grad_norm": 1.4352628139900712, "learning_rate": 7.031968520157162e-07, - "loss": 0.2192, + "loss": 0.2193, "step": 6061 }, { "epoch": 0.8292749658002736, - "grad_norm": 1.0971893926289442, + "grad_norm": 1.097600904130661, "learning_rate": 7.020984004774605e-07, - "loss": 0.1815, + "loss": 0.184, "step": 6062 }, { "epoch": 0.8294117647058824, - "grad_norm": 1.2799592913694493, + "grad_norm": 1.2815266105451286, "learning_rate": 7.010007427581378e-07, - "loss": 0.1788, + "loss": 0.179, "step": 6063 }, { "epoch": 0.8295485636114911, - "grad_norm": 1.196197658968609, + "grad_norm": 1.1770725430049858, "learning_rate": 6.999038790604856e-07, - "loss": 0.1789, + "loss": 0.1778, "step": 6064 }, { "epoch": 0.8296853625170999, - "grad_norm": 1.1574866665151036, + "grad_norm": 1.1036228736362617, "learning_rate": 6.98807809587092e-07, - "loss": 0.1588, + "loss": 0.1581, "step": 6065 }, { "epoch": 0.8298221614227086, - "grad_norm": 1.1859300205030026, + "grad_norm": 1.1831846208507937, "learning_rate": 6.977125345404018e-07, - "loss": 0.1772, + "loss": 0.1794, "step": 6066 }, { "epoch": 0.8299589603283174, - "grad_norm": 1.4707179431971544, + "grad_norm": 1.4243897539166874, "learning_rate": 6.966180541227096e-07, - "loss": 0.2078, + "loss": 0.2055, "step": 6067 }, { "epoch": 0.8300957592339261, - "grad_norm": 1.0330110137247592, + "grad_norm": 1.0352052229135458, "learning_rate": 6.955243685361673e-07, - "loss": 0.1474, + "loss": 0.1486, "step": 6068 }, { "epoch": 0.8302325581395349, - "grad_norm": 1.3244214969319472, + "grad_norm": 1.3371416873833362, "learning_rate": 6.944314779827749e-07, - "loss": 0.1955, + "loss": 0.1964, "step": 6069 }, { "epoch": 0.8303693570451436, - "grad_norm": 1.5580820757062914, + "grad_norm": 1.547650712659972, "learning_rate": 6.933393826643902e-07, - "loss": 0.2128, + "loss": 0.2136, "step": 6070 }, { "epoch": 0.8305061559507524, - "grad_norm": 1.3287355696121566, + "grad_norm": 1.317069769912313, "learning_rate": 6.922480827827221e-07, - "loss": 0.1688, + "loss": 0.1699, "step": 6071 }, { "epoch": 0.8306429548563612, - "grad_norm": 1.2598552394276776, + "grad_norm": 1.2525942998730368, "learning_rate": 6.911575785393327e-07, - "loss": 0.2076, + "loss": 0.2067, "step": 6072 }, { "epoch": 0.8307797537619699, - "grad_norm": 1.2789800633782133, + "grad_norm": 1.2571731710512084, "learning_rate": 6.900678701356378e-07, - "loss": 0.1788, + "loss": 0.177, "step": 6073 }, { "epoch": 0.8309165526675787, - "grad_norm": 1.3338575733781601, + "grad_norm": 1.3366925257871485, "learning_rate": 6.889789577729044e-07, - "loss": 0.2246, + "loss": 0.2285, "step": 6074 }, { "epoch": 0.8310533515731874, - "grad_norm": 1.3980110244414834, + "grad_norm": 1.3186875636760993, "learning_rate": 6.878908416522562e-07, - "loss": 0.1826, + "loss": 0.1787, "step": 6075 }, { "epoch": 0.8311901504787962, - "grad_norm": 1.1630790374026154, + "grad_norm": 1.1477111803280022, "learning_rate": 6.868035219746639e-07, - "loss": 0.1597, + "loss": 0.1595, "step": 6076 }, { "epoch": 0.8313269493844049, - "grad_norm": 1.350364324615077, + "grad_norm": 1.3340379904925026, "learning_rate": 6.857169989409574e-07, - "loss": 0.2221, + "loss": 0.2234, "step": 6077 }, { "epoch": 0.8314637482900137, - "grad_norm": 1.1841745055784412, + "grad_norm": 1.172979455963, "learning_rate": 6.846312727518145e-07, - "loss": 0.1782, + "loss": 0.1796, "step": 6078 }, { "epoch": 0.8316005471956225, - "grad_norm": 1.3675600640876693, + "grad_norm": 1.337351600934958, "learning_rate": 6.835463436077689e-07, - "loss": 0.2014, + "loss": 0.2035, "step": 6079 }, { "epoch": 0.8317373461012312, - "grad_norm": 1.2760145227805777, + "grad_norm": 1.2747580354737245, "learning_rate": 6.824622117092078e-07, - "loss": 0.1859, + "loss": 0.1878, "step": 6080 }, { "epoch": 0.83187414500684, - "grad_norm": 1.1934919856098756, + "grad_norm": 1.2085983058016405, "learning_rate": 6.813788772563661e-07, - "loss": 0.1517, + "loss": 0.1541, "step": 6081 }, { "epoch": 0.8320109439124487, - "grad_norm": 1.4837026063997947, + "grad_norm": 1.4530284849792618, "learning_rate": 6.802963404493379e-07, - "loss": 0.1957, + "loss": 0.195, "step": 6082 }, { "epoch": 0.8321477428180575, - "grad_norm": 1.2388477310204904, + "grad_norm": 1.2124155537537036, "learning_rate": 6.792146014880646e-07, - "loss": 0.2083, + "loss": 0.2071, "step": 6083 }, { "epoch": 0.8322845417236662, - "grad_norm": 1.1433753922540089, + "grad_norm": 1.1398899763564438, "learning_rate": 6.781336605723432e-07, - "loss": 0.1476, + "loss": 0.1481, "step": 6084 }, { "epoch": 0.832421340629275, - "grad_norm": 1.3933537249231875, + "grad_norm": 1.3760646168123587, "learning_rate": 6.770535179018228e-07, - "loss": 0.1889, + "loss": 0.1873, "step": 6085 }, { "epoch": 0.8325581395348837, - "grad_norm": 1.0514387164733208, + "grad_norm": 1.062102785669445, "learning_rate": 6.759741736760062e-07, - "loss": 0.1713, + "loss": 0.1751, "step": 6086 }, { "epoch": 0.8326949384404925, - "grad_norm": 1.2048557917891394, + "grad_norm": 1.180278425151479, "learning_rate": 6.748956280942453e-07, - "loss": 0.1635, + "loss": 0.1622, "step": 6087 }, { "epoch": 0.8328317373461013, - "grad_norm": 1.124398433463489, + "grad_norm": 1.157675810413994, "learning_rate": 6.738178813557472e-07, - "loss": 0.1647, + "loss": 0.1653, "step": 6088 }, { "epoch": 0.83296853625171, - "grad_norm": 1.3934057032481235, + "grad_norm": 1.3623856504917344, "learning_rate": 6.727409336595719e-07, - "loss": 0.1803, + "loss": 0.179, "step": 6089 }, { "epoch": 0.8331053351573188, - "grad_norm": 1.327696488952939, + "grad_norm": 1.3161562463080718, "learning_rate": 6.716647852046293e-07, - "loss": 0.1728, + "loss": 0.1723, "step": 6090 }, { "epoch": 0.8332421340629275, - "grad_norm": 1.2365619409495985, + "grad_norm": 1.2326662182335666, "learning_rate": 6.705894361896842e-07, - "loss": 0.1814, + "loss": 0.1811, "step": 6091 }, { "epoch": 0.8333789329685363, - "grad_norm": 1.1398602846909205, + "grad_norm": 1.1222475558291898, "learning_rate": 6.695148868133516e-07, - "loss": 0.1846, + "loss": 0.1835, "step": 6092 }, { "epoch": 0.833515731874145, - "grad_norm": 1.2250886656374342, + "grad_norm": 1.219156326551256, "learning_rate": 6.684411372741017e-07, - "loss": 0.1708, + "loss": 0.1699, "step": 6093 }, { "epoch": 0.8336525307797538, - "grad_norm": 1.3721758099539323, + "grad_norm": 1.3773537869413885, "learning_rate": 6.673681877702532e-07, - "loss": 0.2119, + "loss": 0.212, "step": 6094 }, { "epoch": 0.8337893296853626, - "grad_norm": 1.3016362594125417, + "grad_norm": 1.2736683154091675, "learning_rate": 6.662960384999795e-07, - "loss": 0.191, + "loss": 0.1914, "step": 6095 }, { "epoch": 0.8339261285909713, - "grad_norm": 1.3658237913133937, + "grad_norm": 1.3654317598407577, "learning_rate": 6.652246896613068e-07, - "loss": 0.1916, + "loss": 0.1932, "step": 6096 }, { "epoch": 0.8340629274965801, - "grad_norm": 1.3285936464397317, + "grad_norm": 1.2875040755162517, "learning_rate": 6.641541414521114e-07, - "loss": 0.1897, + "loss": 0.1898, "step": 6097 }, { "epoch": 0.8341997264021888, - "grad_norm": 1.475207763734024, + "grad_norm": 1.485746735923004, "learning_rate": 6.630843940701232e-07, - "loss": 0.1953, + "loss": 0.1967, "step": 6098 }, { "epoch": 0.8343365253077976, - "grad_norm": 1.1966415871132312, + "grad_norm": 1.1776573623588056, "learning_rate": 6.620154477129226e-07, - "loss": 0.1306, + "loss": 0.13, "step": 6099 }, { "epoch": 0.8344733242134063, - "grad_norm": 1.3761955234782377, + "grad_norm": 1.3650793424429475, "learning_rate": 6.609473025779433e-07, - "loss": 0.2095, + "loss": 0.2092, "step": 6100 }, { "epoch": 0.8344733242134063, - "eval_loss": 0.17278394103050232, - "eval_runtime": 5.9062, - "eval_samples_per_second": 5.079, - "eval_steps_per_second": 1.355, + "eval_loss": 0.1731017529964447, + "eval_runtime": 5.9108, + "eval_samples_per_second": 5.075, + "eval_steps_per_second": 1.353, "step": 6100 }, { "epoch": 0.8346101231190151, - "grad_norm": 1.2984702887292832, + "grad_norm": 1.3128284328271151, "learning_rate": 6.598799588624721e-07, - "loss": 0.1627, + "loss": 0.1625, "step": 6101 }, { "epoch": 0.8347469220246237, - "grad_norm": 1.1916059339186715, + "grad_norm": 1.174191500124191, "learning_rate": 6.588134167636467e-07, - "loss": 0.1903, + "loss": 0.1914, "step": 6102 }, { "epoch": 0.8348837209302326, - "grad_norm": 1.2326590529047916, + "grad_norm": 1.2146844548944917, "learning_rate": 6.577476764784546e-07, - "loss": 0.1547, + "loss": 0.1556, "step": 6103 }, { "epoch": 0.8350205198358414, - "grad_norm": 1.0862003572667358, + "grad_norm": 1.0734695895664859, "learning_rate": 6.566827382037383e-07, - "loss": 0.1343, + "loss": 0.135, "step": 6104 }, { "epoch": 0.83515731874145, - "grad_norm": 1.3103180923996898, + "grad_norm": 1.2940404440852773, "learning_rate": 6.556186021361915e-07, - "loss": 0.1912, + "loss": 0.1924, "step": 6105 }, { "epoch": 0.8352941176470589, - "grad_norm": 1.2581977742329211, + "grad_norm": 1.2417670601209, "learning_rate": 6.545552684723583e-07, - "loss": 0.1646, + "loss": 0.1645, "step": 6106 }, { "epoch": 0.8354309165526675, - "grad_norm": 1.1182733436121304, + "grad_norm": 1.1146604884974791, "learning_rate": 6.53492737408637e-07, - "loss": 0.1547, + "loss": 0.1542, "step": 6107 }, { "epoch": 0.8355677154582763, - "grad_norm": 1.323153327399721, + "grad_norm": 1.3054289642204417, "learning_rate": 6.524310091412739e-07, - "loss": 0.1615, + "loss": 0.1586, "step": 6108 }, { "epoch": 0.835704514363885, - "grad_norm": 1.314537068994354, + "grad_norm": 1.2920765234087703, "learning_rate": 6.513700838663722e-07, - "loss": 0.2141, + "loss": 0.2121, "step": 6109 }, { "epoch": 0.8358413132694938, - "grad_norm": 1.2999863937147318, + "grad_norm": 1.283714471395015, "learning_rate": 6.503099617798814e-07, - "loss": 0.1598, + "loss": 0.1614, "step": 6110 }, { "epoch": 0.8359781121751027, - "grad_norm": 1.364160765863527, + "grad_norm": 1.312174348679162, "learning_rate": 6.492506430776058e-07, - "loss": 0.1796, + "loss": 0.1786, "step": 6111 }, { "epoch": 0.8361149110807113, - "grad_norm": 1.2295802735508372, + "grad_norm": 1.221833579877448, "learning_rate": 6.481921279552023e-07, - "loss": 0.1858, + "loss": 0.186, "step": 6112 }, { "epoch": 0.8362517099863201, - "grad_norm": 1.4335785502829703, + "grad_norm": 1.384122898274668, "learning_rate": 6.471344166081755e-07, - "loss": 0.1979, + "loss": 0.1951, "step": 6113 }, { "epoch": 0.8363885088919288, - "grad_norm": 1.3130280644997638, + "grad_norm": 1.3199028915670872, "learning_rate": 6.46077509231885e-07, - "loss": 0.218, + "loss": 0.22, "step": 6114 }, { "epoch": 0.8365253077975376, - "grad_norm": 1.191964432763641, + "grad_norm": 1.180351454045801, "learning_rate": 6.450214060215404e-07, - "loss": 0.191, + "loss": 0.1932, "step": 6115 }, { "epoch": 0.8366621067031463, - "grad_norm": 1.448537940566982, + "grad_norm": 1.4359830237845594, "learning_rate": 6.439661071722047e-07, - "loss": 0.2101, + "loss": 0.2114, "step": 6116 }, { "epoch": 0.8367989056087551, - "grad_norm": 1.2408740513412209, + "grad_norm": 1.2416175170983321, "learning_rate": 6.429116128787882e-07, - "loss": 0.1902, + "loss": 0.1909, "step": 6117 }, { "epoch": 0.8369357045143638, - "grad_norm": 1.3702286896494005, + "grad_norm": 1.3561647819791915, "learning_rate": 6.418579233360578e-07, - "loss": 0.1885, + "loss": 0.1887, "step": 6118 }, { "epoch": 0.8370725034199726, - "grad_norm": 1.098996560853008, + "grad_norm": 1.0966153553437903, "learning_rate": 6.408050387386261e-07, - "loss": 0.1735, + "loss": 0.1749, "step": 6119 }, { "epoch": 0.8372093023255814, - "grad_norm": 1.560886190319643, + "grad_norm": 1.5833502046609946, "learning_rate": 6.397529592809615e-07, - "loss": 0.202, + "loss": 0.205, "step": 6120 }, { "epoch": 0.8373461012311901, - "grad_norm": 1.3790466435263604, + "grad_norm": 1.3452900713540892, "learning_rate": 6.387016851573841e-07, - "loss": 0.2043, + "loss": 0.203, "step": 6121 }, { "epoch": 0.8374829001367989, - "grad_norm": 1.2352580619046327, + "grad_norm": 1.237234431380029, "learning_rate": 6.376512165620596e-07, - "loss": 0.1817, + "loss": 0.1809, "step": 6122 }, { "epoch": 0.8376196990424076, - "grad_norm": 1.1252415452334856, + "grad_norm": 1.1132191028331406, "learning_rate": 6.366015536890125e-07, - "loss": 0.1632, + "loss": 0.1617, "step": 6123 }, { "epoch": 0.8377564979480164, - "grad_norm": 1.2220578771236446, + "grad_norm": 1.2167047465233594, "learning_rate": 6.355526967321113e-07, - "loss": 0.1948, + "loss": 0.1959, "step": 6124 }, { "epoch": 0.8378932968536251, - "grad_norm": 1.2821260049707381, + "grad_norm": 1.27841623840939, "learning_rate": 6.345046458850818e-07, - "loss": 0.165, + "loss": 0.1666, "step": 6125 }, { "epoch": 0.8380300957592339, - "grad_norm": 1.2938970309492013, + "grad_norm": 1.2851806093582496, "learning_rate": 6.334574013414957e-07, - "loss": 0.1771, + "loss": 0.177, "step": 6126 }, { "epoch": 0.8381668946648427, - "grad_norm": 1.2446782265659917, + "grad_norm": 1.2286732969268548, "learning_rate": 6.324109632947795e-07, - "loss": 0.1962, + "loss": 0.1972, "step": 6127 }, { "epoch": 0.8383036935704514, - "grad_norm": 1.3993326293094497, + "grad_norm": 1.3786945968642987, "learning_rate": 6.313653319382107e-07, - "loss": 0.1658, + "loss": 0.1655, "step": 6128 }, { "epoch": 0.8384404924760602, - "grad_norm": 1.3334610647874892, + "grad_norm": 1.319509244671278, "learning_rate": 6.303205074649138e-07, - "loss": 0.1612, + "loss": 0.1609, "step": 6129 }, { "epoch": 0.8385772913816689, - "grad_norm": 1.4383527121290578, + "grad_norm": 1.4256324481842892, "learning_rate": 6.292764900678688e-07, - "loss": 0.1901, + "loss": 0.1888, "step": 6130 }, { "epoch": 0.8387140902872777, - "grad_norm": 1.1972396631257551, + "grad_norm": 1.1882712917901805, "learning_rate": 6.282332799399044e-07, - "loss": 0.1533, + "loss": 0.153, "step": 6131 }, { "epoch": 0.8388508891928864, - "grad_norm": 1.3244127677573634, + "grad_norm": 1.301336656152813, "learning_rate": 6.271908772737018e-07, - "loss": 0.1889, + "loss": 0.1883, "step": 6132 }, { "epoch": 0.8389876880984952, - "grad_norm": 1.449322419621713, + "grad_norm": 1.4369409055596571, "learning_rate": 6.261492822617898e-07, - "loss": 0.213, + "loss": 0.2119, "step": 6133 }, { "epoch": 0.8391244870041039, - "grad_norm": 1.428767994901574, + "grad_norm": 1.4102613605265537, "learning_rate": 6.251084950965525e-07, - "loss": 0.1759, + "loss": 0.1755, "step": 6134 }, { "epoch": 0.8392612859097127, - "grad_norm": 1.328513873499661, + "grad_norm": 1.3129264668894893, "learning_rate": 6.240685159702204e-07, - "loss": 0.1771, + "loss": 0.1758, "step": 6135 }, { "epoch": 0.8393980848153215, - "grad_norm": 1.717489791095997, + "grad_norm": 1.6967371723190086, "learning_rate": 6.230293450748775e-07, - "loss": 0.2721, + "loss": 0.274, "step": 6136 }, { "epoch": 0.8395348837209302, - "grad_norm": 1.5198084632422184, + "grad_norm": 1.5253998880415434, "learning_rate": 6.219909826024589e-07, - "loss": 0.2005, + "loss": 0.2029, "step": 6137 }, { "epoch": 0.839671682626539, - "grad_norm": 1.2940311094827937, + "grad_norm": 1.2672975171501146, "learning_rate": 6.209534287447472e-07, - "loss": 0.1933, + "loss": 0.1947, "step": 6138 }, { "epoch": 0.8398084815321477, - "grad_norm": 1.0004120872607698, + "grad_norm": 0.9831844469098325, "learning_rate": 6.1991668369338e-07, - "loss": 0.1604, + "loss": 0.1585, "step": 6139 }, { "epoch": 0.8399452804377565, - "grad_norm": 1.262430028614679, + "grad_norm": 1.239277962036094, "learning_rate": 6.188807476398412e-07, - "loss": 0.1827, + "loss": 0.1821, "step": 6140 }, { "epoch": 0.8400820793433652, - "grad_norm": 1.344000743372036, + "grad_norm": 1.315926426750684, "learning_rate": 6.178456207754696e-07, - "loss": 0.1823, + "loss": 0.1826, "step": 6141 }, { "epoch": 0.840218878248974, - "grad_norm": 1.301265245785484, + "grad_norm": 1.3084553496359985, "learning_rate": 6.168113032914496e-07, - "loss": 0.211, + "loss": 0.2124, "step": 6142 }, { "epoch": 0.8403556771545828, - "grad_norm": 1.263721993026387, + "grad_norm": 1.249954325147041, "learning_rate": 6.157777953788203e-07, - "loss": 0.1693, + "loss": 0.17, "step": 6143 }, { "epoch": 0.8404924760601915, - "grad_norm": 1.4353908891524443, + "grad_norm": 1.4340375136807852, "learning_rate": 6.147450972284697e-07, - "loss": 0.1933, + "loss": 0.1944, "step": 6144 }, { "epoch": 0.8406292749658003, - "grad_norm": 1.0958254052616119, + "grad_norm": 1.089227867405225, "learning_rate": 6.137132090311365e-07, - "loss": 0.1525, + "loss": 0.1532, "step": 6145 }, { "epoch": 0.840766073871409, - "grad_norm": 1.2668558117333721, + "grad_norm": 1.255803715857818, "learning_rate": 6.1268213097741e-07, "loss": 0.1948, "step": 6146 }, { "epoch": 0.8409028727770178, - "grad_norm": 0.9665515976650639, + "grad_norm": 0.9630594154287636, "learning_rate": 6.116518632577284e-07, - "loss": 0.1463, + "loss": 0.1467, "step": 6147 }, { "epoch": 0.8410396716826265, - "grad_norm": 1.0852708685371752, + "grad_norm": 1.0778116984678132, "learning_rate": 6.106224060623822e-07, - "loss": 0.1637, + "loss": 0.1656, "step": 6148 }, { "epoch": 0.8411764705882353, - "grad_norm": 1.3960550568266537, + "grad_norm": 1.3987926252401097, "learning_rate": 6.095937595815104e-07, - "loss": 0.1816, + "loss": 0.18, "step": 6149 }, { "epoch": 0.841313269493844, - "grad_norm": 1.4861923868381386, + "grad_norm": 1.4996907891560967, "learning_rate": 6.085659240051045e-07, - "loss": 0.207, + "loss": 0.2057, "step": 6150 }, { "epoch": 0.8414500683994528, - "grad_norm": 1.2393048406575062, + "grad_norm": 1.2421122792822132, "learning_rate": 6.07538899523003e-07, - "loss": 0.1871, + "loss": 0.1868, "step": 6151 }, { "epoch": 0.8415868673050616, - "grad_norm": 1.2525926205357418, + "grad_norm": 1.2338233829376812, "learning_rate": 6.065126863248977e-07, - "loss": 0.1693, + "loss": 0.1685, "step": 6152 }, { "epoch": 0.8417236662106703, - "grad_norm": 1.425882037383057, + "grad_norm": 1.4049244043427325, "learning_rate": 6.054872846003302e-07, - "loss": 0.2097, + "loss": 0.2077, "step": 6153 }, { "epoch": 0.8418604651162791, - "grad_norm": 1.268015887817325, + "grad_norm": 1.2532297760667483, "learning_rate": 6.044626945386894e-07, - "loss": 0.2033, + "loss": 0.2022, "step": 6154 }, { "epoch": 0.8419972640218878, - "grad_norm": 1.1796188296907955, + "grad_norm": 1.1588988655587238, "learning_rate": 6.034389163292181e-07, - "loss": 0.1952, + "loss": 0.1912, "step": 6155 }, { "epoch": 0.8421340629274966, - "grad_norm": 0.973501675636221, + "grad_norm": 0.9675179784395598, "learning_rate": 6.024159501610055e-07, - "loss": 0.1524, + "loss": 0.153, "step": 6156 }, { "epoch": 0.8422708618331053, - "grad_norm": 0.95232088913725, + "grad_norm": 0.9447053826619073, "learning_rate": 6.013937962229932e-07, - "loss": 0.1457, + "loss": 0.1462, "step": 6157 }, { "epoch": 0.8424076607387141, - "grad_norm": 1.2349180383169707, + "grad_norm": 1.2245033511125847, "learning_rate": 6.00372454703973e-07, - "loss": 0.1823, + "loss": 0.1803, "step": 6158 }, { "epoch": 0.8425444596443229, - "grad_norm": 1.2960615530583057, + "grad_norm": 1.2727665640281802, "learning_rate": 5.993519257925861e-07, - "loss": 0.1758, + "loss": 0.176, "step": 6159 }, { "epoch": 0.8426812585499316, - "grad_norm": 1.3422036707964473, + "grad_norm": 1.3374891339805386, "learning_rate": 5.98332209677322e-07, - "loss": 0.1792, + "loss": 0.1782, "step": 6160 }, { "epoch": 0.8428180574555404, - "grad_norm": 1.1606431918288131, + "grad_norm": 1.1686242854249655, "learning_rate": 5.973133065465214e-07, - "loss": 0.1941, + "loss": 0.1951, "step": 6161 }, { "epoch": 0.8429548563611491, - "grad_norm": 1.2793353808195485, + "grad_norm": 1.2738864849425435, "learning_rate": 5.962952165883773e-07, - "loss": 0.2021, + "loss": 0.2032, "step": 6162 }, { "epoch": 0.8430916552667579, - "grad_norm": 1.3537523830296965, + "grad_norm": 1.342317905824626, "learning_rate": 5.95277939990927e-07, - "loss": 0.1885, + "loss": 0.1841, "step": 6163 }, { "epoch": 0.8432284541723666, - "grad_norm": 1.1555252936386302, + "grad_norm": 1.1311655298884453, "learning_rate": 5.942614769420629e-07, - "loss": 0.152, + "loss": 0.1511, "step": 6164 }, { "epoch": 0.8433652530779754, - "grad_norm": 1.2529472263671757, + "grad_norm": 1.226226126926223, "learning_rate": 5.932458276295234e-07, - "loss": 0.1767, + "loss": 0.1764, "step": 6165 }, { "epoch": 0.8435020519835841, - "grad_norm": 1.0596849455962434, + "grad_norm": 1.0669584790224724, "learning_rate": 5.922309922408998e-07, - "loss": 0.1429, + "loss": 0.1448, "step": 6166 }, { "epoch": 0.8436388508891929, - "grad_norm": 1.2628315758920556, + "grad_norm": 1.246272328754559, "learning_rate": 5.912169709636289e-07, - "loss": 0.1828, + "loss": 0.1822, "step": 6167 }, { "epoch": 0.8437756497948017, - "grad_norm": 1.3740552893159033, + "grad_norm": 1.3566536013685693, "learning_rate": 5.902037639850011e-07, - "loss": 0.2075, + "loss": 0.2101, "step": 6168 }, { "epoch": 0.8439124487004104, - "grad_norm": 1.2909804781799268, + "grad_norm": 1.2801195709534057, "learning_rate": 5.89191371492156e-07, - "loss": 0.157, + "loss": 0.1565, "step": 6169 }, { "epoch": 0.8440492476060192, - "grad_norm": 1.1747447942088092, + "grad_norm": 1.1725856866825717, "learning_rate": 5.88179793672079e-07, - "loss": 0.1881, + "loss": 0.1874, "step": 6170 }, { "epoch": 0.8441860465116279, - "grad_norm": 1.154522390341685, + "grad_norm": 1.1477719404340159, "learning_rate": 5.871690307116107e-07, - "loss": 0.1761, + "loss": 0.1765, "step": 6171 }, { "epoch": 0.8443228454172367, - "grad_norm": 1.195313637459218, + "grad_norm": 1.195796122605059, "learning_rate": 5.86159082797435e-07, - "loss": 0.1561, + "loss": 0.1573, "step": 6172 }, { "epoch": 0.8444596443228454, - "grad_norm": 1.254303367177393, + "grad_norm": 1.2406094035099853, "learning_rate": 5.851499501160907e-07, - "loss": 0.1827, + "loss": 0.1822, "step": 6173 }, { "epoch": 0.8445964432284542, - "grad_norm": 1.135899717026548, + "grad_norm": 1.146720451088128, "learning_rate": 5.841416328539635e-07, - "loss": 0.1425, + "loss": 0.1441, "step": 6174 }, { "epoch": 0.844733242134063, - "grad_norm": 1.3813494323321327, + "grad_norm": 1.3749291222089688, "learning_rate": 5.831341311972876e-07, - "loss": 0.1818, + "loss": 0.1829, "step": 6175 }, { "epoch": 0.8448700410396717, - "grad_norm": 1.1535051730639327, + "grad_norm": 1.1534630923616347, "learning_rate": 5.821274453321501e-07, - "loss": 0.1537, + "loss": 0.1539, "step": 6176 }, { "epoch": 0.8450068399452805, - "grad_norm": 1.257729454536416, + "grad_norm": 1.2492946941229557, "learning_rate": 5.811215754444827e-07, - "loss": 0.1595, + "loss": 0.159, "step": 6177 }, { "epoch": 0.8451436388508892, - "grad_norm": 1.2346412662799533, + "grad_norm": 1.222846527085895, "learning_rate": 5.801165217200705e-07, - "loss": 0.19, + "loss": 0.1924, "step": 6178 }, { "epoch": 0.845280437756498, - "grad_norm": 1.2543458175130857, + "grad_norm": 1.2421719707007395, "learning_rate": 5.791122843445446e-07, - "loss": 0.1797, + "loss": 0.179, "step": 6179 }, { "epoch": 0.8454172366621067, - "grad_norm": 1.0177883817971303, + "grad_norm": 1.0195663004889497, "learning_rate": 5.781088635033883e-07, - "loss": 0.1402, + "loss": 0.14, "step": 6180 }, { "epoch": 0.8455540355677155, - "grad_norm": 1.2221475518441869, + "grad_norm": 1.2156024031268824, "learning_rate": 5.771062593819304e-07, - "loss": 0.1839, + "loss": 0.1851, "step": 6181 }, { "epoch": 0.8456908344733242, - "grad_norm": 1.2933594794752863, + "grad_norm": 1.2724715723906566, "learning_rate": 5.761044721653542e-07, - "loss": 0.1879, + "loss": 0.1854, "step": 6182 }, { "epoch": 0.845827633378933, - "grad_norm": 1.2792460319866064, + "grad_norm": 1.2720323589552598, "learning_rate": 5.751035020386858e-07, - "loss": 0.1933, + "loss": 0.1948, "step": 6183 }, { "epoch": 0.8459644322845418, - "grad_norm": 1.4882318260527028, + "grad_norm": 1.4980341879698762, "learning_rate": 5.741033491868048e-07, - "loss": 0.2391, + "loss": 0.2388, "step": 6184 }, { "epoch": 0.8461012311901505, - "grad_norm": 1.2955150433558222, + "grad_norm": 1.275376303331281, "learning_rate": 5.731040137944399e-07, - "loss": 0.1749, + "loss": 0.1756, "step": 6185 }, { "epoch": 0.8462380300957593, - "grad_norm": 1.2785495029112979, + "grad_norm": 1.285963566558057, "learning_rate": 5.721054960461659e-07, "loss": 0.1876, "step": 6186 }, { "epoch": 0.846374829001368, - "grad_norm": 1.0711155715358864, + "grad_norm": 1.0544261964473243, "learning_rate": 5.711077961264083e-07, - "loss": 0.1623, + "loss": 0.1611, "step": 6187 }, { "epoch": 0.8465116279069768, - "grad_norm": 1.3666904388656564, + "grad_norm": 1.3632252494846455, "learning_rate": 5.701109142194422e-07, - "loss": 0.1845, + "loss": 0.1856, "step": 6188 }, { "epoch": 0.8466484268125855, - "grad_norm": 1.1345452122251531, + "grad_norm": 1.1394726089237204, "learning_rate": 5.691148505093913e-07, - "loss": 0.1618, + "loss": 0.1634, "step": 6189 }, { "epoch": 0.8467852257181943, - "grad_norm": 1.388147689898883, + "grad_norm": 1.3755491516118878, "learning_rate": 5.681196051802268e-07, - "loss": 0.1898, + "loss": 0.1909, "step": 6190 }, { "epoch": 0.8469220246238031, - "grad_norm": 0.9391875461387241, + "grad_norm": 0.9362162877717165, "learning_rate": 5.671251784157694e-07, - "loss": 0.1464, + "loss": 0.1472, "step": 6191 }, { "epoch": 0.8470588235294118, - "grad_norm": 1.0389123489176293, + "grad_norm": 1.0409942281557432, "learning_rate": 5.661315703996905e-07, - "loss": 0.1667, + "loss": 0.1685, "step": 6192 }, { "epoch": 0.8471956224350206, - "grad_norm": 1.1265642600525303, + "grad_norm": 1.1165453883463339, "learning_rate": 5.651387813155068e-07, - "loss": 0.1586, + "loss": 0.1569, "step": 6193 }, { "epoch": 0.8473324213406292, - "grad_norm": 1.2376307194808267, + "grad_norm": 1.241397933195782, "learning_rate": 5.641468113465881e-07, - "loss": 0.1788, + "loss": 0.1796, "step": 6194 }, { "epoch": 0.847469220246238, - "grad_norm": 1.3254009494174879, + "grad_norm": 1.321890021184022, "learning_rate": 5.631556606761474e-07, - "loss": 0.1931, + "loss": 0.1934, "step": 6195 }, { "epoch": 0.8476060191518467, - "grad_norm": 1.1609109400657303, + "grad_norm": 1.1486893736583117, "learning_rate": 5.621653294872515e-07, - "loss": 0.181, + "loss": 0.1795, "step": 6196 }, { "epoch": 0.8477428180574555, - "grad_norm": 1.5125982059337644, + "grad_norm": 1.4998506665205478, "learning_rate": 5.611758179628124e-07, - "loss": 0.228, + "loss": 0.2279, "step": 6197 }, { "epoch": 0.8478796169630642, - "grad_norm": 1.2432775058900942, + "grad_norm": 1.2372661369416622, "learning_rate": 5.60187126285594e-07, - "loss": 0.1725, + "loss": 0.1712, "step": 6198 }, { "epoch": 0.848016415868673, - "grad_norm": 1.1062849101833978, + "grad_norm": 1.1018498869676954, "learning_rate": 5.591992546382053e-07, - "loss": 0.1571, + "loss": 0.1573, "step": 6199 }, { "epoch": 0.8481532147742818, - "grad_norm": 1.4358363199600457, + "grad_norm": 1.4321992600180447, "learning_rate": 5.582122032031051e-07, - "loss": 0.1821, + "loss": 0.1837, "step": 6200 }, { "epoch": 0.8481532147742818, - "eval_loss": 0.17221897840499878, + "eval_loss": 0.17257362604141235, "eval_runtime": 5.9133, "eval_samples_per_second": 5.073, "eval_steps_per_second": 1.353, @@ -43906,7870 +43906,7870 @@ }, { "epoch": 0.8482900136798905, - "grad_norm": 1.107697313321834, + "grad_norm": 1.1086742472374593, "learning_rate": 5.572259721626028e-07, - "loss": 0.1455, + "loss": 0.149, "step": 6201 }, { "epoch": 0.8484268125854993, - "grad_norm": 1.2261845595590406, + "grad_norm": 1.2278216200960907, "learning_rate": 5.56240561698852e-07, - "loss": 0.2004, + "loss": 0.2021, "step": 6202 }, { "epoch": 0.848563611491108, - "grad_norm": 1.3220562956939423, + "grad_norm": 1.3306950646418079, "learning_rate": 5.552559719938589e-07, - "loss": 0.1645, + "loss": 0.1653, "step": 6203 }, { "epoch": 0.8487004103967168, - "grad_norm": 1.0075283587296775, + "grad_norm": 1.005674235506469, "learning_rate": 5.542722032294762e-07, - "loss": 0.1508, + "loss": 0.1524, "step": 6204 }, { "epoch": 0.8488372093023255, - "grad_norm": 1.3247727712556079, + "grad_norm": 1.3014831102505615, "learning_rate": 5.532892555874059e-07, - "loss": 0.2024, + "loss": 0.2032, "step": 6205 }, { "epoch": 0.8489740082079343, - "grad_norm": 1.3461982912371562, + "grad_norm": 1.3364713082455164, "learning_rate": 5.52307129249196e-07, - "loss": 0.1774, + "loss": 0.177, "step": 6206 }, { "epoch": 0.8491108071135431, - "grad_norm": 1.186047025782764, + "grad_norm": 1.1706661609368703, "learning_rate": 5.51325824396245e-07, - "loss": 0.1678, + "loss": 0.1675, "step": 6207 }, { "epoch": 0.8492476060191518, - "grad_norm": 1.2268203065275405, + "grad_norm": 1.1899266946276947, "learning_rate": 5.503453412098003e-07, - "loss": 0.1578, + "loss": 0.1573, "step": 6208 }, { "epoch": 0.8493844049247606, - "grad_norm": 1.4935170959178763, + "grad_norm": 1.4946528211272643, "learning_rate": 5.493656798709545e-07, - "loss": 0.2105, + "loss": 0.2135, "step": 6209 }, { "epoch": 0.8495212038303693, - "grad_norm": 1.2524973112646434, + "grad_norm": 1.2519836711845242, "learning_rate": 5.483868405606524e-07, - "loss": 0.1738, + "loss": 0.175, "step": 6210 }, { "epoch": 0.8496580027359781, - "grad_norm": 1.5024971180779771, + "grad_norm": 1.4803405096458835, "learning_rate": 5.474088234596825e-07, - "loss": 0.2128, + "loss": 0.2107, "step": 6211 }, { "epoch": 0.8497948016415868, - "grad_norm": 1.8107658002647014, + "grad_norm": 1.7660094651315508, "learning_rate": 5.464316287486859e-07, - "loss": 0.3034, + "loss": 0.3014, "step": 6212 }, { "epoch": 0.8499316005471956, - "grad_norm": 1.2189540433109165, + "grad_norm": 1.2072509477647002, "learning_rate": 5.454552566081472e-07, - "loss": 0.1562, + "loss": 0.1566, "step": 6213 }, { "epoch": 0.8500683994528043, - "grad_norm": 1.067711542068536, + "grad_norm": 1.0632520220454418, "learning_rate": 5.444797072184044e-07, - "loss": 0.1581, + "loss": 0.1578, "step": 6214 }, { "epoch": 0.8502051983584131, - "grad_norm": 1.3228353457321, + "grad_norm": 1.315899604109779, "learning_rate": 5.435049807596382e-07, - "loss": 0.1864, + "loss": 0.1855, "step": 6215 }, { "epoch": 0.8503419972640219, - "grad_norm": 1.1584701150386685, + "grad_norm": 1.156387327014229, "learning_rate": 5.425310774118802e-07, - "loss": 0.1637, + "loss": 0.1646, "step": 6216 }, { "epoch": 0.8504787961696306, - "grad_norm": 1.3228664868555744, + "grad_norm": 1.3126954992731228, "learning_rate": 5.415579973550111e-07, - "loss": 0.166, + "loss": 0.1656, "step": 6217 }, { "epoch": 0.8506155950752394, - "grad_norm": 1.4865370817548909, + "grad_norm": 1.4607053664379885, "learning_rate": 5.405857407687565e-07, - "loss": 0.1891, + "loss": 0.1887, "step": 6218 }, { "epoch": 0.8507523939808481, - "grad_norm": 1.1571166366421761, + "grad_norm": 1.170584136186181, "learning_rate": 5.396143078326932e-07, - "loss": 0.1717, + "loss": 0.1733, "step": 6219 }, { "epoch": 0.8508891928864569, - "grad_norm": 1.1663738046767076, + "grad_norm": 1.158345028201089, "learning_rate": 5.386436987262416e-07, - "loss": 0.1671, + "loss": 0.1664, "step": 6220 }, { "epoch": 0.8510259917920656, - "grad_norm": 1.316889842825151, + "grad_norm": 1.281858994395933, "learning_rate": 5.376739136286746e-07, - "loss": 0.1986, + "loss": 0.1977, "step": 6221 }, { "epoch": 0.8511627906976744, - "grad_norm": 1.2317398370825134, + "grad_norm": 1.2124587718177655, "learning_rate": 5.367049527191093e-07, "loss": 0.1766, "step": 6222 }, { "epoch": 0.8512995896032832, - "grad_norm": 1.3083081193430066, + "grad_norm": 1.2874460478796153, "learning_rate": 5.35736816176512e-07, - "loss": 0.1814, + "loss": 0.1805, "step": 6223 }, { "epoch": 0.8514363885088919, - "grad_norm": 1.2409125415782498, + "grad_norm": 1.2244902421515886, "learning_rate": 5.347695041796985e-07, - "loss": 0.1749, + "loss": 0.1738, "step": 6224 }, { "epoch": 0.8515731874145007, - "grad_norm": 1.037287863638327, + "grad_norm": 1.0204987905522744, "learning_rate": 5.338030169073277e-07, - "loss": 0.1488, + "loss": 0.1477, "step": 6225 }, { "epoch": 0.8517099863201094, - "grad_norm": 1.5059349745255934, + "grad_norm": 1.5336758689007002, "learning_rate": 5.328373545379123e-07, - "loss": 0.1903, + "loss": 0.1925, "step": 6226 }, { "epoch": 0.8518467852257182, - "grad_norm": 1.0350062831651117, + "grad_norm": 1.0290630245955132, "learning_rate": 5.318725172498063e-07, - "loss": 0.1631, + "loss": 0.165, "step": 6227 }, { "epoch": 0.8519835841313269, - "grad_norm": 1.1327727566659935, + "grad_norm": 1.1406201561890907, "learning_rate": 5.309085052212165e-07, - "loss": 0.1591, + "loss": 0.1601, "step": 6228 }, { "epoch": 0.8521203830369357, - "grad_norm": 1.033495880777211, + "grad_norm": 1.0303749240955926, "learning_rate": 5.299453186301934e-07, - "loss": 0.1562, + "loss": 0.1568, "step": 6229 }, { "epoch": 0.8522571819425444, - "grad_norm": 1.3139068456994019, + "grad_norm": 1.283762998008711, "learning_rate": 5.289829576546379e-07, - "loss": 0.1686, + "loss": 0.1671, "step": 6230 }, { "epoch": 0.8523939808481532, - "grad_norm": 0.976412572227311, + "grad_norm": 0.9713378471927221, "learning_rate": 5.280214224722968e-07, - "loss": 0.164, + "loss": 0.1634, "step": 6231 }, { "epoch": 0.852530779753762, - "grad_norm": 1.3217300663357323, + "grad_norm": 1.3096414817929722, "learning_rate": 5.270607132607664e-07, - "loss": 0.1675, + "loss": 0.1667, "step": 6232 }, { "epoch": 0.8526675786593707, - "grad_norm": 1.269096445852908, + "grad_norm": 1.2668629586131797, "learning_rate": 5.261008301974868e-07, - "loss": 0.1582, + "loss": 0.1578, "step": 6233 }, { "epoch": 0.8528043775649795, - "grad_norm": 1.2116966876151156, + "grad_norm": 1.21070127665506, "learning_rate": 5.251417734597481e-07, - "loss": 0.1706, + "loss": 0.1718, "step": 6234 }, { "epoch": 0.8529411764705882, - "grad_norm": 1.4499010360807056, + "grad_norm": 1.4664993296717375, "learning_rate": 5.241835432246888e-07, - "loss": 0.2049, + "loss": 0.2064, "step": 6235 }, { "epoch": 0.853077975376197, - "grad_norm": 1.340788265471763, + "grad_norm": 1.325292788965593, "learning_rate": 5.232261396692911e-07, - "loss": 0.1658, + "loss": 0.1651, "step": 6236 }, { "epoch": 0.8532147742818057, - "grad_norm": 1.440472391135183, + "grad_norm": 1.4246307151271917, "learning_rate": 5.222695629703889e-07, - "loss": 0.2183, + "loss": 0.2189, "step": 6237 }, { "epoch": 0.8533515731874145, - "grad_norm": 1.188255151879258, + "grad_norm": 1.1666409856653956, "learning_rate": 5.213138133046586e-07, - "loss": 0.1709, + "loss": 0.1704, "step": 6238 }, { "epoch": 0.8534883720930233, - "grad_norm": 1.279406038595511, + "grad_norm": 1.2647418852011176, "learning_rate": 5.203588908486279e-07, - "loss": 0.1712, + "loss": 0.1691, "step": 6239 }, { "epoch": 0.853625170998632, - "grad_norm": 1.280225278710225, + "grad_norm": 1.2786130940806577, "learning_rate": 5.194047957786713e-07, - "loss": 0.1537, + "loss": 0.1532, "step": 6240 }, { "epoch": 0.8537619699042408, - "grad_norm": 0.9870814422176682, + "grad_norm": 0.9877683863114193, "learning_rate": 5.184515282710067e-07, - "loss": 0.1365, + "loss": 0.1369, "step": 6241 }, { "epoch": 0.8538987688098495, - "grad_norm": 1.4205982832563684, + "grad_norm": 1.4226572364952923, "learning_rate": 5.174990885017045e-07, - "loss": 0.1941, + "loss": 0.1951, "step": 6242 }, { "epoch": 0.8540355677154583, - "grad_norm": 1.6282330236651645, + "grad_norm": 1.5808435041567788, "learning_rate": 5.165474766466772e-07, - "loss": 0.2131, + "loss": 0.2138, "step": 6243 }, { "epoch": 0.854172366621067, - "grad_norm": 1.481013943450526, + "grad_norm": 1.4767214928756653, "learning_rate": 5.155966928816886e-07, - "loss": 0.2125, + "loss": 0.2102, "step": 6244 }, { "epoch": 0.8543091655266758, - "grad_norm": 1.2724183030120277, + "grad_norm": 1.2463592339959075, "learning_rate": 5.146467373823461e-07, - "loss": 0.1868, + "loss": 0.185, "step": 6245 }, { "epoch": 0.8544459644322845, - "grad_norm": 1.3407394700276212, + "grad_norm": 1.327315192295904, "learning_rate": 5.136976103241065e-07, - "loss": 0.1708, + "loss": 0.1716, "step": 6246 }, { "epoch": 0.8545827633378933, - "grad_norm": 1.3607144752181075, + "grad_norm": 1.3414356656777047, "learning_rate": 5.127493118822724e-07, - "loss": 0.1783, + "loss": 0.1791, "step": 6247 }, { "epoch": 0.8547195622435021, - "grad_norm": 1.3179587194592732, + "grad_norm": 1.3153785400326439, "learning_rate": 5.118018422319948e-07, "loss": 0.1831, "step": 6248 }, { "epoch": 0.8548563611491108, - "grad_norm": 1.004022787801636, + "grad_norm": 0.9891440826143754, "learning_rate": 5.108552015482709e-07, "loss": 0.1395, "step": 6249 }, { "epoch": 0.8549931600547196, - "grad_norm": 1.3616738291100692, + "grad_norm": 1.378452907104707, "learning_rate": 5.099093900059421e-07, - "loss": 0.1922, + "loss": 0.1935, "step": 6250 }, { "epoch": 0.8551299589603283, - "grad_norm": 1.0317511822752548, + "grad_norm": 1.010655181923126, "learning_rate": 5.089644077797018e-07, - "loss": 0.1546, + "loss": 0.1551, "step": 6251 }, { "epoch": 0.8552667578659371, - "grad_norm": 1.3616798501025047, + "grad_norm": 1.3378217427011994, "learning_rate": 5.080202550440849e-07, - "loss": 0.184, + "loss": 0.1819, "step": 6252 }, { "epoch": 0.8554035567715458, - "grad_norm": 1.1745412585525827, + "grad_norm": 1.176274449608955, "learning_rate": 5.070769319734781e-07, - "loss": 0.1508, + "loss": 0.1512, "step": 6253 }, { "epoch": 0.8555403556771546, - "grad_norm": 1.22697442279749, + "grad_norm": 1.2163512194490007, "learning_rate": 5.061344387421097e-07, - "loss": 0.1805, + "loss": 0.1812, "step": 6254 }, { "epoch": 0.8556771545827634, - "grad_norm": 1.2238998099590295, + "grad_norm": 1.2251967831737591, "learning_rate": 5.051927755240593e-07, - "loss": 0.1629, + "loss": 0.1638, "step": 6255 }, { "epoch": 0.8558139534883721, - "grad_norm": 1.1116579028796107, + "grad_norm": 1.09926315587622, "learning_rate": 5.042519424932512e-07, - "loss": 0.1531, + "loss": 0.1533, "step": 6256 }, { "epoch": 0.8559507523939809, - "grad_norm": 1.4020572369904438, + "grad_norm": 1.3975223080554728, "learning_rate": 5.033119398234559e-07, - "loss": 0.2215, + "loss": 0.2211, "step": 6257 }, { "epoch": 0.8560875512995896, - "grad_norm": 1.4867505153168445, + "grad_norm": 1.4899005052955212, "learning_rate": 5.023727676882922e-07, - "loss": 0.2008, + "loss": 0.2006, "step": 6258 }, { "epoch": 0.8562243502051984, - "grad_norm": 0.9631796544311924, + "grad_norm": 0.9523157071376511, "learning_rate": 5.014344262612226e-07, - "loss": 0.144, + "loss": 0.1432, "step": 6259 }, { "epoch": 0.8563611491108071, - "grad_norm": 1.3341722657237365, + "grad_norm": 1.3312452032791733, "learning_rate": 5.004969157155593e-07, - "loss": 0.1615, + "loss": 0.161, "step": 6260 }, { "epoch": 0.8564979480164159, - "grad_norm": 1.2453752658450363, + "grad_norm": 1.2340928067581185, "learning_rate": 4.995602362244589e-07, - "loss": 0.174, + "loss": 0.173, "step": 6261 }, { "epoch": 0.8566347469220246, - "grad_norm": 1.4792086962762512, + "grad_norm": 1.475013181485435, "learning_rate": 4.986243879609276e-07, - "loss": 0.2112, + "loss": 0.2136, "step": 6262 }, { "epoch": 0.8567715458276334, - "grad_norm": 1.1933435882591454, + "grad_norm": 1.2222759494199784, "learning_rate": 4.976893710978126e-07, - "loss": 0.1707, + "loss": 0.1759, "step": 6263 }, { "epoch": 0.8569083447332422, - "grad_norm": 1.1791767710964896, + "grad_norm": 1.190913987514959, "learning_rate": 4.967551858078129e-07, - "loss": 0.1495, + "loss": 0.1509, "step": 6264 }, { "epoch": 0.8570451436388509, - "grad_norm": 1.2943716487182675, + "grad_norm": 1.278192878705836, "learning_rate": 4.958218322634717e-07, - "loss": 0.1964, + "loss": 0.1959, "step": 6265 }, { "epoch": 0.8571819425444597, - "grad_norm": 1.0877991636089162, + "grad_norm": 1.0821994187393282, "learning_rate": 4.948893106371771e-07, - "loss": 0.1768, + "loss": 0.1759, "step": 6266 }, { "epoch": 0.8573187414500684, - "grad_norm": 1.3013156021835868, + "grad_norm": 1.2900500228925684, "learning_rate": 4.93957621101167e-07, - "loss": 0.2029, + "loss": 0.2003, "step": 6267 }, { "epoch": 0.8574555403556772, - "grad_norm": 1.4298984245339121, + "grad_norm": 1.4285984917422763, "learning_rate": 4.930267638275222e-07, - "loss": 0.1961, + "loss": 0.1972, "step": 6268 }, { "epoch": 0.8575923392612859, - "grad_norm": 1.3015413571466976, + "grad_norm": 1.2923260749220882, "learning_rate": 4.920967389881726e-07, - "loss": 0.1844, + "loss": 0.1841, "step": 6269 }, { "epoch": 0.8577291381668947, - "grad_norm": 1.3202408388162024, + "grad_norm": 1.3326452855842246, "learning_rate": 4.91167546754891e-07, - "loss": 0.1603, + "loss": 0.1618, "step": 6270 }, { "epoch": 0.8578659370725035, - "grad_norm": 1.2431535341900062, + "grad_norm": 1.2372686503514552, "learning_rate": 4.902391872993001e-07, - "loss": 0.1783, + "loss": 0.1788, "step": 6271 }, { "epoch": 0.8580027359781122, - "grad_norm": 1.38534658162394, + "grad_norm": 1.3680421640202565, "learning_rate": 4.893116607928677e-07, - "loss": 0.1736, + "loss": 0.1725, "step": 6272 }, { "epoch": 0.858139534883721, - "grad_norm": 1.2706189892941377, + "grad_norm": 1.2588949282687374, "learning_rate": 4.883849674069058e-07, - "loss": 0.1868, + "loss": 0.1875, "step": 6273 }, { "epoch": 0.8582763337893297, - "grad_norm": 1.3300440083052145, + "grad_norm": 1.333524938045253, "learning_rate": 4.874591073125745e-07, - "loss": 0.1899, + "loss": 0.1915, "step": 6274 }, { "epoch": 0.8584131326949385, - "grad_norm": 1.0846605003864913, + "grad_norm": 1.0736358563517718, "learning_rate": 4.865340806808788e-07, - "loss": 0.1513, + "loss": 0.1506, "step": 6275 }, { "epoch": 0.8585499316005472, - "grad_norm": 1.452167521326472, + "grad_norm": 1.4167872334060179, "learning_rate": 4.856098876826709e-07, - "loss": 0.2175, + "loss": 0.2179, "step": 6276 }, { "epoch": 0.858686730506156, - "grad_norm": 1.2668984365335119, + "grad_norm": 1.2482500009756063, "learning_rate": 4.846865284886482e-07, - "loss": 0.1689, + "loss": 0.1691, "step": 6277 }, { "epoch": 0.8588235294117647, - "grad_norm": 1.3581120980937098, + "grad_norm": 1.3766289313277944, "learning_rate": 4.837640032693558e-07, - "loss": 0.1839, + "loss": 0.1867, "step": 6278 }, { "epoch": 0.8589603283173735, - "grad_norm": 1.4322260742503388, + "grad_norm": 1.4211548765197415, "learning_rate": 4.828423121951814e-07, - "loss": 0.2281, + "loss": 0.2285, "step": 6279 }, { "epoch": 0.8590971272229823, - "grad_norm": 1.231292155522024, + "grad_norm": 1.2276015271565417, "learning_rate": 4.819214554363616e-07, - "loss": 0.1938, + "loss": 0.1944, "step": 6280 }, { "epoch": 0.859233926128591, - "grad_norm": 1.2076186768496264, + "grad_norm": 1.1865574585759717, "learning_rate": 4.810014331629781e-07, - "loss": 0.1619, + "loss": 0.162, "step": 6281 }, { "epoch": 0.8593707250341998, - "grad_norm": 1.2942810787134045, + "grad_norm": 1.2859333086511768, "learning_rate": 4.800822455449572e-07, - "loss": 0.1818, + "loss": 0.1807, "step": 6282 }, { "epoch": 0.8595075239398084, - "grad_norm": 1.3191360339857303, + "grad_norm": 1.3326351555592995, "learning_rate": 4.791638927520737e-07, - "loss": 0.1738, + "loss": 0.1749, "step": 6283 }, { "epoch": 0.8596443228454173, - "grad_norm": 1.5381246661182209, + "grad_norm": 1.514055852663519, "learning_rate": 4.782463749539445e-07, - "loss": 0.1851, + "loss": 0.1837, "step": 6284 }, { "epoch": 0.859781121751026, - "grad_norm": 1.230764095096939, + "grad_norm": 1.242830765536182, "learning_rate": 4.773296923200371e-07, - "loss": 0.1667, + "loss": 0.1671, "step": 6285 }, { "epoch": 0.8599179206566347, - "grad_norm": 1.4038988304764057, + "grad_norm": 1.4081673773275718, "learning_rate": 4.7641384501965925e-07, - "loss": 0.1672, + "loss": 0.1684, "step": 6286 }, { "epoch": 0.8600547195622436, - "grad_norm": 1.2307974569277749, + "grad_norm": 1.2119980818508187, "learning_rate": 4.754988332219684e-07, - "loss": 0.1914, + "loss": 0.1913, "step": 6287 }, { "epoch": 0.8601915184678522, - "grad_norm": 1.1950668950663126, + "grad_norm": 1.1700423244469504, "learning_rate": 4.7458465709596726e-07, - "loss": 0.1391, + "loss": 0.1382, "step": 6288 }, { "epoch": 0.860328317373461, - "grad_norm": 1.434503489001539, + "grad_norm": 1.4144468310632656, "learning_rate": 4.7367131681050226e-07, - "loss": 0.2306, + "loss": 0.2304, "step": 6289 }, { "epoch": 0.8604651162790697, - "grad_norm": 1.1407666127777216, + "grad_norm": 1.1273010873234575, "learning_rate": 4.727588125342669e-07, - "loss": 0.173, + "loss": 0.1724, "step": 6290 }, { "epoch": 0.8606019151846785, - "grad_norm": 1.2819592928898067, + "grad_norm": 1.264957610625352, "learning_rate": 4.7184714443580025e-07, - "loss": 0.1695, + "loss": 0.168, "step": 6291 }, { "epoch": 0.8607387140902872, - "grad_norm": 1.2838588886019504, + "grad_norm": 1.2765961005702033, "learning_rate": 4.7093631268348705e-07, - "loss": 0.1709, + "loss": 0.1702, "step": 6292 }, { "epoch": 0.860875512995896, - "grad_norm": 1.6006775276213967, + "grad_norm": 1.5840668653139847, "learning_rate": 4.700263174455566e-07, - "loss": 0.2316, + "loss": 0.2306, "step": 6293 }, { "epoch": 0.8610123119015047, - "grad_norm": 1.2257121003042695, + "grad_norm": 1.215287307355158, "learning_rate": 4.691171588900845e-07, - "loss": 0.1651, + "loss": 0.1663, "step": 6294 }, { "epoch": 0.8611491108071135, - "grad_norm": 1.1837965976448177, + "grad_norm": 1.1869505652003796, "learning_rate": 4.682088371849908e-07, - "loss": 0.1986, + "loss": 0.1974, "step": 6295 }, { "epoch": 0.8612859097127223, - "grad_norm": 1.4167443251109588, + "grad_norm": 1.4031058280344073, "learning_rate": 4.6730135249804245e-07, - "loss": 0.2228, + "loss": 0.2214, "step": 6296 }, { "epoch": 0.861422708618331, - "grad_norm": 1.4100223663784452, + "grad_norm": 1.4009564782834716, "learning_rate": 4.66394704996852e-07, - "loss": 0.1944, + "loss": 0.1938, "step": 6297 }, { "epoch": 0.8615595075239398, - "grad_norm": 1.22076621437997, + "grad_norm": 1.1968987092161325, "learning_rate": 4.6548889484887506e-07, - "loss": 0.186, + "loss": 0.1848, "step": 6298 }, { "epoch": 0.8616963064295485, - "grad_norm": 1.2779839143657414, + "grad_norm": 1.2768027994839748, "learning_rate": 4.6458392222141545e-07, - "loss": 0.1604, + "loss": 0.1619, "step": 6299 }, { "epoch": 0.8618331053351573, - "grad_norm": 1.2359178634415067, + "grad_norm": 1.238469701627642, "learning_rate": 4.63679787281619e-07, - "loss": 0.1827, + "loss": 0.1839, "step": 6300 }, { "epoch": 0.8618331053351573, - "eval_loss": 0.17189395427703857, - "eval_runtime": 5.9387, - "eval_samples_per_second": 5.052, - "eval_steps_per_second": 1.347, + "eval_loss": 0.17226830124855042, + "eval_runtime": 5.922, + "eval_samples_per_second": 5.066, + "eval_steps_per_second": 1.351, "step": 6300 }, { "epoch": 0.861969904240766, - "grad_norm": 1.3286485329706985, + "grad_norm": 1.3032393794202755, "learning_rate": 4.627764901964804e-07, - "loss": 0.1904, + "loss": 0.1896, "step": 6301 }, { "epoch": 0.8621067031463748, - "grad_norm": 1.2867773206585553, + "grad_norm": 1.278284095160989, "learning_rate": 4.618740311328368e-07, - "loss": 0.1642, + "loss": 0.1645, "step": 6302 }, { "epoch": 0.8622435020519836, - "grad_norm": 1.4036858458217447, + "grad_norm": 1.4082451318705196, "learning_rate": 4.609724102573715e-07, - "loss": 0.1929, + "loss": 0.1953, "step": 6303 }, { "epoch": 0.8623803009575923, - "grad_norm": 1.596407616057068, + "grad_norm": 1.573151779373308, "learning_rate": 4.600716277366152e-07, - "loss": 0.2609, + "loss": 0.261, "step": 6304 }, { "epoch": 0.8625170998632011, - "grad_norm": 1.032603828140544, + "grad_norm": 1.0140221782777723, "learning_rate": 4.5917168373693864e-07, - "loss": 0.1714, + "loss": 0.1703, "step": 6305 }, { "epoch": 0.8626538987688098, - "grad_norm": 1.5619683828091349, + "grad_norm": 1.5569081009967234, "learning_rate": 4.5827257842456285e-07, - "loss": 0.2319, + "loss": 0.2311, "step": 6306 }, { "epoch": 0.8627906976744186, - "grad_norm": 1.3083595313260772, + "grad_norm": 1.2091803890562949, "learning_rate": 4.573743119655516e-07, - "loss": 0.1896, + "loss": 0.1885, "step": 6307 }, { "epoch": 0.8629274965800273, - "grad_norm": 1.4040640466558603, + "grad_norm": 1.393511472199934, "learning_rate": 4.5647688452581385e-07, - "loss": 0.1988, + "loss": 0.1981, "step": 6308 }, { "epoch": 0.8630642954856361, - "grad_norm": 1.408513394822865, + "grad_norm": 1.3896428856066216, "learning_rate": 4.5558029627110313e-07, - "loss": 0.2042, + "loss": 0.2045, "step": 6309 }, { "epoch": 0.8632010943912448, - "grad_norm": 1.2196237390367322, + "grad_norm": 1.2250443835774347, "learning_rate": 4.546845473670203e-07, - "loss": 0.1882, + "loss": 0.1892, "step": 6310 }, { "epoch": 0.8633378932968536, - "grad_norm": 1.2136044691063985, + "grad_norm": 1.207841216878591, "learning_rate": 4.5378963797900687e-07, - "loss": 0.185, + "loss": 0.1841, "step": 6311 }, { "epoch": 0.8634746922024624, - "grad_norm": 1.4754275416508633, + "grad_norm": 1.450487236767727, "learning_rate": 4.528955682723529e-07, - "loss": 0.2279, + "loss": 0.227, "step": 6312 }, { "epoch": 0.8636114911080711, - "grad_norm": 1.2529737815090451, + "grad_norm": 1.2436000645470264, "learning_rate": 4.5200233841219353e-07, - "loss": 0.1632, + "loss": 0.1648, "step": 6313 }, { "epoch": 0.8637482900136799, - "grad_norm": 0.9845501921385162, + "grad_norm": 0.9962582525360718, "learning_rate": 4.511099485635062e-07, - "loss": 0.1378, + "loss": 0.1386, "step": 6314 }, { "epoch": 0.8638850889192886, - "grad_norm": 1.3507768902965596, + "grad_norm": 1.3366565823018515, "learning_rate": 4.5021839889111575e-07, - "loss": 0.1772, + "loss": 0.1792, "step": 6315 }, { "epoch": 0.8640218878248974, - "grad_norm": 1.5128373407420757, + "grad_norm": 1.4759869531953584, "learning_rate": 4.4932768955968877e-07, - "loss": 0.2071, + "loss": 0.2053, "step": 6316 }, { "epoch": 0.8641586867305061, - "grad_norm": 1.4510904295180589, + "grad_norm": 1.4688126004204605, "learning_rate": 4.4843782073374143e-07, - "loss": 0.1931, + "loss": 0.193, "step": 6317 }, { "epoch": 0.8642954856361149, - "grad_norm": 1.0753134207822488, + "grad_norm": 1.0828216106429123, "learning_rate": 4.475487925776284e-07, - "loss": 0.1681, + "loss": 0.1696, "step": 6318 }, { "epoch": 0.8644322845417237, - "grad_norm": 1.464862830527772, + "grad_norm": 1.4533584833509818, "learning_rate": 4.466606052555544e-07, - "loss": 0.2139, + "loss": 0.2125, "step": 6319 }, { "epoch": 0.8645690834473324, - "grad_norm": 1.3661183841140458, + "grad_norm": 1.363428267716989, "learning_rate": 4.457732589315672e-07, - "loss": 0.1762, + "loss": 0.1768, "step": 6320 }, { "epoch": 0.8647058823529412, - "grad_norm": 1.2437295296118993, + "grad_norm": 1.2338149300234231, "learning_rate": 4.448867537695578e-07, - "loss": 0.1592, + "loss": 0.1593, "step": 6321 }, { "epoch": 0.8648426812585499, - "grad_norm": 1.3828588155605588, + "grad_norm": 1.3799378491152523, "learning_rate": 4.4400108993326475e-07, - "loss": 0.201, + "loss": 0.2013, "step": 6322 }, { "epoch": 0.8649794801641587, - "grad_norm": 1.2869543212388104, + "grad_norm": 1.2936772148601539, "learning_rate": 4.431162675862677e-07, - "loss": 0.1746, + "loss": 0.1771, "step": 6323 }, { "epoch": 0.8651162790697674, - "grad_norm": 1.2594139320011655, + "grad_norm": 1.2527384539129818, "learning_rate": 4.422322868919937e-07, - "loss": 0.1798, + "loss": 0.1778, "step": 6324 }, { "epoch": 0.8652530779753762, - "grad_norm": 1.143162699452894, + "grad_norm": 1.1299174828846537, "learning_rate": 4.413491480137122e-07, - "loss": 0.1341, + "loss": 0.1335, "step": 6325 }, { "epoch": 0.8653898768809849, - "grad_norm": 1.3066019841984595, + "grad_norm": 1.3060701639409384, "learning_rate": 4.4046685111453933e-07, - "loss": 0.211, + "loss": 0.2123, "step": 6326 }, { "epoch": 0.8655266757865937, - "grad_norm": 1.3102303462673939, + "grad_norm": 1.2976737817300317, "learning_rate": 4.3958539635743314e-07, - "loss": 0.1635, + "loss": 0.164, "step": 6327 }, { "epoch": 0.8656634746922025, - "grad_norm": 1.5155568150999748, + "grad_norm": 1.5433632222363645, "learning_rate": 4.387047839051989e-07, - "loss": 0.2259, + "loss": 0.2283, "step": 6328 }, { "epoch": 0.8658002735978112, - "grad_norm": 1.0847559527506148, + "grad_norm": 1.085316888687535, "learning_rate": 4.378250139204854e-07, - "loss": 0.153, + "loss": 0.1544, "step": 6329 }, { "epoch": 0.86593707250342, - "grad_norm": 1.4922038282768662, + "grad_norm": 1.4787050267393564, "learning_rate": 4.369460865657843e-07, - "loss": 0.2146, + "loss": 0.2142, "step": 6330 }, { "epoch": 0.8660738714090287, - "grad_norm": 1.4431018923724992, + "grad_norm": 1.4374803683342567, "learning_rate": 4.3606800200343357e-07, - "loss": 0.2059, + "loss": 0.2042, "step": 6331 }, { "epoch": 0.8662106703146375, - "grad_norm": 1.3355874738895002, + "grad_norm": 1.3011874939283532, "learning_rate": 4.351907603956135e-07, "loss": 0.1758, "step": 6332 }, { "epoch": 0.8663474692202462, - "grad_norm": 1.1900692853556676, + "grad_norm": 1.167258900659974, "learning_rate": 4.3431436190435105e-07, - "loss": 0.1515, + "loss": 0.1498, "step": 6333 }, { "epoch": 0.866484268125855, - "grad_norm": 1.324721487167273, + "grad_norm": 1.3045210566205143, "learning_rate": 4.3343880669151574e-07, - "loss": 0.1864, + "loss": 0.185, "step": 6334 }, { "epoch": 0.8666210670314638, - "grad_norm": 1.4050528920504137, + "grad_norm": 1.3699084658478697, "learning_rate": 4.3256409491882257e-07, - "loss": 0.183, + "loss": 0.1801, "step": 6335 }, { "epoch": 0.8667578659370725, - "grad_norm": 1.2107095585186394, + "grad_norm": 1.2280953932017615, "learning_rate": 4.3169022674782956e-07, - "loss": 0.1569, + "loss": 0.1578, "step": 6336 }, { "epoch": 0.8668946648426813, - "grad_norm": 1.3584419261237377, + "grad_norm": 1.3572378689367852, "learning_rate": 4.308172023399393e-07, - "loss": 0.1804, + "loss": 0.1815, "step": 6337 }, { "epoch": 0.86703146374829, - "grad_norm": 1.2818086384547014, + "grad_norm": 1.263203525504787, "learning_rate": 4.2994502185639997e-07, - "loss": 0.1665, + "loss": 0.1661, "step": 6338 }, { "epoch": 0.8671682626538988, - "grad_norm": 0.8936912772605834, + "grad_norm": 0.8914805119989142, "learning_rate": 4.290736854583005e-07, - "loss": 0.1159, + "loss": 0.1163, "step": 6339 }, { "epoch": 0.8673050615595075, - "grad_norm": 1.5015537389712088, + "grad_norm": 1.4908012399664388, "learning_rate": 4.2820319330657836e-07, - "loss": 0.2082, + "loss": 0.2062, "step": 6340 }, { "epoch": 0.8674418604651163, - "grad_norm": 1.1720442048738882, + "grad_norm": 1.1684878439257682, "learning_rate": 4.273335455620098e-07, - "loss": 0.1597, + "loss": 0.1588, "step": 6341 }, { "epoch": 0.867578659370725, - "grad_norm": 1.3230433952173044, + "grad_norm": 1.321579712921038, "learning_rate": 4.264647423852214e-07, - "loss": 0.1937, + "loss": 0.1951, "step": 6342 }, { "epoch": 0.8677154582763338, - "grad_norm": 1.1432374100468459, + "grad_norm": 1.1370608847952481, "learning_rate": 4.255967839366776e-07, - "loss": 0.1432, + "loss": 0.1422, "step": 6343 }, { "epoch": 0.8678522571819426, - "grad_norm": 1.261081271288874, + "grad_norm": 1.2528466375967389, "learning_rate": 4.247296703766907e-07, - "loss": 0.1713, + "loss": 0.1714, "step": 6344 }, { "epoch": 0.8679890560875513, - "grad_norm": 0.9498015926171214, + "grad_norm": 0.9253426718501421, "learning_rate": 4.23863401865417e-07, - "loss": 0.1327, + "loss": 0.1338, "step": 6345 }, { "epoch": 0.8681258549931601, - "grad_norm": 1.1134725082486057, + "grad_norm": 1.1050361424513147, "learning_rate": 4.2299797856285354e-07, - "loss": 0.1593, + "loss": 0.1602, "step": 6346 }, { "epoch": 0.8682626538987688, - "grad_norm": 1.3063628561674039, + "grad_norm": 1.2923287696363108, "learning_rate": 4.2213340062884523e-07, - "loss": 0.1933, + "loss": 0.193, "step": 6347 }, { "epoch": 0.8683994528043776, - "grad_norm": 1.009936382103348, + "grad_norm": 1.0122854161899664, "learning_rate": 4.212696682230771e-07, - "loss": 0.1405, + "loss": 0.1418, "step": 6348 }, { "epoch": 0.8685362517099863, - "grad_norm": 1.228377221426908, + "grad_norm": 1.209748940886925, "learning_rate": 4.204067815050805e-07, - "loss": 0.1745, + "loss": 0.1717, "step": 6349 }, { "epoch": 0.8686730506155951, - "grad_norm": 1.1294919397682401, + "grad_norm": 1.106047447145595, "learning_rate": 4.195447406342301e-07, - "loss": 0.1568, + "loss": 0.1565, "step": 6350 }, { "epoch": 0.8688098495212039, - "grad_norm": 1.1781527880184972, + "grad_norm": 1.1429500860239763, "learning_rate": 4.1868354576974414e-07, - "loss": 0.197, + "loss": 0.1961, "step": 6351 }, { "epoch": 0.8689466484268126, - "grad_norm": 1.2748964059247907, + "grad_norm": 1.2669811232862325, "learning_rate": 4.178231970706859e-07, - "loss": 0.1803, + "loss": 0.179, "step": 6352 }, { "epoch": 0.8690834473324214, - "grad_norm": 1.3396128968071914, + "grad_norm": 1.327278437421947, "learning_rate": 4.1696369469595875e-07, - "loss": 0.2064, + "loss": 0.2049, "step": 6353 }, { "epoch": 0.8692202462380301, - "grad_norm": 1.3895100751916187, + "grad_norm": 1.3952830041973197, "learning_rate": 4.161050388043136e-07, - "loss": 0.1673, + "loss": 0.1672, "step": 6354 }, { "epoch": 0.8693570451436389, - "grad_norm": 1.469287134702659, + "grad_norm": 1.4705543379473909, "learning_rate": 4.152472295543425e-07, - "loss": 0.1689, + "loss": 0.1692, "step": 6355 }, { "epoch": 0.8694938440492476, - "grad_norm": 1.3685000849538138, + "grad_norm": 1.3583981610945168, "learning_rate": 4.143902671044836e-07, - "loss": 0.1867, + "loss": 0.1866, "step": 6356 }, { "epoch": 0.8696306429548564, - "grad_norm": 1.3543212435722303, + "grad_norm": 1.3443764794193491, "learning_rate": 4.135341516130148e-07, - "loss": 0.1548, + "loss": 0.1535, "step": 6357 }, { "epoch": 0.8697674418604651, - "grad_norm": 1.6068217102592652, + "grad_norm": 1.5890197001998612, "learning_rate": 4.1267888323806294e-07, - "loss": 0.2112, + "loss": 0.211, "step": 6358 }, { "epoch": 0.8699042407660739, - "grad_norm": 1.3279970324482873, + "grad_norm": 1.3206197773618453, "learning_rate": 4.1182446213759275e-07, - "loss": 0.226, + "loss": 0.2259, "step": 6359 }, { "epoch": 0.8700410396716827, - "grad_norm": 1.2570071171453059, + "grad_norm": 1.249124017045342, "learning_rate": 4.109708884694158e-07, - "loss": 0.1965, + "loss": 0.1952, "step": 6360 }, { "epoch": 0.8701778385772914, - "grad_norm": 1.1966154567037166, + "grad_norm": 1.1890490200287518, "learning_rate": 4.1011816239118816e-07, - "loss": 0.1794, + "loss": 0.1786, "step": 6361 }, { "epoch": 0.8703146374829002, - "grad_norm": 1.280315949098193, + "grad_norm": 1.287308733490468, "learning_rate": 4.09266284060405e-07, - "loss": 0.192, + "loss": 0.1916, "step": 6362 }, { "epoch": 0.8704514363885089, - "grad_norm": 1.063635382646766, + "grad_norm": 1.0302265442841823, "learning_rate": 4.084152536344088e-07, - "loss": 0.1586, + "loss": 0.1579, "step": 6363 }, { "epoch": 0.8705882352941177, - "grad_norm": 1.343511826693747, + "grad_norm": 1.366532307254173, "learning_rate": 4.0756507127038494e-07, - "loss": 0.2141, + "loss": 0.2133, "step": 6364 }, { "epoch": 0.8707250341997264, - "grad_norm": 1.3527653933218777, + "grad_norm": 1.3621429735023953, "learning_rate": 4.0671573712536106e-07, - "loss": 0.1449, + "loss": 0.1466, "step": 6365 }, { "epoch": 0.8708618331053352, - "grad_norm": 1.2892423257990113, + "grad_norm": 1.2719291360726523, "learning_rate": 4.058672513562073e-07, - "loss": 0.1524, + "loss": 0.1531, "step": 6366 }, { "epoch": 0.870998632010944, - "grad_norm": 1.4068163169043315, + "grad_norm": 1.3887996858226472, "learning_rate": 4.050196141196394e-07, - "loss": 0.2392, + "loss": 0.2384, "step": 6367 }, { "epoch": 0.8711354309165527, - "grad_norm": 1.1739815037637586, + "grad_norm": 1.1754950715053627, "learning_rate": 4.0417282557221547e-07, - "loss": 0.1722, + "loss": 0.1737, "step": 6368 }, { "epoch": 0.8712722298221615, - "grad_norm": 1.4211715374702796, + "grad_norm": 1.410334417360449, "learning_rate": 4.0332688587033543e-07, - "loss": 0.1954, + "loss": 0.1961, "step": 6369 }, { "epoch": 0.8714090287277702, - "grad_norm": 1.32835861095154, + "grad_norm": 1.3526915250133371, "learning_rate": 4.0248179517024535e-07, - "loss": 0.1929, + "loss": 0.1942, "step": 6370 }, { "epoch": 0.871545827633379, - "grad_norm": 1.1762873226393322, + "grad_norm": 1.154525518810473, "learning_rate": 4.0163755362803147e-07, "loss": 0.1725, "step": 6371 }, { "epoch": 0.8716826265389876, - "grad_norm": 1.3176738388573677, + "grad_norm": 1.3145263604428465, "learning_rate": 4.0079416139962526e-07, - "loss": 0.194, + "loss": 0.1936, "step": 6372 }, { "epoch": 0.8718194254445965, - "grad_norm": 1.2978524670522074, + "grad_norm": 1.298280074439641, "learning_rate": 3.9995161864079924e-07, - "loss": 0.2067, + "loss": 0.2068, "step": 6373 }, { "epoch": 0.8719562243502051, - "grad_norm": 1.1277281759672775, + "grad_norm": 1.116947216827313, "learning_rate": 3.9910992550717286e-07, - "loss": 0.175, + "loss": 0.1742, "step": 6374 }, { "epoch": 0.872093023255814, - "grad_norm": 1.2135954299144378, + "grad_norm": 1.2072951462474895, "learning_rate": 3.9826908215420344e-07, - "loss": 0.1703, + "loss": 0.1716, "step": 6375 }, { "epoch": 0.8722298221614228, - "grad_norm": 1.3547482444091856, + "grad_norm": 1.3580087430766647, "learning_rate": 3.974290887371951e-07, - "loss": 0.1917, + "loss": 0.1919, "step": 6376 }, { "epoch": 0.8723666210670314, - "grad_norm": 1.1810116737566152, + "grad_norm": 1.170637500796478, "learning_rate": 3.9658994541129537e-07, - "loss": 0.1714, + "loss": 0.1729, "step": 6377 }, { "epoch": 0.8725034199726402, - "grad_norm": 1.3869209877537854, + "grad_norm": 1.3847203611157437, "learning_rate": 3.9575165233149083e-07, - "loss": 0.1682, + "loss": 0.1688, "step": 6378 }, { "epoch": 0.8726402188782489, - "grad_norm": 1.3359239405691374, + "grad_norm": 1.318292521546459, "learning_rate": 3.949142096526154e-07, - "loss": 0.1875, + "loss": 0.1867, "step": 6379 }, { "epoch": 0.8727770177838577, - "grad_norm": 1.2967745493760983, + "grad_norm": 1.268976241601407, "learning_rate": 3.9407761752934316e-07, - "loss": 0.1915, + "loss": 0.1902, "step": 6380 }, { "epoch": 0.8729138166894664, - "grad_norm": 1.1035346979225107, + "grad_norm": 1.0859372748396796, "learning_rate": 3.9324187611619325e-07, - "loss": 0.158, + "loss": 0.1587, "step": 6381 }, { "epoch": 0.8730506155950752, - "grad_norm": 1.1628448024465052, + "grad_norm": 1.1622809759029877, "learning_rate": 3.9240698556752444e-07, - "loss": 0.1534, + "loss": 0.156, "step": 6382 }, { "epoch": 0.873187414500684, - "grad_norm": 1.3952914185432197, + "grad_norm": 1.415594880999221, "learning_rate": 3.9157294603754113e-07, "loss": 0.172, "step": 6383 }, { "epoch": 0.8733242134062927, - "grad_norm": 1.1218830330862652, + "grad_norm": 1.1073973621046862, "learning_rate": 3.9073975768029126e-07, - "loss": 0.1613, + "loss": 0.1615, "step": 6384 }, { "epoch": 0.8734610123119015, - "grad_norm": 1.3700555701011259, + "grad_norm": 1.2845295820932239, "learning_rate": 3.899074206496617e-07, - "loss": 0.1695, + "loss": 0.1705, "step": 6385 }, { "epoch": 0.8735978112175102, - "grad_norm": 1.2411504829467586, + "grad_norm": 1.245502001030317, "learning_rate": 3.890759350993867e-07, - "loss": 0.1666, + "loss": 0.1659, "step": 6386 }, { "epoch": 0.873734610123119, - "grad_norm": 1.0923654214833831, + "grad_norm": 1.0895380072871577, "learning_rate": 3.8824530118303895e-07, - "loss": 0.1421, + "loss": 0.1434, "step": 6387 }, { "epoch": 0.8738714090287277, - "grad_norm": 1.153554036156917, + "grad_norm": 1.1346622943038194, "learning_rate": 3.8741551905403743e-07, - "loss": 0.1647, + "loss": 0.1656, "step": 6388 }, { "epoch": 0.8740082079343365, - "grad_norm": 1.4837535631530796, + "grad_norm": 1.4644699520604219, "learning_rate": 3.8658658886564e-07, - "loss": 0.2253, + "loss": 0.225, "step": 6389 }, { "epoch": 0.8741450068399452, - "grad_norm": 1.4698665947502376, + "grad_norm": 1.468844080756227, "learning_rate": 3.8575851077095216e-07, - "loss": 0.1927, + "loss": 0.1949, "step": 6390 }, { "epoch": 0.874281805745554, - "grad_norm": 1.2241124631156026, + "grad_norm": 1.2139387013519212, "learning_rate": 3.8493128492291756e-07, - "loss": 0.1493, + "loss": 0.1486, "step": 6391 }, { "epoch": 0.8744186046511628, - "grad_norm": 1.1826491953122886, + "grad_norm": 1.176220593152615, "learning_rate": 3.841049114743239e-07, - "loss": 0.184, + "loss": 0.1829, "step": 6392 }, { "epoch": 0.8745554035567715, - "grad_norm": 1.309166777293787, + "grad_norm": 1.3146597047013084, "learning_rate": 3.832793905778026e-07, - "loss": 0.1531, + "loss": 0.1552, "step": 6393 }, { "epoch": 0.8746922024623803, - "grad_norm": 1.1871178038343977, + "grad_norm": 1.1877936809231233, "learning_rate": 3.824547223858266e-07, "loss": 0.149, "step": 6394 }, { "epoch": 0.874829001367989, - "grad_norm": 1.3897625239390201, + "grad_norm": 1.3679884153223902, "learning_rate": 3.8163090705071183e-07, - "loss": 0.1843, + "loss": 0.1856, "step": 6395 }, { "epoch": 0.8749658002735978, - "grad_norm": 1.1747518170083566, + "grad_norm": 1.1707708761268838, "learning_rate": 3.808079447246149e-07, - "loss": 0.1473, + "loss": 0.1469, "step": 6396 }, { "epoch": 0.8751025991792065, - "grad_norm": 1.1398995704502952, + "grad_norm": 1.1457484551631576, "learning_rate": 3.7998583555953814e-07, - "loss": 0.1481, + "loss": 0.1479, "step": 6397 }, { "epoch": 0.8752393980848153, - "grad_norm": 1.2970739282160837, + "grad_norm": 1.282503847371902, "learning_rate": 3.791645797073229e-07, - "loss": 0.1727, + "loss": 0.1725, "step": 6398 }, { "epoch": 0.8753761969904241, - "grad_norm": 1.131954675724247, + "grad_norm": 1.112334257396752, "learning_rate": 3.7834417731965445e-07, - "loss": 0.1509, + "loss": 0.1503, "step": 6399 }, { "epoch": 0.8755129958960328, - "grad_norm": 1.2902385437617627, + "grad_norm": 1.2942235013623038, "learning_rate": 3.7752462854806215e-07, - "loss": 0.198, + "loss": 0.1997, "step": 6400 }, { "epoch": 0.8755129958960328, - "eval_loss": 0.17218397557735443, - "eval_runtime": 5.918, - "eval_samples_per_second": 5.069, - "eval_steps_per_second": 1.352, + "eval_loss": 0.17245332896709442, + "eval_runtime": 5.909, + "eval_samples_per_second": 5.077, + "eval_steps_per_second": 1.354, "step": 6400 }, { "epoch": 0.8756497948016416, - "grad_norm": 1.208759857517395, + "grad_norm": 1.2036011614824313, "learning_rate": 3.7670593354391385e-07, - "loss": 0.173, + "loss": 0.1728, "step": 6401 }, { "epoch": 0.8757865937072503, - "grad_norm": 1.1527447573774676, + "grad_norm": 1.1502626256753299, "learning_rate": 3.7588809245842405e-07, - "loss": 0.1687, + "loss": 0.1691, "step": 6402 }, { "epoch": 0.8759233926128591, - "grad_norm": 1.2332581744230129, + "grad_norm": 1.2294682726887258, "learning_rate": 3.750711054426448e-07, - "loss": 0.1956, + "loss": 0.1943, "step": 6403 }, { "epoch": 0.8760601915184678, - "grad_norm": 1.2377206613353715, + "grad_norm": 1.2196237764881426, "learning_rate": 3.7425497264747536e-07, - "loss": 0.1439, + "loss": 0.1428, "step": 6404 }, { "epoch": 0.8761969904240766, - "grad_norm": 1.1690602380868955, + "grad_norm": 1.1608765815850675, "learning_rate": 3.734396942236529e-07, - "loss": 0.1408, + "loss": 0.1413, "step": 6405 }, { "epoch": 0.8763337893296853, - "grad_norm": 1.5535653121611872, + "grad_norm": 1.5343683577834963, "learning_rate": 3.726252703217592e-07, "loss": 0.1839, "step": 6406 }, { "epoch": 0.8764705882352941, - "grad_norm": 1.281193214728568, + "grad_norm": 1.2600814840808137, "learning_rate": 3.71811701092219e-07, - "loss": 0.1861, + "loss": 0.1852, "step": 6407 }, { "epoch": 0.8766073871409029, - "grad_norm": 1.048936338107226, + "grad_norm": 1.043664024561405, "learning_rate": 3.7099898668529645e-07, - "loss": 0.1622, + "loss": 0.1621, "step": 6408 }, { "epoch": 0.8767441860465116, - "grad_norm": 1.206102523066395, + "grad_norm": 1.1958618063055826, "learning_rate": 3.701871272510993e-07, - "loss": 0.1641, + "loss": 0.1638, "step": 6409 }, { "epoch": 0.8768809849521204, - "grad_norm": 1.1043764982012305, + "grad_norm": 1.1012371438614095, "learning_rate": 3.693761229395776e-07, - "loss": 0.1496, + "loss": 0.1513, "step": 6410 }, { "epoch": 0.8770177838577291, - "grad_norm": 1.292736524485238, + "grad_norm": 1.2958777588299981, "learning_rate": 3.6856597390052427e-07, - "loss": 0.1637, + "loss": 0.1646, "step": 6411 }, { "epoch": 0.8771545827633379, - "grad_norm": 1.31846742387177, + "grad_norm": 1.3156382721899624, "learning_rate": 3.6775668028357083e-07, - "loss": 0.1602, + "loss": 0.1596, "step": 6412 }, { "epoch": 0.8772913816689466, - "grad_norm": 1.3591092789670705, + "grad_norm": 1.3599550345837002, "learning_rate": 3.669482422381959e-07, - "loss": 0.21, + "loss": 0.2129, "step": 6413 }, { "epoch": 0.8774281805745554, - "grad_norm": 1.0306203052584255, + "grad_norm": 1.031732343950683, "learning_rate": 3.6614065991371517e-07, - "loss": 0.1367, + "loss": 0.1373, "step": 6414 }, { "epoch": 0.8775649794801642, - "grad_norm": 1.416465850745954, + "grad_norm": 1.4036465853839215, "learning_rate": 3.653339334592887e-07, - "loss": 0.1875, + "loss": 0.1863, "step": 6415 }, { "epoch": 0.8777017783857729, - "grad_norm": 1.355178567035026, + "grad_norm": 1.3428503092144253, "learning_rate": 3.6452806302392006e-07, - "loss": 0.1995, + "loss": 0.1988, "step": 6416 }, { "epoch": 0.8778385772913817, - "grad_norm": 1.2879597611937776, + "grad_norm": 1.2853443482068074, "learning_rate": 3.6372304875645026e-07, - "loss": 0.1707, + "loss": 0.1725, "step": 6417 }, { "epoch": 0.8779753761969904, - "grad_norm": 1.2176912192450902, + "grad_norm": 1.206499907255014, "learning_rate": 3.6291889080556695e-07, - "loss": 0.1612, + "loss": 0.1616, "step": 6418 }, { "epoch": 0.8781121751025992, - "grad_norm": 1.051456701441881, + "grad_norm": 1.0428447455655672, "learning_rate": 3.6211558931979586e-07, - "loss": 0.1643, + "loss": 0.165, "step": 6419 }, { "epoch": 0.8782489740082079, - "grad_norm": 1.1866266187968075, + "grad_norm": 1.2028255581140694, "learning_rate": 3.6131314444750764e-07, - "loss": 0.1717, + "loss": 0.1741, "step": 6420 }, { "epoch": 0.8783857729138167, - "grad_norm": 1.181438858700884, + "grad_norm": 1.1761842410782102, "learning_rate": 3.605115563369116e-07, - "loss": 0.1689, + "loss": 0.1688, "step": 6421 }, { "epoch": 0.8785225718194254, - "grad_norm": 1.458075930291315, + "grad_norm": 1.4627685117626013, "learning_rate": 3.597108251360609e-07, - "loss": 0.2374, + "loss": 0.2345, "step": 6422 }, { "epoch": 0.8786593707250342, - "grad_norm": 1.1962484334775083, + "grad_norm": 1.1848619158370592, "learning_rate": 3.5891095099285013e-07, - "loss": 0.147, + "loss": 0.1481, "step": 6423 }, { "epoch": 0.878796169630643, - "grad_norm": 1.5360475945971601, + "grad_norm": 1.5243308119589325, "learning_rate": 3.58111934055016e-07, - "loss": 0.2034, + "loss": 0.2046, "step": 6424 }, { "epoch": 0.8789329685362517, - "grad_norm": 1.0572693597777576, + "grad_norm": 1.0540067731119347, "learning_rate": 3.5731377447013605e-07, - "loss": 0.1417, + "loss": 0.1426, "step": 6425 }, { "epoch": 0.8790697674418605, - "grad_norm": 1.4194904736787048, + "grad_norm": 1.4069932261089453, "learning_rate": 3.5651647238562904e-07, - "loss": 0.204, + "loss": 0.2032, "step": 6426 }, { "epoch": 0.8792065663474692, - "grad_norm": 1.3973452921475271, + "grad_norm": 1.3827638951793637, "learning_rate": 3.557200279487566e-07, - "loss": 0.1611, + "loss": 0.1607, "step": 6427 }, { "epoch": 0.879343365253078, - "grad_norm": 1.337456768205147, + "grad_norm": 1.3382357274579215, "learning_rate": 3.549244413066211e-07, - "loss": 0.1886, + "loss": 0.1903, "step": 6428 }, { "epoch": 0.8794801641586867, - "grad_norm": 1.2453000292481249, + "grad_norm": 1.242397992961293, "learning_rate": 3.5412971260616713e-07, - "loss": 0.1897, + "loss": 0.1889, "step": 6429 }, { "epoch": 0.8796169630642955, - "grad_norm": 1.4135985618270657, + "grad_norm": 1.3953935898644465, "learning_rate": 3.533358419941796e-07, - "loss": 0.2109, + "loss": 0.2118, "step": 6430 }, { "epoch": 0.8797537619699043, - "grad_norm": 1.38152603679021, + "grad_norm": 1.3895531799559981, "learning_rate": 3.5254282961728616e-07, - "loss": 0.1908, + "loss": 0.193, "step": 6431 }, { "epoch": 0.879890560875513, - "grad_norm": 1.1907167748309717, + "grad_norm": 1.1871251303333126, "learning_rate": 3.517506756219563e-07, "loss": 0.1667, "step": 6432 }, { "epoch": 0.8800273597811218, - "grad_norm": 1.1934112399929004, + "grad_norm": 1.2076704191881653, "learning_rate": 3.5095938015449917e-07, - "loss": 0.2014, + "loss": 0.202, "step": 6433 }, { "epoch": 0.8801641586867305, - "grad_norm": 1.1360612466907196, + "grad_norm": 1.1217732410233954, "learning_rate": 3.501689433610672e-07, - "loss": 0.155, + "loss": 0.1548, "step": 6434 }, { "epoch": 0.8803009575923393, - "grad_norm": 1.5286832772541128, + "grad_norm": 1.507968024769663, "learning_rate": 3.493793653876526e-07, - "loss": 0.1729, + "loss": 0.1732, "step": 6435 }, { "epoch": 0.880437756497948, - "grad_norm": 1.3365766361773934, + "grad_norm": 1.323156270995239, "learning_rate": 3.485906463800903e-07, - "loss": 0.1757, + "loss": 0.1755, "step": 6436 }, { "epoch": 0.8805745554035568, - "grad_norm": 1.1963214108562898, + "grad_norm": 1.1773285148131247, "learning_rate": 3.4780278648405606e-07, - "loss": 0.1844, + "loss": 0.1864, "step": 6437 }, { "epoch": 0.8807113543091655, - "grad_norm": 1.1568213591562593, + "grad_norm": 1.14716452895294, "learning_rate": 3.4701578584506736e-07, - "loss": 0.1958, + "loss": 0.1956, "step": 6438 }, { "epoch": 0.8808481532147743, - "grad_norm": 1.3411981622147107, + "grad_norm": 1.3430851002353923, "learning_rate": 3.462296446084812e-07, - "loss": 0.1751, + "loss": 0.176, "step": 6439 }, { "epoch": 0.8809849521203831, - "grad_norm": 1.2225789003254883, + "grad_norm": 1.2200959416230233, "learning_rate": 3.454443629194987e-07, - "loss": 0.1705, + "loss": 0.1714, "step": 6440 }, { "epoch": 0.8811217510259918, - "grad_norm": 1.1904237483053695, + "grad_norm": 1.1845740142866639, "learning_rate": 3.4465994092316047e-07, - "loss": 0.1864, + "loss": 0.187, "step": 6441 }, { "epoch": 0.8812585499316006, - "grad_norm": 1.2643669278059695, + "grad_norm": 1.2473773208511827, "learning_rate": 3.438763787643479e-07, - "loss": 0.1908, + "loss": 0.191, "step": 6442 }, { "epoch": 0.8813953488372093, - "grad_norm": 1.1701621864726788, + "grad_norm": 1.1538454364524322, "learning_rate": 3.430936765877857e-07, - "loss": 0.1473, + "loss": 0.1472, "step": 6443 }, { "epoch": 0.8815321477428181, - "grad_norm": 1.2489052984126803, + "grad_norm": 1.254820540459443, "learning_rate": 3.4231183453803605e-07, - "loss": 0.1997, + "loss": 0.1998, "step": 6444 }, { "epoch": 0.8816689466484268, - "grad_norm": 1.1978479835155351, + "grad_norm": 1.1811698264587818, "learning_rate": 3.415308527595068e-07, - "loss": 0.1633, + "loss": 0.1635, "step": 6445 }, { "epoch": 0.8818057455540356, - "grad_norm": 1.5333262683724687, + "grad_norm": 1.5356447838233607, "learning_rate": 3.407507313964431e-07, - "loss": 0.1868, + "loss": 0.186, "step": 6446 }, { "epoch": 0.8819425444596444, - "grad_norm": 1.2337015304722183, + "grad_norm": 1.2322914438194938, "learning_rate": 3.39971470592933e-07, - "loss": 0.1803, + "loss": 0.1798, "step": 6447 }, { "epoch": 0.8820793433652531, - "grad_norm": 0.9353915726898654, + "grad_norm": 0.8895254098374832, "learning_rate": 3.391930704929064e-07, - "loss": 0.1628, + "loss": 0.1625, "step": 6448 }, { "epoch": 0.8822161422708619, - "grad_norm": 1.219518940953667, + "grad_norm": 1.2157081873082718, "learning_rate": 3.3841553124013217e-07, - "loss": 0.1651, + "loss": 0.1661, "step": 6449 }, { "epoch": 0.8823529411764706, - "grad_norm": 1.0453678753256739, + "grad_norm": 1.03639745704852, "learning_rate": 3.3763885297822153e-07, - "loss": 0.1572, + "loss": 0.158, "step": 6450 }, { "epoch": 0.8824897400820794, - "grad_norm": 1.4530248959554495, + "grad_norm": 1.447254112344199, "learning_rate": 3.3686303585062586e-07, - "loss": 0.1744, + "loss": 0.1751, "step": 6451 }, { "epoch": 0.8826265389876881, - "grad_norm": 1.2594914039504232, + "grad_norm": 1.2561543384051774, "learning_rate": 3.360880800006383e-07, - "loss": 0.1683, + "loss": 0.1702, "step": 6452 }, { "epoch": 0.8827633378932969, - "grad_norm": 1.4885717462271149, + "grad_norm": 1.481532597298852, "learning_rate": 3.353139855713927e-07, - "loss": 0.1874, + "loss": 0.1893, "step": 6453 }, { "epoch": 0.8829001367989056, - "grad_norm": 1.4485423494329703, + "grad_norm": 1.4380054682495929, "learning_rate": 3.3454075270586416e-07, - "loss": 0.2027, + "loss": 0.2017, "step": 6454 }, { "epoch": 0.8830369357045144, - "grad_norm": 1.054022767630115, + "grad_norm": 1.0744137854145652, "learning_rate": 3.3376838154686677e-07, - "loss": 0.1766, + "loss": 0.1813, "step": 6455 }, { "epoch": 0.8831737346101232, - "grad_norm": 1.3490055287479312, + "grad_norm": 1.321274464721852, "learning_rate": 3.3299687223705743e-07, - "loss": 0.2109, + "loss": 0.2095, "step": 6456 }, { "epoch": 0.8833105335157319, - "grad_norm": 1.2070914710718992, + "grad_norm": 1.197837680351852, "learning_rate": 3.322262249189351e-07, - "loss": 0.1477, + "loss": 0.1482, "step": 6457 }, { "epoch": 0.8834473324213407, - "grad_norm": 1.1116509221725186, + "grad_norm": 1.1006742727227319, "learning_rate": 3.314564397348347e-07, - "loss": 0.1623, + "loss": 0.162, "step": 6458 }, { "epoch": 0.8835841313269494, - "grad_norm": 1.2650226602326733, + "grad_norm": 1.2455916427598084, "learning_rate": 3.3068751682693687e-07, - "loss": 0.1625, + "loss": 0.1605, "step": 6459 }, { "epoch": 0.8837209302325582, - "grad_norm": 1.3556255509911874, + "grad_norm": 1.3498884620122094, "learning_rate": 3.299194563372604e-07, - "loss": 0.2005, + "loss": 0.2002, "step": 6460 }, { "epoch": 0.8838577291381668, - "grad_norm": 1.230855242760536, + "grad_norm": 1.2325464591546833, "learning_rate": 3.291522584076656e-07, - "loss": 0.1638, + "loss": 0.1648, "step": 6461 }, { "epoch": 0.8839945280437757, - "grad_norm": 1.3855010637443566, + "grad_norm": 1.2490307958664417, "learning_rate": 3.2838592317985307e-07, - "loss": 0.198, + "loss": 0.1965, "step": 6462 }, { "epoch": 0.8841313269493845, - "grad_norm": 1.4670527379920542, + "grad_norm": 1.47699432044498, "learning_rate": 3.2762045079536453e-07, - "loss": 0.1747, + "loss": 0.176, "step": 6463 }, { "epoch": 0.8842681258549931, - "grad_norm": 1.5164697666048894, + "grad_norm": 1.5263539781845783, "learning_rate": 3.2685584139558244e-07, - "loss": 0.2079, + "loss": 0.2081, "step": 6464 }, { "epoch": 0.884404924760602, - "grad_norm": 1.259405799589087, + "grad_norm": 1.2507759824112517, "learning_rate": 3.2609209512172833e-07, - "loss": 0.2026, + "loss": 0.202, "step": 6465 }, { "epoch": 0.8845417236662106, - "grad_norm": 1.4870347968894446, + "grad_norm": 1.4423668341224452, "learning_rate": 3.253292121148666e-07, - "loss": 0.1624, + "loss": 0.1586, "step": 6466 }, { "epoch": 0.8846785225718194, - "grad_norm": 1.3352281310587908, + "grad_norm": 1.3204899812284252, "learning_rate": 3.2456719251590107e-07, - "loss": 0.1992, + "loss": 0.1991, "step": 6467 }, { "epoch": 0.8848153214774281, - "grad_norm": 1.0363442586238742, + "grad_norm": 1.0246853251327206, "learning_rate": 3.2380603646557654e-07, - "loss": 0.1386, + "loss": 0.1397, "step": 6468 }, { "epoch": 0.8849521203830369, - "grad_norm": 1.5046177113699537, + "grad_norm": 1.4768287610724304, "learning_rate": 3.2304574410447653e-07, - "loss": 0.2203, + "loss": 0.2214, "step": 6469 }, { "epoch": 0.8850889192886456, - "grad_norm": 1.260698977622805, + "grad_norm": 1.2636163118361594, "learning_rate": 3.2228631557302767e-07, - "loss": 0.1795, + "loss": 0.1802, "step": 6470 }, { "epoch": 0.8852257181942544, - "grad_norm": 1.2664681865123437, + "grad_norm": 1.2583838603639477, "learning_rate": 3.21527751011495e-07, - "loss": 0.162, + "loss": 0.1622, "step": 6471 }, { "epoch": 0.8853625170998632, - "grad_norm": 1.4086588990899898, + "grad_norm": 1.3758789465551298, "learning_rate": 3.2077005055998534e-07, - "loss": 0.2087, + "loss": 0.2083, "step": 6472 }, { "epoch": 0.8854993160054719, - "grad_norm": 1.2479707343270872, + "grad_norm": 1.2364650377616981, "learning_rate": 3.20013214358445e-07, - "loss": 0.1869, + "loss": 0.1849, "step": 6473 }, { "epoch": 0.8856361149110807, - "grad_norm": 1.0764677592414609, + "grad_norm": 1.0639632028096766, "learning_rate": 3.192572425466611e-07, - "loss": 0.1895, + "loss": 0.1887, "step": 6474 }, { "epoch": 0.8857729138166894, - "grad_norm": 1.2496896160953073, + "grad_norm": 1.2451467801624552, "learning_rate": 3.1850213526426143e-07, - "loss": 0.1676, + "loss": 0.1691, "step": 6475 }, { "epoch": 0.8859097127222982, - "grad_norm": 1.0340754355715247, + "grad_norm": 1.0284877597542201, "learning_rate": 3.177478926507127e-07, - "loss": 0.1614, + "loss": 0.1615, "step": 6476 }, { "epoch": 0.8860465116279069, - "grad_norm": 1.1941135771096985, + "grad_norm": 1.1733584477156207, "learning_rate": 3.1699451484532464e-07, - "loss": 0.1629, + "loss": 0.1621, "step": 6477 }, { "epoch": 0.8861833105335157, - "grad_norm": 1.103517989651517, + "grad_norm": 1.111188325786725, "learning_rate": 3.162420019872431e-07, - "loss": 0.1733, + "loss": 0.1745, "step": 6478 }, { "epoch": 0.8863201094391245, - "grad_norm": 1.3199548312312055, + "grad_norm": 1.318408670166573, "learning_rate": 3.1549035421545807e-07, - "loss": 0.217, + "loss": 0.2166, "step": 6479 }, { "epoch": 0.8864569083447332, - "grad_norm": 1.0988226998901789, + "grad_norm": 1.0950069766404023, "learning_rate": 3.14739571668799e-07, - "loss": 0.1492, + "loss": 0.147, "step": 6480 }, { "epoch": 0.886593707250342, - "grad_norm": 1.2213422016396576, + "grad_norm": 1.2220148347011495, "learning_rate": 3.139896544859328e-07, - "loss": 0.1622, + "loss": 0.1625, "step": 6481 }, { "epoch": 0.8867305061559507, - "grad_norm": 1.2351971869139624, + "grad_norm": 1.2334765817814335, "learning_rate": 3.132406028053703e-07, - "loss": 0.178, + "loss": 0.1781, "step": 6482 }, { "epoch": 0.8868673050615595, - "grad_norm": 1.6362951969160262, + "grad_norm": 1.627799942984853, "learning_rate": 3.124924167654597e-07, - "loss": 0.1997, + "loss": 0.1974, "step": 6483 }, { "epoch": 0.8870041039671682, - "grad_norm": 1.2853598651027625, + "grad_norm": 1.3281006268596078, "learning_rate": 3.117450965043911e-07, - "loss": 0.1836, + "loss": 0.1845, "step": 6484 }, { "epoch": 0.887140902872777, - "grad_norm": 1.3498052366491977, + "grad_norm": 1.3169515567347951, "learning_rate": 3.109986421601935e-07, - "loss": 0.2456, + "loss": 0.2461, "step": 6485 }, { "epoch": 0.8872777017783857, - "grad_norm": 1.3638938635307256, + "grad_norm": 1.3591173611771308, "learning_rate": 3.1025305387073714e-07, - "loss": 0.2014, + "loss": 0.2006, "step": 6486 }, { "epoch": 0.8874145006839945, - "grad_norm": 1.2329954607046245, + "grad_norm": 1.2350318481018394, "learning_rate": 3.095083317737302e-07, - "loss": 0.1731, + "loss": 0.1729, "step": 6487 }, { "epoch": 0.8875512995896033, - "grad_norm": 1.2676506863138144, + "grad_norm": 1.2812836564426469, "learning_rate": 3.087644760067232e-07, - "loss": 0.1652, + "loss": 0.1655, "step": 6488 }, { "epoch": 0.887688098495212, - "grad_norm": 1.3916566880991204, + "grad_norm": 1.3943147267185596, "learning_rate": 3.0802148670710577e-07, - "loss": 0.1984, + "loss": 0.1997, "step": 6489 }, { "epoch": 0.8878248974008208, - "grad_norm": 1.2285293261838248, + "grad_norm": 1.2235282578737894, "learning_rate": 3.072793640121069e-07, - "loss": 0.1848, + "loss": 0.185, "step": 6490 }, { "epoch": 0.8879616963064295, - "grad_norm": 1.4948024681215148, + "grad_norm": 1.4753919165678189, "learning_rate": 3.065381080587965e-07, - "loss": 0.2252, + "loss": 0.2258, "step": 6491 }, { "epoch": 0.8880984952120383, - "grad_norm": 1.1592346145792833, + "grad_norm": 1.1576625766329283, "learning_rate": 3.057977189840833e-07, - "loss": 0.1582, + "loss": 0.1579, "step": 6492 }, { "epoch": 0.888235294117647, - "grad_norm": 0.9986754976163144, + "grad_norm": 0.9798007601015885, "learning_rate": 3.0505819692471797e-07, - "loss": 0.1522, + "loss": 0.1499, "step": 6493 }, { "epoch": 0.8883720930232558, - "grad_norm": 1.347788398772732, + "grad_norm": 1.3167339103105271, "learning_rate": 3.043195420172879e-07, - "loss": 0.2205, + "loss": 0.2187, "step": 6494 }, { "epoch": 0.8885088919288646, - "grad_norm": 1.6539783657923894, + "grad_norm": 1.6360911157878577, "learning_rate": 3.035817543982228e-07, - "loss": 0.2256, + "loss": 0.2275, "step": 6495 }, { "epoch": 0.8886456908344733, - "grad_norm": 1.28895939889791, + "grad_norm": 1.2733478994971814, "learning_rate": 3.02844834203791e-07, - "loss": 0.1928, + "loss": 0.1929, "step": 6496 }, { "epoch": 0.8887824897400821, - "grad_norm": 1.0784555645543537, + "grad_norm": 1.1104010221320038, "learning_rate": 3.021087815701018e-07, - "loss": 0.153, + "loss": 0.1535, "step": 6497 }, { "epoch": 0.8889192886456908, - "grad_norm": 1.3682087413894832, + "grad_norm": 1.354818394662715, "learning_rate": 3.0137359663310437e-07, - "loss": 0.1713, + "loss": 0.1705, "step": 6498 }, { "epoch": 0.8890560875512996, - "grad_norm": 1.3401899541666111, + "grad_norm": 1.3237771305346306, "learning_rate": 3.006392795285845e-07, - "loss": 0.1781, + "loss": 0.1791, "step": 6499 }, { "epoch": 0.8891928864569083, - "grad_norm": 0.9131487087306801, + "grad_norm": 0.8966803236756434, "learning_rate": 2.99905830392172e-07, - "loss": 0.1494, + "loss": 0.1485, "step": 6500 }, { "epoch": 0.8891928864569083, - "eval_loss": 0.17202956974506378, - "eval_runtime": 5.9251, - "eval_samples_per_second": 5.063, + "eval_loss": 0.17232516407966614, + "eval_runtime": 5.9273, + "eval_samples_per_second": 5.061, "eval_steps_per_second": 1.35, "step": 6500 }, { "epoch": 0.8893296853625171, - "grad_norm": 1.1439580365791595, + "grad_norm": 1.1378258037249478, "learning_rate": 2.99173249359333e-07, - "loss": 0.163, + "loss": 0.1635, "step": 6501 }, { "epoch": 0.8894664842681258, - "grad_norm": 1.3168783881585897, + "grad_norm": 1.3045979206312739, "learning_rate": 2.9844153656537536e-07, - "loss": 0.1897, + "loss": 0.1886, "step": 6502 }, { "epoch": 0.8896032831737346, - "grad_norm": 1.4693681338473974, + "grad_norm": 1.4445642953614293, "learning_rate": 2.977106921454448e-07, - "loss": 0.2131, + "loss": 0.2124, "step": 6503 }, { "epoch": 0.8897400820793434, - "grad_norm": 1.1283785043042505, + "grad_norm": 1.143021006000101, "learning_rate": 2.96980716234529e-07, - "loss": 0.175, + "loss": 0.1763, "step": 6504 }, { "epoch": 0.8898768809849521, - "grad_norm": 1.146355975117592, + "grad_norm": 1.1404734805119514, "learning_rate": 2.962516089674533e-07, - "loss": 0.1629, + "loss": 0.1615, "step": 6505 }, { "epoch": 0.8900136798905609, - "grad_norm": 1.1268769089732813, + "grad_norm": 1.106709644952763, "learning_rate": 2.9552337047888345e-07, "loss": 0.1682, "step": 6506 }, { "epoch": 0.8901504787961696, - "grad_norm": 1.2717068390785138, + "grad_norm": 1.2688364301064088, "learning_rate": 2.947960009033246e-07, - "loss": 0.1598, + "loss": 0.1595, "step": 6507 }, { "epoch": 0.8902872777017784, - "grad_norm": 1.086504632834933, + "grad_norm": 1.0701549663342307, "learning_rate": 2.9406950037511984e-07, - "loss": 0.145, + "loss": 0.1457, "step": 6508 }, { "epoch": 0.8904240766073871, - "grad_norm": 1.220415637432788, + "grad_norm": 1.2002877391091384, "learning_rate": 2.9334386902845513e-07, - "loss": 0.1524, + "loss": 0.1519, "step": 6509 }, { "epoch": 0.8905608755129959, - "grad_norm": 1.2966789509301606, + "grad_norm": 1.284411033871939, "learning_rate": 2.9261910699735384e-07, - "loss": 0.1687, + "loss": 0.1685, "step": 6510 }, { "epoch": 0.8906976744186047, - "grad_norm": 1.1190835321158858, + "grad_norm": 1.086608765299174, "learning_rate": 2.9189521441567725e-07, - "loss": 0.1725, + "loss": 0.1701, "step": 6511 }, { "epoch": 0.8908344733242134, - "grad_norm": 1.2094147440591259, + "grad_norm": 1.186731442223717, "learning_rate": 2.911721914171295e-07, - "loss": 0.1607, + "loss": 0.1602, "step": 6512 }, { "epoch": 0.8909712722298222, - "grad_norm": 1.3277666624498479, + "grad_norm": 1.336544935229294, "learning_rate": 2.904500381352515e-07, - "loss": 0.1757, + "loss": 0.1778, "step": 6513 }, { "epoch": 0.8911080711354309, - "grad_norm": 1.2870821293196038, + "grad_norm": 1.2799426155179374, "learning_rate": 2.89728754703425e-07, "loss": 0.1835, "step": 6514 }, { "epoch": 0.8912448700410397, - "grad_norm": 1.459109957836024, + "grad_norm": 1.4396135522547262, "learning_rate": 2.8900834125486997e-07, - "loss": 0.2386, + "loss": 0.2415, "step": 6515 }, { "epoch": 0.8913816689466484, - "grad_norm": 1.2063318520532917, + "grad_norm": 1.1790499363178748, "learning_rate": 2.8828879792264676e-07, - "loss": 0.1768, + "loss": 0.1766, "step": 6516 }, { "epoch": 0.8915184678522572, - "grad_norm": 1.1660583693254558, + "grad_norm": 1.1726618539363032, "learning_rate": 2.8757012483965286e-07, - "loss": 0.1354, + "loss": 0.1379, "step": 6517 }, { "epoch": 0.8916552667578659, - "grad_norm": 0.9652403906767719, + "grad_norm": 0.9613228713317592, "learning_rate": 2.868523221386288e-07, - "loss": 0.1316, + "loss": 0.133, "step": 6518 }, { "epoch": 0.8917920656634747, - "grad_norm": 1.3007239872028815, + "grad_norm": 1.3001550277018608, "learning_rate": 2.8613538995215016e-07, - "loss": 0.1706, + "loss": 0.171, "step": 6519 }, { "epoch": 0.8919288645690835, - "grad_norm": 1.0947010973202143, + "grad_norm": 1.0865167102205593, "learning_rate": 2.854193284126344e-07, - "loss": 0.1618, + "loss": 0.161, "step": 6520 }, { "epoch": 0.8920656634746922, - "grad_norm": 1.1018523264065532, + "grad_norm": 1.1009437813897522, "learning_rate": 2.8470413765233906e-07, - "loss": 0.1813, + "loss": 0.1792, "step": 6521 }, { "epoch": 0.892202462380301, - "grad_norm": 1.052461973551417, + "grad_norm": 1.0610204245222457, "learning_rate": 2.8398981780335677e-07, - "loss": 0.1424, + "loss": 0.1425, "step": 6522 }, { "epoch": 0.8923392612859097, - "grad_norm": 1.3278600252108375, + "grad_norm": 1.3087700448022663, "learning_rate": 2.832763689976242e-07, - "loss": 0.1962, + "loss": 0.1972, "step": 6523 }, { "epoch": 0.8924760601915185, - "grad_norm": 1.1414439836496808, + "grad_norm": 1.142074783502735, "learning_rate": 2.825637913669121e-07, - "loss": 0.1591, + "loss": 0.159, "step": 6524 }, { "epoch": 0.8926128590971272, - "grad_norm": 1.151246179144753, + "grad_norm": 1.1428783055088658, "learning_rate": 2.8185208504283524e-07, - "loss": 0.1469, + "loss": 0.1472, "step": 6525 }, { "epoch": 0.892749658002736, - "grad_norm": 1.1132419933080586, + "grad_norm": 1.1054784931593358, "learning_rate": 2.811412501568439e-07, - "loss": 0.1647, + "loss": 0.1645, "step": 6526 }, { "epoch": 0.8928864569083448, - "grad_norm": 1.281461537455121, + "grad_norm": 1.2727493568618797, "learning_rate": 2.804312868402298e-07, - "loss": 0.183, + "loss": 0.1828, "step": 6527 }, { "epoch": 0.8930232558139535, - "grad_norm": 1.316320838626682, + "grad_norm": 1.3415129408081847, "learning_rate": 2.7972219522412194e-07, - "loss": 0.1526, + "loss": 0.1564, "step": 6528 }, { "epoch": 0.8931600547195623, - "grad_norm": 1.208976886376714, + "grad_norm": 1.2198502132631388, "learning_rate": 2.7901397543948884e-07, - "loss": 0.1867, + "loss": 0.1862, "step": 6529 }, { "epoch": 0.893296853625171, - "grad_norm": 1.0136164681206699, + "grad_norm": 1.0136173822727597, "learning_rate": 2.783066276171392e-07, - "loss": 0.1339, + "loss": 0.1354, "step": 6530 }, { "epoch": 0.8934336525307798, - "grad_norm": 1.337519383452924, + "grad_norm": 1.3494857351165506, "learning_rate": 2.776001518877175e-07, - "loss": 0.181, + "loss": 0.1814, "step": 6531 }, { "epoch": 0.8935704514363885, - "grad_norm": 1.2676192676245717, + "grad_norm": 1.2540686560823529, "learning_rate": 2.768945483817115e-07, - "loss": 0.1529, + "loss": 0.153, "step": 6532 }, { "epoch": 0.8937072503419973, - "grad_norm": 1.167689063504089, + "grad_norm": 1.1609057271427106, "learning_rate": 2.761898172294436e-07, - "loss": 0.1811, + "loss": 0.182, "step": 6533 }, { "epoch": 0.893844049247606, - "grad_norm": 1.4084991179832966, + "grad_norm": 1.388399371471558, "learning_rate": 2.7548595856107917e-07, - "loss": 0.1855, + "loss": 0.1853, "step": 6534 }, { "epoch": 0.8939808481532148, - "grad_norm": 1.5956289895011122, + "grad_norm": 1.5927621370230844, "learning_rate": 2.747829725066181e-07, - "loss": 0.1918, + "loss": 0.1931, "step": 6535 }, { "epoch": 0.8941176470588236, - "grad_norm": 1.3296179048669707, + "grad_norm": 1.3263516477840862, "learning_rate": 2.7408085919590265e-07, - "loss": 0.198, + "loss": 0.1984, "step": 6536 }, { "epoch": 0.8942544459644323, - "grad_norm": 1.4870554678862116, + "grad_norm": 1.46093441875165, "learning_rate": 2.7337961875861295e-07, - "loss": 0.2037, + "loss": 0.203, "step": 6537 }, { "epoch": 0.8943912448700411, - "grad_norm": 1.2808407477744828, + "grad_norm": 1.289780168237301, "learning_rate": 2.72679251324266e-07, - "loss": 0.1823, + "loss": 0.1826, "step": 6538 }, { "epoch": 0.8945280437756498, - "grad_norm": 1.294396159867778, + "grad_norm": 1.2972734975778182, "learning_rate": 2.719797570222199e-07, - "loss": 0.1746, + "loss": 0.1748, "step": 6539 }, { "epoch": 0.8946648426812586, - "grad_norm": 1.1786432235664632, + "grad_norm": 1.1619621555317643, "learning_rate": 2.712811359816714e-07, - "loss": 0.1639, + "loss": 0.1638, "step": 6540 }, { "epoch": 0.8948016415868673, - "grad_norm": 1.2721513451563558, + "grad_norm": 1.2664965499633734, "learning_rate": 2.70583388331655e-07, - "loss": 0.1931, + "loss": 0.1937, "step": 6541 }, { "epoch": 0.8949384404924761, - "grad_norm": 1.392405799441014, + "grad_norm": 1.380050202177304, "learning_rate": 2.6988651420104316e-07, - "loss": 0.1851, + "loss": 0.1868, "step": 6542 }, { "epoch": 0.8950752393980849, - "grad_norm": 1.1931532061314234, + "grad_norm": 1.1957802238127726, "learning_rate": 2.6919051371854853e-07, - "loss": 0.1617, + "loss": 0.1634, "step": 6543 }, { "epoch": 0.8952120383036936, - "grad_norm": 1.501195546245891, + "grad_norm": 1.4695613498730902, "learning_rate": 2.684953870127227e-07, - "loss": 0.2307, + "loss": 0.231, "step": 6544 }, { "epoch": 0.8953488372093024, - "grad_norm": 1.5325470394467111, + "grad_norm": 1.5146579540518632, "learning_rate": 2.67801134211953e-07, - "loss": 0.1978, + "loss": 0.1983, "step": 6545 }, { "epoch": 0.895485636114911, - "grad_norm": 1.258725668874082, + "grad_norm": 1.2573140694425917, "learning_rate": 2.6710775544446955e-07, - "loss": 0.1989, + "loss": 0.1977, "step": 6546 }, { "epoch": 0.8956224350205199, - "grad_norm": 1.3787115890196764, + "grad_norm": 1.3855262710225122, "learning_rate": 2.664152508383372e-07, - "loss": 0.2058, + "loss": 0.2064, "step": 6547 }, { "epoch": 0.8957592339261286, - "grad_norm": 1.2508552034363145, + "grad_norm": 1.2138104955615736, "learning_rate": 2.657236205214625e-07, - "loss": 0.1887, + "loss": 0.1885, "step": 6548 }, { "epoch": 0.8958960328317374, - "grad_norm": 1.4574003245638307, + "grad_norm": 1.4479145601141696, "learning_rate": 2.650328646215872e-07, - "loss": 0.1547, + "loss": 0.1554, "step": 6549 }, { "epoch": 0.896032831737346, - "grad_norm": 1.3047031390854427, + "grad_norm": 1.2855332491876263, "learning_rate": 2.6434298326629527e-07, - "loss": 0.1612, + "loss": 0.1595, "step": 6550 }, { "epoch": 0.8961696306429549, - "grad_norm": 1.310292908867656, + "grad_norm": 1.2900760964528284, "learning_rate": 2.636539765830054e-07, - "loss": 0.1885, + "loss": 0.1871, "step": 6551 }, { "epoch": 0.8963064295485637, - "grad_norm": 1.4641106100858146, + "grad_norm": 1.454422660645556, "learning_rate": 2.6296584469897745e-07, - "loss": 0.2009, + "loss": 0.2007, "step": 6552 }, { "epoch": 0.8964432284541723, - "grad_norm": 1.0085330855463817, + "grad_norm": 1.0012433128597822, "learning_rate": 2.622785877413092e-07, - "loss": 0.1426, + "loss": 0.1399, "step": 6553 }, { "epoch": 0.8965800273597812, - "grad_norm": 1.266586560305722, + "grad_norm": 1.2840690111228872, "learning_rate": 2.615922058369358e-07, - "loss": 0.1701, + "loss": 0.174, "step": 6554 }, { "epoch": 0.8967168262653898, - "grad_norm": 1.4112465364315705, + "grad_norm": 1.3886087835805965, "learning_rate": 2.609066991126313e-07, - "loss": 0.1977, + "loss": 0.1973, "step": 6555 }, { "epoch": 0.8968536251709986, - "grad_norm": 1.2462710269348791, + "grad_norm": 1.2235849591141612, "learning_rate": 2.6022206769500844e-07, - "loss": 0.1942, + "loss": 0.1925, "step": 6556 }, { "epoch": 0.8969904240766073, - "grad_norm": 1.2096777143369293, + "grad_norm": 1.2041418490975482, "learning_rate": 2.5953831171051937e-07, - "loss": 0.1779, + "loss": 0.176, "step": 6557 }, { "epoch": 0.8971272229822161, - "grad_norm": 1.298093392422079, + "grad_norm": 1.3073969213365344, "learning_rate": 2.588554312854513e-07, - "loss": 0.1576, + "loss": 0.1583, "step": 6558 }, { "epoch": 0.897264021887825, - "grad_norm": 1.49164866085705, + "grad_norm": 1.4870124543831795, "learning_rate": 2.5817342654593235e-07, - "loss": 0.2004, + "loss": 0.2006, "step": 6559 }, { "epoch": 0.8974008207934336, - "grad_norm": 1.3281473392950003, + "grad_norm": 1.316735521039481, "learning_rate": 2.5749229761792947e-07, - "loss": 0.1778, + "loss": 0.1779, "step": 6560 }, { "epoch": 0.8975376196990424, - "grad_norm": 1.1179052723075782, + "grad_norm": 1.1076440979043063, "learning_rate": 2.568120446272449e-07, "loss": 0.1881, "step": 6561 }, { "epoch": 0.8976744186046511, - "grad_norm": 1.3258265964723293, + "grad_norm": 1.3144822232972593, "learning_rate": 2.5613266769952183e-07, "loss": 0.1748, "step": 6562 }, { "epoch": 0.8978112175102599, - "grad_norm": 1.003812793974673, + "grad_norm": 1.0024379858268664, "learning_rate": 2.5545416696024006e-07, - "loss": 0.1475, + "loss": 0.1488, "step": 6563 }, { "epoch": 0.8979480164158686, - "grad_norm": 1.2801710048073465, + "grad_norm": 1.2820747094033949, "learning_rate": 2.5477654253471873e-07, - "loss": 0.1612, + "loss": 0.1604, "step": 6564 }, { "epoch": 0.8980848153214774, - "grad_norm": 0.988823136346243, + "grad_norm": 1.3596524820337093, "learning_rate": 2.540997945481133e-07, - "loss": 0.145, + "loss": 0.1497, "step": 6565 }, { "epoch": 0.8982216142270861, - "grad_norm": 1.2483556204331276, + "grad_norm": 1.239890328514093, "learning_rate": 2.534239231254204e-07, - "loss": 0.1567, + "loss": 0.1554, "step": 6566 }, { "epoch": 0.8983584131326949, - "grad_norm": 1.2980040047037584, + "grad_norm": 1.2968096436116223, "learning_rate": 2.527489283914714e-07, - "loss": 0.1843, + "loss": 0.1838, "step": 6567 }, { "epoch": 0.8984952120383037, - "grad_norm": 1.432050802583, + "grad_norm": 1.4190343203325653, "learning_rate": 2.520748104709375e-07, - "loss": 0.1455, + "loss": 0.1452, "step": 6568 }, { "epoch": 0.8986320109439124, - "grad_norm": 1.1036925553928618, + "grad_norm": 1.0863113738540928, "learning_rate": 2.5140156948832817e-07, - "loss": 0.1818, + "loss": 0.1813, "step": 6569 }, { "epoch": 0.8987688098495212, - "grad_norm": 1.3461464670667591, + "grad_norm": 1.346006370497435, "learning_rate": 2.5072920556799054e-07, - "loss": 0.1431, + "loss": 0.1437, "step": 6570 }, { "epoch": 0.8989056087551299, - "grad_norm": 1.1897689708625052, + "grad_norm": 1.1973714732654952, "learning_rate": 2.500577188341097e-07, - "loss": 0.1675, + "loss": 0.1696, "step": 6571 }, { "epoch": 0.8990424076607387, - "grad_norm": 1.3240103642727612, + "grad_norm": 1.3390147175472364, "learning_rate": 2.493871094107081e-07, - "loss": 0.1813, + "loss": 0.1815, "step": 6572 }, { "epoch": 0.8991792065663474, - "grad_norm": 1.403238058067694, + "grad_norm": 1.4004721015264887, "learning_rate": 2.487173774216484e-07, - "loss": 0.194, + "loss": 0.1929, "step": 6573 }, { "epoch": 0.8993160054719562, - "grad_norm": 1.4624127887428975, + "grad_norm": 1.4775365440154755, "learning_rate": 2.480485229906271e-07, - "loss": 0.2158, + "loss": 0.2161, "step": 6574 }, { "epoch": 0.899452804377565, - "grad_norm": 1.4685132037137028, + "grad_norm": 1.4567468698494248, "learning_rate": 2.4738054624118215e-07, - "loss": 0.2134, + "loss": 0.2132, "step": 6575 }, { "epoch": 0.8995896032831737, - "grad_norm": 1.2654087610750335, + "grad_norm": 1.2560760919485914, "learning_rate": 2.467134472966892e-07, - "loss": 0.1557, + "loss": 0.1535, "step": 6576 }, { "epoch": 0.8997264021887825, - "grad_norm": 1.172521571299731, + "grad_norm": 1.1380673375382075, "learning_rate": 2.460472262803593e-07, - "loss": 0.1721, + "loss": 0.1684, "step": 6577 }, { "epoch": 0.8998632010943912, - "grad_norm": 1.6214993051752324, + "grad_norm": 1.6027532023234146, "learning_rate": 2.453818833152449e-07, - "loss": 0.2195, + "loss": 0.2176, "step": 6578 }, { "epoch": 0.9, - "grad_norm": 1.3272312463373785, + "grad_norm": 1.3123268479397783, "learning_rate": 2.447174185242324e-07, - "loss": 0.1871, + "loss": 0.1908, "step": 6579 }, { "epoch": 0.9001367989056087, - "grad_norm": 1.1359241387310115, + "grad_norm": 1.11417527892663, "learning_rate": 2.4405383203004897e-07, - "loss": 0.154, + "loss": 0.1551, "step": 6580 }, { "epoch": 0.9002735978112175, - "grad_norm": 1.1131209944036304, + "grad_norm": 1.1156157780387088, "learning_rate": 2.4339112395525777e-07, - "loss": 0.1761, + "loss": 0.1755, "step": 6581 }, { "epoch": 0.9004103967168262, - "grad_norm": 1.3562323489427053, + "grad_norm": 1.3492556662571962, "learning_rate": 2.427292944222609e-07, - "loss": 0.202, + "loss": 0.2025, "step": 6582 }, { "epoch": 0.900547195622435, - "grad_norm": 0.9990878563857412, + "grad_norm": 0.9881895148998813, "learning_rate": 2.420683435532989e-07, - "loss": 0.1303, + "loss": 0.1298, "step": 6583 }, { "epoch": 0.9006839945280438, - "grad_norm": 1.2717949066629548, + "grad_norm": 1.244739219617457, "learning_rate": 2.414082714704463e-07, - "loss": 0.1773, + "loss": 0.1764, "step": 6584 }, { "epoch": 0.9008207934336525, - "grad_norm": 1.3100259566609986, + "grad_norm": 1.3207779715847827, "learning_rate": 2.4074907829561955e-07, - "loss": 0.1657, + "loss": 0.1668, "step": 6585 }, { "epoch": 0.9009575923392613, - "grad_norm": 1.3362381677987256, + "grad_norm": 1.3260167946532606, "learning_rate": 2.4009076415057063e-07, - "loss": 0.1757, + "loss": 0.1763, "step": 6586 }, { "epoch": 0.90109439124487, - "grad_norm": 1.3497234645402882, + "grad_norm": 1.3366550245442068, "learning_rate": 2.394333291568901e-07, - "loss": 0.2446, + "loss": 0.243, "step": 6587 }, { "epoch": 0.9012311901504788, - "grad_norm": 1.1441486304419552, + "grad_norm": 1.1549862542348468, "learning_rate": 2.3877677343600525e-07, - "loss": 0.167, + "loss": 0.1673, "step": 6588 }, { "epoch": 0.9013679890560875, - "grad_norm": 1.2208276185124098, + "grad_norm": 1.2015512223481204, "learning_rate": 2.3812109710918241e-07, - "loss": 0.1946, + "loss": 0.1942, "step": 6589 }, { "epoch": 0.9015047879616963, - "grad_norm": 1.3068412286037023, + "grad_norm": 1.3124434925766486, "learning_rate": 2.3746630029752249e-07, - "loss": 0.1956, + "loss": 0.1951, "step": 6590 }, { "epoch": 0.9016415868673051, - "grad_norm": 1.3963170829477531, + "grad_norm": 1.4054962841117442, "learning_rate": 2.3681238312196653e-07, "loss": 0.2118, "step": 6591 }, { "epoch": 0.9017783857729138, - "grad_norm": 1.190438165062434, + "grad_norm": 1.175703378719626, "learning_rate": 2.3615934570329402e-07, - "loss": 0.1792, + "loss": 0.1804, "step": 6592 }, { "epoch": 0.9019151846785226, - "grad_norm": 1.399597014140663, + "grad_norm": 1.3497396034747189, "learning_rate": 2.3550718816211905e-07, - "loss": 0.1834, + "loss": 0.1818, "step": 6593 }, { "epoch": 0.9020519835841313, - "grad_norm": 1.3043227703970028, + "grad_norm": 1.3031740158622425, "learning_rate": 2.3485591061889522e-07, - "loss": 0.1815, + "loss": 0.1833, "step": 6594 }, { "epoch": 0.9021887824897401, - "grad_norm": 1.1485629934057298, + "grad_norm": 1.1412659613322513, "learning_rate": 2.3420551319391195e-07, - "loss": 0.1412, + "loss": 0.1422, "step": 6595 }, { "epoch": 0.9023255813953488, - "grad_norm": 1.0238537214636654, + "grad_norm": 1.0276636255546205, "learning_rate": 2.3355599600729916e-07, - "loss": 0.1483, + "loss": 0.1493, "step": 6596 }, { "epoch": 0.9024623803009576, - "grad_norm": 1.326183458375413, + "grad_norm": 1.3316163986284566, "learning_rate": 2.329073591790193e-07, - "loss": 0.2184, + "loss": 0.22, "step": 6597 }, { "epoch": 0.9025991792065663, - "grad_norm": 1.3374190220811357, + "grad_norm": 1.3131736260962332, "learning_rate": 2.3225960282887706e-07, - "loss": 0.1727, + "loss": 0.1732, "step": 6598 }, { "epoch": 0.9027359781121751, - "grad_norm": 1.1316731137984415, + "grad_norm": 1.1275010178008615, "learning_rate": 2.3161272707651173e-07, - "loss": 0.178, + "loss": 0.1794, "step": 6599 }, { "epoch": 0.9028727770177839, - "grad_norm": 1.3502062626234987, + "grad_norm": 1.3551427646240908, "learning_rate": 2.3096673204140108e-07, - "loss": 0.1831, + "loss": 0.1837, "step": 6600 }, { "epoch": 0.9028727770177839, - "eval_loss": 0.1718291938304901, - "eval_runtime": 5.9405, - "eval_samples_per_second": 5.05, - "eval_steps_per_second": 1.347, + "eval_loss": 0.17214997112751007, + "eval_runtime": 5.934, + "eval_samples_per_second": 5.056, + "eval_steps_per_second": 1.348, "step": 6600 }, { "epoch": 0.9030095759233926, - "grad_norm": 1.1356372258232894, + "grad_norm": 1.1204236685428959, "learning_rate": 2.3032161784286078e-07, - "loss": 0.1636, + "loss": 0.1628, "step": 6601 }, { "epoch": 0.9031463748290014, - "grad_norm": 1.1221400260818608, + "grad_norm": 1.1167063328394464, "learning_rate": 2.2967738460004107e-07, - "loss": 0.1599, + "loss": 0.1602, "step": 6602 }, { "epoch": 0.9032831737346101, - "grad_norm": 1.1437147199841833, + "grad_norm": 1.1353732603628932, "learning_rate": 2.290340324319329e-07, - "loss": 0.1503, + "loss": 0.15, "step": 6603 }, { "epoch": 0.9034199726402189, - "grad_norm": 1.413960462314624, + "grad_norm": 1.388940566299882, "learning_rate": 2.2839156145736173e-07, - "loss": 0.1961, + "loss": 0.1955, "step": 6604 }, { "epoch": 0.9035567715458276, - "grad_norm": 1.3449541580452113, + "grad_norm": 1.3665584641235602, "learning_rate": 2.2774997179499215e-07, - "loss": 0.1756, + "loss": 0.1762, "step": 6605 }, { "epoch": 0.9036935704514364, - "grad_norm": 1.2666912852869967, + "grad_norm": 1.245364271326381, "learning_rate": 2.271092635633243e-07, - "loss": 0.1466, + "loss": 0.1474, "step": 6606 }, { "epoch": 0.9038303693570452, - "grad_norm": 1.6272242379403248, + "grad_norm": 1.6309002201407363, "learning_rate": 2.2646943688069746e-07, - "loss": 0.2344, + "loss": 0.2375, "step": 6607 }, { "epoch": 0.9039671682626539, - "grad_norm": 1.4596601620321357, + "grad_norm": 1.4552953250062104, "learning_rate": 2.2583049186528705e-07, - "loss": 0.2111, + "loss": 0.2123, "step": 6608 }, { "epoch": 0.9041039671682627, - "grad_norm": 1.2628971174372725, + "grad_norm": 1.2325240039785368, "learning_rate": 2.2519242863510537e-07, - "loss": 0.169, + "loss": 0.1695, "step": 6609 }, { "epoch": 0.9042407660738714, - "grad_norm": 1.2210042830226462, + "grad_norm": 1.1868731970165176, "learning_rate": 2.245552473080026e-07, - "loss": 0.1926, + "loss": 0.192, "step": 6610 }, { "epoch": 0.9043775649794802, - "grad_norm": 1.250266483976779, + "grad_norm": 1.2447766861756924, "learning_rate": 2.2391894800166403e-07, - "loss": 0.184, + "loss": 0.1815, "step": 6611 }, { "epoch": 0.9045143638850889, - "grad_norm": 1.4703232038279272, + "grad_norm": 1.465950427766163, "learning_rate": 2.2328353083361565e-07, - "loss": 0.1816, + "loss": 0.1828, "step": 6612 }, { "epoch": 0.9046511627906977, - "grad_norm": 1.329947687193746, + "grad_norm": 1.3350971898366133, "learning_rate": 2.2264899592121747e-07, - "loss": 0.1971, + "loss": 0.201, "step": 6613 }, { "epoch": 0.9047879616963064, - "grad_norm": 1.1807711523505058, + "grad_norm": 1.199421790006426, "learning_rate": 2.2201534338166796e-07, - "loss": 0.1541, + "loss": 0.1547, "step": 6614 }, { "epoch": 0.9049247606019152, - "grad_norm": 1.3496094935049534, + "grad_norm": 1.3447639571367531, "learning_rate": 2.2138257333200185e-07, - "loss": 0.2172, + "loss": 0.218, "step": 6615 }, { "epoch": 0.905061559507524, - "grad_norm": 1.2716885384330703, + "grad_norm": 1.2781499093334914, "learning_rate": 2.2075068588909121e-07, - "loss": 0.1544, + "loss": 0.1551, "step": 6616 }, { "epoch": 0.9051983584131327, - "grad_norm": 1.4217379246115847, + "grad_norm": 1.4302880849212678, "learning_rate": 2.2011968116964656e-07, - "loss": 0.1806, + "loss": 0.1829, "step": 6617 }, { "epoch": 0.9053351573187415, - "grad_norm": 1.180370675387836, + "grad_norm": 1.1776106317243997, "learning_rate": 2.1948955929021186e-07, - "loss": 0.1716, + "loss": 0.1721, "step": 6618 }, { "epoch": 0.9054719562243502, - "grad_norm": 1.5253832064790218, + "grad_norm": 1.4981504000016352, "learning_rate": 2.188603203671713e-07, - "loss": 0.1868, + "loss": 0.1858, "step": 6619 }, { "epoch": 0.905608755129959, - "grad_norm": 1.1939747997851853, + "grad_norm": 1.187424873323709, "learning_rate": 2.1823196451674412e-07, - "loss": 0.1574, + "loss": 0.1556, "step": 6620 }, { "epoch": 0.9057455540355677, - "grad_norm": 1.2505195878290087, + "grad_norm": 1.229227896961087, "learning_rate": 2.1760449185498855e-07, - "loss": 0.1741, + "loss": 0.1746, "step": 6621 }, { "epoch": 0.9058823529411765, - "grad_norm": 1.236668601177416, + "grad_norm": 1.2230178456731415, "learning_rate": 2.1697790249779638e-07, - "loss": 0.1592, + "loss": 0.1594, "step": 6622 }, { "epoch": 0.9060191518467853, - "grad_norm": 1.3556767788166315, + "grad_norm": 1.362250203606649, "learning_rate": 2.163521965608989e-07, - "loss": 0.1673, + "loss": 0.166, "step": 6623 }, { "epoch": 0.906155950752394, - "grad_norm": 1.1943014992058494, + "grad_norm": 1.1908486260397482, "learning_rate": 2.1572737415986422e-07, - "loss": 0.1528, + "loss": 0.1549, "step": 6624 }, { "epoch": 0.9062927496580028, - "grad_norm": 1.0905737229380363, + "grad_norm": 1.0832450660569717, "learning_rate": 2.151034354100956e-07, - "loss": 0.1575, + "loss": 0.1584, "step": 6625 }, { "epoch": 0.9064295485636115, - "grad_norm": 1.2582414141327984, + "grad_norm": 1.2412211108042444, "learning_rate": 2.1448038042683528e-07, - "loss": 0.1707, + "loss": 0.1703, "step": 6626 }, { "epoch": 0.9065663474692203, - "grad_norm": 1.077151342583014, + "grad_norm": 1.0692150148251967, "learning_rate": 2.13858209325159e-07, - "loss": 0.1354, + "loss": 0.1342, "step": 6627 }, { "epoch": 0.906703146374829, - "grad_norm": 0.978190395170064, + "grad_norm": 1.0684753528747257, "learning_rate": 2.132369222199826e-07, - "loss": 0.1467, + "loss": 0.1483, "step": 6628 }, { "epoch": 0.9068399452804378, - "grad_norm": 1.236277890254843, + "grad_norm": 1.226484998853972, "learning_rate": 2.1261651922605653e-07, - "loss": 0.178, + "loss": 0.1776, "step": 6629 }, { "epoch": 0.9069767441860465, - "grad_norm": 1.0787498412384904, + "grad_norm": 1.0713004105574415, "learning_rate": 2.1199700045797077e-07, - "loss": 0.1668, + "loss": 0.1666, "step": 6630 }, { "epoch": 0.9071135430916553, - "grad_norm": 1.163376115663058, + "grad_norm": 1.1488704843121007, "learning_rate": 2.1137836603014715e-07, - "loss": 0.1514, + "loss": 0.152, "step": 6631 }, { "epoch": 0.9072503419972641, - "grad_norm": 1.3120022671714715, + "grad_norm": 1.2919486203036719, "learning_rate": 2.1076061605684817e-07, - "loss": 0.156, + "loss": 0.1569, "step": 6632 }, { "epoch": 0.9073871409028728, - "grad_norm": 1.1107848720036948, + "grad_norm": 1.0964654901331932, "learning_rate": 2.1014375065217251e-07, - "loss": 0.1781, + "loss": 0.179, "step": 6633 }, { "epoch": 0.9075239398084816, - "grad_norm": 1.1048657497690546, + "grad_norm": 1.0975648002539784, "learning_rate": 2.0952776993005354e-07, - "loss": 0.138, + "loss": 0.1365, "step": 6634 }, { "epoch": 0.9076607387140903, - "grad_norm": 1.2955060534977454, + "grad_norm": 1.2818357703897953, "learning_rate": 2.089126740042635e-07, - "loss": 0.171, + "loss": 0.1702, "step": 6635 }, { "epoch": 0.9077975376196991, - "grad_norm": 1.2830431414054873, + "grad_norm": 1.2820440403629467, "learning_rate": 2.0829846298840885e-07, - "loss": 0.1845, + "loss": 0.1868, "step": 6636 }, { "epoch": 0.9079343365253078, - "grad_norm": 1.389124753570074, + "grad_norm": 1.3040406966199802, "learning_rate": 2.0768513699593484e-07, - "loss": 0.1913, + "loss": 0.1903, "step": 6637 }, { "epoch": 0.9080711354309166, - "grad_norm": 1.5895172134083264, + "grad_norm": 1.5579117513213743, "learning_rate": 2.0707269614012148e-07, - "loss": 0.2359, + "loss": 0.2326, "step": 6638 }, { "epoch": 0.9082079343365254, - "grad_norm": 1.1293134881679237, + "grad_norm": 1.118107481725641, "learning_rate": 2.064611405340866e-07, "loss": 0.175, "step": 6639 }, { "epoch": 0.908344733242134, - "grad_norm": 1.2958090122721442, + "grad_norm": 1.3113469833499767, "learning_rate": 2.058504702907843e-07, - "loss": 0.2012, + "loss": 0.2054, "step": 6640 }, { "epoch": 0.9084815321477429, - "grad_norm": 1.221917373985218, + "grad_norm": 1.2123540082343947, "learning_rate": 2.0524068552300436e-07, - "loss": 0.1892, + "loss": 0.19, "step": 6641 }, { "epoch": 0.9086183310533515, - "grad_norm": 1.3162300440856483, + "grad_norm": 1.306960045893431, "learning_rate": 2.0463178634337333e-07, - "loss": 0.1863, + "loss": 0.1881, "step": 6642 }, { "epoch": 0.9087551299589604, - "grad_norm": 1.1830452285080346, + "grad_norm": 1.1965849133749684, "learning_rate": 2.040237728643546e-07, - "loss": 0.1932, + "loss": 0.1943, "step": 6643 }, { "epoch": 0.908891928864569, - "grad_norm": 1.3636355102813775, + "grad_norm": 1.348493083867293, "learning_rate": 2.034166451982489e-07, - "loss": 0.245, + "loss": 0.2461, "step": 6644 }, { "epoch": 0.9090287277701778, - "grad_norm": 1.1629809183654087, + "grad_norm": 1.1375684282596485, "learning_rate": 2.0281040345719038e-07, - "loss": 0.1748, + "loss": 0.1755, "step": 6645 }, { "epoch": 0.9091655266757865, - "grad_norm": 1.4027854071794577, + "grad_norm": 1.3971929583717035, "learning_rate": 2.0220504775315342e-07, - "loss": 0.1844, + "loss": 0.1846, "step": 6646 }, { "epoch": 0.9093023255813953, - "grad_norm": 1.2790423068225594, + "grad_norm": 1.2761763527878105, "learning_rate": 2.0160057819794465e-07, - "loss": 0.1847, + "loss": 0.1856, "step": 6647 }, { "epoch": 0.9094391244870041, - "grad_norm": 1.1290702290088244, + "grad_norm": 1.1160047223958305, "learning_rate": 2.0099699490320977e-07, - "loss": 0.1608, + "loss": 0.1605, "step": 6648 }, { "epoch": 0.9095759233926128, - "grad_norm": 1.2053635311094582, + "grad_norm": 1.248845409649262, "learning_rate": 2.003942979804313e-07, - "loss": 0.184, + "loss": 0.1841, "step": 6649 }, { "epoch": 0.9097127222982216, - "grad_norm": 1.2639624019712685, + "grad_norm": 1.2439846475798289, "learning_rate": 1.9979248754092518e-07, - "loss": 0.1934, + "loss": 0.192, "step": 6650 }, { "epoch": 0.9098495212038303, - "grad_norm": 1.3797559507470016, + "grad_norm": 1.356867150158509, "learning_rate": 1.9919156369584691e-07, - "loss": 0.1467, + "loss": 0.1469, "step": 6651 }, { "epoch": 0.9099863201094391, - "grad_norm": 1.4751370725173123, + "grad_norm": 1.473565572948444, "learning_rate": 1.9859152655618498e-07, - "loss": 0.1857, + "loss": 0.186, "step": 6652 }, { "epoch": 0.9101231190150478, - "grad_norm": 1.3176908275532984, + "grad_norm": 1.31908475035308, "learning_rate": 1.9799237623276736e-07, - "loss": 0.1794, + "loss": 0.1803, "step": 6653 }, { "epoch": 0.9102599179206566, - "grad_norm": 1.4646019344400296, + "grad_norm": 1.4683390970105283, "learning_rate": 1.97394112836255e-07, - "loss": 0.179, + "loss": 0.1799, "step": 6654 }, { "epoch": 0.9103967168262654, - "grad_norm": 1.25081669236559, + "grad_norm": 1.2813310740494992, "learning_rate": 1.967967364771478e-07, - "loss": 0.1883, + "loss": 0.1904, "step": 6655 }, { "epoch": 0.9105335157318741, - "grad_norm": 1.2405042654715994, + "grad_norm": 1.2391776177488647, "learning_rate": 1.962002472657809e-07, - "loss": 0.1832, + "loss": 0.1844, "step": 6656 }, { "epoch": 0.9106703146374829, - "grad_norm": 1.5623472701655934, + "grad_norm": 1.5307883955886459, "learning_rate": 1.9560464531232394e-07, - "loss": 0.2283, + "loss": 0.2241, "step": 6657 }, { "epoch": 0.9108071135430916, - "grad_norm": 1.2328587985760648, + "grad_norm": 1.2198156837251297, "learning_rate": 1.9500993072678554e-07, - "loss": 0.1597, + "loss": 0.1594, "step": 6658 }, { "epoch": 0.9109439124487004, - "grad_norm": 1.4395464964320654, + "grad_norm": 1.4337453771414352, "learning_rate": 1.944161036190084e-07, - "loss": 0.2243, + "loss": 0.2238, "step": 6659 }, { "epoch": 0.9110807113543091, - "grad_norm": 1.1314582436987952, + "grad_norm": 1.1145300466020522, "learning_rate": 1.9382316409867263e-07, - "loss": 0.1788, + "loss": 0.1785, "step": 6660 }, { "epoch": 0.9112175102599179, - "grad_norm": 1.2600905554799848, + "grad_norm": 1.2494869907071713, "learning_rate": 1.9323111227529222e-07, - "loss": 0.1467, + "loss": 0.1461, "step": 6661 }, { "epoch": 0.9113543091655266, - "grad_norm": 1.4209577981795662, + "grad_norm": 1.4180465115364513, "learning_rate": 1.926399482582203e-07, - "loss": 0.2249, + "loss": 0.2257, "step": 6662 }, { "epoch": 0.9114911080711354, - "grad_norm": 1.4562819414311932, + "grad_norm": 1.439544829396661, "learning_rate": 1.920496721566434e-07, - "loss": 0.1661, + "loss": 0.1659, "step": 6663 }, { "epoch": 0.9116279069767442, - "grad_norm": 1.1267827340245964, + "grad_norm": 1.1095481010836978, "learning_rate": 1.9146028407958483e-07, - "loss": 0.1729, + "loss": 0.1715, "step": 6664 }, { "epoch": 0.9117647058823529, - "grad_norm": 1.3222235551952526, + "grad_norm": 1.321603318165845, "learning_rate": 1.908717841359048e-07, - "loss": 0.1843, + "loss": 0.1835, "step": 6665 }, { "epoch": 0.9119015047879617, - "grad_norm": 1.3407352783076043, + "grad_norm": 1.3536886413144937, "learning_rate": 1.9028417243429854e-07, - "loss": 0.1591, + "loss": 0.1608, "step": 6666 }, { "epoch": 0.9120383036935704, - "grad_norm": 1.0045414377060509, + "grad_norm": 0.9968893116328496, "learning_rate": 1.8969744908329756e-07, - "loss": 0.142, + "loss": 0.1416, "step": 6667 }, { "epoch": 0.9121751025991792, - "grad_norm": 1.2470225516423663, + "grad_norm": 1.2363102228125191, "learning_rate": 1.8911161419126855e-07, - "loss": 0.1744, + "loss": 0.1742, "step": 6668 }, { "epoch": 0.9123119015047879, - "grad_norm": 1.2854358455272537, + "grad_norm": 1.27262239136974, "learning_rate": 1.8852666786641548e-07, - "loss": 0.1709, + "loss": 0.1701, "step": 6669 }, { "epoch": 0.9124487004103967, - "grad_norm": 1.277803367820505, + "grad_norm": 1.267969365116534, "learning_rate": 1.8794261021677695e-07, - "loss": 0.1734, + "loss": 0.1737, "step": 6670 }, { "epoch": 0.9125854993160055, - "grad_norm": 1.1620647053638795, + "grad_norm": 1.1544448188322765, "learning_rate": 1.8735944135022776e-07, - "loss": 0.1723, + "loss": 0.1731, "step": 6671 }, { "epoch": 0.9127222982216142, - "grad_norm": 1.2802608873586994, + "grad_norm": 1.2565008275198495, "learning_rate": 1.8677716137447955e-07, - "loss": 0.202, + "loss": 0.2023, "step": 6672 }, { "epoch": 0.912859097127223, - "grad_norm": 1.1935439555059628, + "grad_norm": 1.1609376874954387, "learning_rate": 1.8619577039707848e-07, - "loss": 0.1698, + "loss": 0.1679, "step": 6673 }, { "epoch": 0.9129958960328317, - "grad_norm": 1.0432052634870428, + "grad_norm": 1.0399814609177238, "learning_rate": 1.8561526852540755e-07, - "loss": 0.1114, + "loss": 0.1107, "step": 6674 }, { "epoch": 0.9131326949384405, - "grad_norm": 1.0469918358350465, + "grad_norm": 1.0510660288223277, "learning_rate": 1.8503565586668383e-07, - "loss": 0.1565, + "loss": 0.1577, "step": 6675 }, { "epoch": 0.9132694938440492, - "grad_norm": 1.4113271035689114, + "grad_norm": 1.410885646255033, "learning_rate": 1.8445693252796272e-07, - "loss": 0.2226, + "loss": 0.2217, "step": 6676 }, { "epoch": 0.913406292749658, - "grad_norm": 0.9758712852059582, + "grad_norm": 0.9714212532428023, "learning_rate": 1.8387909861613207e-07, - "loss": 0.0976, + "loss": 0.0987, "step": 6677 }, { "epoch": 0.9135430916552667, - "grad_norm": 1.269144659834294, + "grad_norm": 1.2802568994310273, "learning_rate": 1.8330215423791987e-07, - "loss": 0.208, + "loss": 0.2093, "step": 6678 }, { "epoch": 0.9136798905608755, - "grad_norm": 1.0674705176427794, + "grad_norm": 1.0698759818584307, "learning_rate": 1.8272609949988473e-07, - "loss": 0.15, + "loss": 0.1527, "step": 6679 }, { "epoch": 0.9138166894664843, - "grad_norm": 0.9560054882716683, + "grad_norm": 0.9512392475350826, "learning_rate": 1.8215093450842435e-07, - "loss": 0.1467, + "loss": 0.1439, "step": 6680 }, { "epoch": 0.913953488372093, - "grad_norm": 1.139240368299498, + "grad_norm": 1.1298933281961607, "learning_rate": 1.8157665936977265e-07, - "loss": 0.1478, + "loss": 0.1467, "step": 6681 }, { "epoch": 0.9140902872777018, - "grad_norm": 1.351293419698306, + "grad_norm": 1.3390283388156414, "learning_rate": 1.8100327418999585e-07, - "loss": 0.167, + "loss": 0.1677, "step": 6682 }, { "epoch": 0.9142270861833105, - "grad_norm": 1.3466667762205058, + "grad_norm": 1.3366209759137244, "learning_rate": 1.804307790749993e-07, - "loss": 0.1841, + "loss": 0.1847, "step": 6683 }, { "epoch": 0.9143638850889193, - "grad_norm": 1.2862935917257223, + "grad_norm": 1.284952454519962, "learning_rate": 1.7985917413052057e-07, - "loss": 0.2502, + "loss": 0.2525, "step": 6684 }, { "epoch": 0.914500683994528, - "grad_norm": 1.213928778164143, + "grad_norm": 1.1975691372469428, "learning_rate": 1.792884594621358e-07, - "loss": 0.1878, + "loss": 0.1864, "step": 6685 }, { "epoch": 0.9146374829001368, - "grad_norm": 1.178658735227782, + "grad_norm": 1.1564043245966074, "learning_rate": 1.7871863517525623e-07, - "loss": 0.1798, + "loss": 0.1794, "step": 6686 }, { "epoch": 0.9147742818057456, - "grad_norm": 1.189815542020329, + "grad_norm": 1.1878851339748482, "learning_rate": 1.78149701375126e-07, - "loss": 0.142, + "loss": 0.144, "step": 6687 }, { "epoch": 0.9149110807113543, - "grad_norm": 1.1704013015278598, + "grad_norm": 1.1522477799792523, "learning_rate": 1.7758165816682826e-07, - "loss": 0.2029, + "loss": 0.2016, "step": 6688 }, { "epoch": 0.9150478796169631, - "grad_norm": 1.2395617240795826, + "grad_norm": 1.247732579890899, "learning_rate": 1.7701450565527967e-07, - "loss": 0.1741, + "loss": 0.175, "step": 6689 }, { "epoch": 0.9151846785225718, - "grad_norm": 2.091552960659292, + "grad_norm": 1.177499523353616, "learning_rate": 1.7644824394523308e-07, - "loss": 0.164, + "loss": 0.17, "step": 6690 }, { "epoch": 0.9153214774281806, - "grad_norm": 1.2532740099102677, + "grad_norm": 1.2396368385001864, "learning_rate": 1.7588287314127594e-07, - "loss": 0.1557, + "loss": 0.1546, "step": 6691 }, { "epoch": 0.9154582763337893, - "grad_norm": 1.070866896251328, + "grad_norm": 1.066826474384984, "learning_rate": 1.7531839334783308e-07, - "loss": 0.172, + "loss": 0.1721, "step": 6692 }, { "epoch": 0.9155950752393981, - "grad_norm": 1.1618358322625881, + "grad_norm": 1.1647605759925712, "learning_rate": 1.7475480466916162e-07, - "loss": 0.1832, + "loss": 0.1849, "step": 6693 }, { "epoch": 0.9157318741450068, - "grad_norm": 1.3588445388747166, + "grad_norm": 1.3197469929600447, "learning_rate": 1.7419210720935777e-07, - "loss": 0.2045, + "loss": 0.1991, "step": 6694 }, { "epoch": 0.9158686730506156, - "grad_norm": 1.3015145087604973, + "grad_norm": 1.2629938038921513, "learning_rate": 1.7363030107234946e-07, - "loss": 0.1704, + "loss": 0.1687, "step": 6695 }, { "epoch": 0.9160054719562244, - "grad_norm": 1.278731907651132, + "grad_norm": 1.2563641717241123, "learning_rate": 1.7306938636190263e-07, - "loss": 0.1837, + "loss": 0.1821, "step": 6696 }, { "epoch": 0.9161422708618331, - "grad_norm": 1.352782571561698, + "grad_norm": 1.3567869623802968, "learning_rate": 1.725093631816188e-07, - "loss": 0.184, + "loss": 0.1845, "step": 6697 }, { "epoch": 0.9162790697674419, - "grad_norm": 1.431296554146198, + "grad_norm": 1.4264728760893806, "learning_rate": 1.7195023163493253e-07, - "loss": 0.1822, + "loss": 0.1839, "step": 6698 }, { "epoch": 0.9164158686730506, - "grad_norm": 1.2314138530124685, + "grad_norm": 1.228649487540794, "learning_rate": 1.7139199182511557e-07, - "loss": 0.1832, + "loss": 0.1838, "step": 6699 }, { "epoch": 0.9165526675786594, - "grad_norm": 1.2965246610791947, + "grad_norm": 1.2962912036201868, "learning_rate": 1.7083464385527326e-07, - "loss": 0.1776, + "loss": 0.1809, "step": 6700 }, { "epoch": 0.9165526675786594, - "eval_loss": 0.17162227630615234, - "eval_runtime": 5.9258, - "eval_samples_per_second": 5.063, + "eval_loss": 0.17202959954738617, + "eval_runtime": 5.9274, + "eval_samples_per_second": 5.061, "eval_steps_per_second": 1.35, "step": 6700 }, { "epoch": 0.9166894664842681, - "grad_norm": 1.1004437220349605, + "grad_norm": 1.096194951697977, "learning_rate": 1.702781878283488e-07, - "loss": 0.1938, + "loss": 0.193, "step": 6701 }, { "epoch": 0.9168262653898769, - "grad_norm": 1.1247029465563236, + "grad_norm": 1.1402370612635802, "learning_rate": 1.6972262384711834e-07, - "loss": 0.1494, + "loss": 0.151, "step": 6702 }, { "epoch": 0.9169630642954857, - "grad_norm": 1.458455377141889, + "grad_norm": 1.4381299552072582, "learning_rate": 1.6916795201419366e-07, - "loss": 0.1976, + "loss": 0.1975, "step": 6703 }, { "epoch": 0.9170998632010944, - "grad_norm": 1.0785140094708316, + "grad_norm": 1.0668607995787396, "learning_rate": 1.686141724320245e-07, - "loss": 0.1455, + "loss": 0.1466, "step": 6704 }, { "epoch": 0.9172366621067032, - "grad_norm": 1.374967495601841, + "grad_norm": 1.3724216083084835, "learning_rate": 1.6806128520289066e-07, - "loss": 0.2068, + "loss": 0.2076, "step": 6705 }, { "epoch": 0.9173734610123119, - "grad_norm": 1.2446952464793988, + "grad_norm": 1.2220898934787294, "learning_rate": 1.675092904289116e-07, - "loss": 0.1765, + "loss": 0.1761, "step": 6706 }, { "epoch": 0.9175102599179207, - "grad_norm": 1.1372982634050164, + "grad_norm": 1.135028511334228, "learning_rate": 1.6695818821203913e-07, - "loss": 0.183, + "loss": 0.1829, "step": 6707 }, { "epoch": 0.9176470588235294, - "grad_norm": 1.1572915432030138, + "grad_norm": 1.1355807998978151, "learning_rate": 1.664079786540629e-07, - "loss": 0.1451, + "loss": 0.1446, "step": 6708 }, { "epoch": 0.9177838577291382, - "grad_norm": 1.2245375091155621, + "grad_norm": 1.2043324710767034, "learning_rate": 1.6585866185660494e-07, - "loss": 0.1854, + "loss": 0.1858, "step": 6709 }, { "epoch": 0.9179206566347469, - "grad_norm": 0.9302722916215579, + "grad_norm": 0.9154465136268821, "learning_rate": 1.6531023792112467e-07, - "loss": 0.1481, + "loss": 0.1472, "step": 6710 }, { "epoch": 0.9180574555403557, - "grad_norm": 1.2979301896428768, + "grad_norm": 1.2860178741631083, "learning_rate": 1.6476270694891383e-07, - "loss": 0.1651, + "loss": 0.1655, "step": 6711 }, { "epoch": 0.9181942544459645, - "grad_norm": 1.4171229788914268, + "grad_norm": 1.3971577967565776, "learning_rate": 1.6421606904110265e-07, - "loss": 0.1751, + "loss": 0.1741, "step": 6712 }, { "epoch": 0.9183310533515732, - "grad_norm": 1.1260306078747064, + "grad_norm": 1.127862878019238, "learning_rate": 1.6367032429865425e-07, - "loss": 0.15, + "loss": 0.1522, "step": 6713 }, { "epoch": 0.918467852257182, - "grad_norm": 1.120104765308721, + "grad_norm": 1.1125876731863193, "learning_rate": 1.6312547282236636e-07, - "loss": 0.13, + "loss": 0.131, "step": 6714 }, { "epoch": 0.9186046511627907, - "grad_norm": 1.3669001863303964, + "grad_norm": 1.3451581860539152, "learning_rate": 1.6258151471287397e-07, - "loss": 0.1834, + "loss": 0.1812, "step": 6715 }, { "epoch": 0.9187414500683995, - "grad_norm": 1.453348623118948, + "grad_norm": 1.4327632242358213, "learning_rate": 1.6203845007064456e-07, - "loss": 0.2018, + "loss": 0.2025, "step": 6716 }, { "epoch": 0.9188782489740082, - "grad_norm": 1.2064140556460885, + "grad_norm": 1.1839578655311283, "learning_rate": 1.6149627899598285e-07, - "loss": 0.1636, + "loss": 0.1621, "step": 6717 }, { "epoch": 0.919015047879617, - "grad_norm": 1.034063308887074, + "grad_norm": 1.0308592213726695, "learning_rate": 1.60955001589026e-07, - "loss": 0.1707, + "loss": 0.1719, "step": 6718 }, { "epoch": 0.9191518467852258, - "grad_norm": 1.2204277936074786, + "grad_norm": 1.2098518049097011, "learning_rate": 1.6041461794974844e-07, - "loss": 0.1742, + "loss": 0.1727, "step": 6719 }, { "epoch": 0.9192886456908345, - "grad_norm": 1.2155963926272504, + "grad_norm": 1.2078896195370539, "learning_rate": 1.5987512817795925e-07, - "loss": 0.1837, + "loss": 0.1859, "step": 6720 }, { "epoch": 0.9194254445964433, - "grad_norm": 1.201221468336731, + "grad_norm": 1.1992427442337226, "learning_rate": 1.5933653237329982e-07, - "loss": 0.1511, + "loss": 0.1498, "step": 6721 }, { "epoch": 0.919562243502052, - "grad_norm": 1.3810700193573828, + "grad_norm": 1.3690713650483481, "learning_rate": 1.5879883063525058e-07, - "loss": 0.1956, + "loss": 0.1943, "step": 6722 }, { "epoch": 0.9196990424076608, - "grad_norm": 1.0138738752017644, + "grad_norm": 1.0126675474297542, "learning_rate": 1.5826202306312266e-07, - "loss": 0.1481, + "loss": 0.1484, "step": 6723 }, { "epoch": 0.9198358413132695, - "grad_norm": 1.3413189754812407, + "grad_norm": 1.3197842297518592, "learning_rate": 1.5772610975606562e-07, - "loss": 0.1642, + "loss": 0.1633, "step": 6724 }, { "epoch": 0.9199726402188783, - "grad_norm": 1.25371592148559, + "grad_norm": 1.2492732042855539, "learning_rate": 1.5719109081306029e-07, - "loss": 0.1839, + "loss": 0.1856, "step": 6725 }, { "epoch": 0.920109439124487, - "grad_norm": 1.083705284938279, + "grad_norm": 1.0767939954724035, "learning_rate": 1.566569663329265e-07, - "loss": 0.1543, + "loss": 0.1532, "step": 6726 }, { "epoch": 0.9202462380300958, - "grad_norm": 1.3177041183360994, + "grad_norm": 1.3166220808091587, "learning_rate": 1.5612373641431423e-07, - "loss": 0.171, + "loss": 0.1732, "step": 6727 }, { "epoch": 0.9203830369357046, - "grad_norm": 1.2220425706696534, + "grad_norm": 1.2134196100477372, "learning_rate": 1.5559140115571248e-07, - "loss": 0.1605, + "loss": 0.1616, "step": 6728 }, { "epoch": 0.9205198358413132, - "grad_norm": 1.1930147432120233, + "grad_norm": 1.1820730109163395, "learning_rate": 1.550599606554426e-07, - "loss": 0.1624, + "loss": 0.1621, "step": 6729 }, { "epoch": 0.920656634746922, - "grad_norm": 1.4034944767598907, + "grad_norm": 1.3766927403101583, "learning_rate": 1.5452941501166107e-07, - "loss": 0.1726, + "loss": 0.1723, "step": 6730 }, { "epoch": 0.9207934336525307, - "grad_norm": 1.2952032842693293, + "grad_norm": 1.297570007851011, "learning_rate": 1.5399976432235898e-07, - "loss": 0.1852, + "loss": 0.1846, "step": 6731 }, { "epoch": 0.9209302325581395, - "grad_norm": 1.4521973579872642, + "grad_norm": 1.4362747972421066, "learning_rate": 1.5347100868536246e-07, - "loss": 0.2016, + "loss": 0.2009, "step": 6732 }, { "epoch": 0.9210670314637482, - "grad_norm": 1.2451000871231135, + "grad_norm": 1.237760060026598, "learning_rate": 1.529431481983329e-07, - "loss": 0.2062, + "loss": 0.2067, "step": 6733 }, { "epoch": 0.921203830369357, - "grad_norm": 1.1678513189215325, + "grad_norm": 1.1576154925773505, "learning_rate": 1.5241618295876504e-07, - "loss": 0.1549, + "loss": 0.154, "step": 6734 }, { "epoch": 0.9213406292749658, - "grad_norm": 1.387559288657592, + "grad_norm": 1.3917204659846614, "learning_rate": 1.5189011306398937e-07, - "loss": 0.1828, + "loss": 0.1824, "step": 6735 }, { "epoch": 0.9214774281805745, - "grad_norm": 1.2473649284566097, + "grad_norm": 1.2286158938158174, "learning_rate": 1.5136493861117095e-07, - "loss": 0.154, + "loss": 0.1536, "step": 6736 }, { "epoch": 0.9216142270861833, - "grad_norm": 1.2944402698236452, + "grad_norm": 1.299381438880411, "learning_rate": 1.5084065969730778e-07, - "loss": 0.1854, + "loss": 0.1845, "step": 6737 }, { "epoch": 0.921751025991792, - "grad_norm": 1.2142366325631972, + "grad_norm": 1.2027501463138115, "learning_rate": 1.5031727641923454e-07, - "loss": 0.1621, + "loss": 0.1614, "step": 6738 }, { "epoch": 0.9218878248974008, - "grad_norm": 1.4734567675705348, + "grad_norm": 1.4660315402587007, "learning_rate": 1.4979478887361954e-07, - "loss": 0.2196, + "loss": 0.2183, "step": 6739 }, { "epoch": 0.9220246238030095, - "grad_norm": 1.3569375132158599, + "grad_norm": 1.345025552245264, "learning_rate": 1.492731971569661e-07, - "loss": 0.1849, + "loss": 0.1858, "step": 6740 }, { "epoch": 0.9221614227086183, - "grad_norm": 1.2453860534576908, + "grad_norm": 1.248187379882698, "learning_rate": 1.4875250136561158e-07, - "loss": 0.1741, + "loss": 0.1756, "step": 6741 }, { "epoch": 0.922298221614227, - "grad_norm": 1.305753376368068, + "grad_norm": 1.2806328598598984, "learning_rate": 1.48232701595728e-07, - "loss": 0.1577, + "loss": 0.1569, "step": 6742 }, { "epoch": 0.9224350205198358, - "grad_norm": 1.0169939894160862, + "grad_norm": 1.0031060243425358, "learning_rate": 1.4771379794332185e-07, - "loss": 0.1457, + "loss": 0.146, "step": 6743 }, { "epoch": 0.9225718194254446, - "grad_norm": 1.114070867039981, + "grad_norm": 1.1242324654165101, "learning_rate": 1.4719579050423426e-07, - "loss": 0.2034, + "loss": 0.2059, "step": 6744 }, { "epoch": 0.9227086183310533, - "grad_norm": 1.0864473084524147, + "grad_norm": 1.0657328858974249, "learning_rate": 1.4667867937414092e-07, - "loss": 0.1812, + "loss": 0.1794, "step": 6745 }, { "epoch": 0.9228454172366621, - "grad_norm": 1.2712063564998999, + "grad_norm": 1.2597701324143227, "learning_rate": 1.461624646485521e-07, - "loss": 0.1715, + "loss": 0.1712, "step": 6746 }, { "epoch": 0.9229822161422708, - "grad_norm": 1.3317916134479353, + "grad_norm": 1.332190675027402, "learning_rate": 1.4564714642281207e-07, - "loss": 0.1899, + "loss": 0.1902, "step": 6747 }, { "epoch": 0.9231190150478796, - "grad_norm": 1.2419043313892655, + "grad_norm": 1.2309126576361682, "learning_rate": 1.4513272479209917e-07, - "loss": 0.1613, + "loss": 0.1621, "step": 6748 }, { "epoch": 0.9232558139534883, - "grad_norm": 1.1082206589645838, + "grad_norm": 1.1139813358121924, "learning_rate": 1.4461919985142737e-07, - "loss": 0.1664, + "loss": 0.1682, "step": 6749 }, { "epoch": 0.9233926128590971, - "grad_norm": 1.2754938730071643, + "grad_norm": 1.2615978912462322, "learning_rate": 1.4410657169564414e-07, - "loss": 0.2015, + "loss": 0.2021, "step": 6750 }, { "epoch": 0.9235294117647059, - "grad_norm": 1.0445864497943416, + "grad_norm": 1.0449760094930156, "learning_rate": 1.435948404194304e-07, - "loss": 0.1547, + "loss": 0.1557, "step": 6751 }, { "epoch": 0.9236662106703146, - "grad_norm": 1.3959266208708174, + "grad_norm": 1.3705197397613433, "learning_rate": 1.43084006117305e-07, - "loss": 0.1801, + "loss": 0.1782, "step": 6752 }, { "epoch": 0.9238030095759234, - "grad_norm": 1.32301017572352, + "grad_norm": 1.5279515581378573, "learning_rate": 1.425740688836158e-07, - "loss": 0.2039, + "loss": 0.2043, "step": 6753 }, { "epoch": 0.9239398084815321, - "grad_norm": 1.3328089787788324, + "grad_norm": 1.325669571125469, "learning_rate": 1.420650288125497e-07, - "loss": 0.1812, + "loss": 0.1815, "step": 6754 }, { "epoch": 0.9240766073871409, - "grad_norm": 1.090904314735331, + "grad_norm": 1.1047540450765378, "learning_rate": 1.4155688599812479e-07, - "loss": 0.1603, + "loss": 0.1621, "step": 6755 }, { "epoch": 0.9242134062927496, - "grad_norm": 1.3490611951619857, + "grad_norm": 1.3501403722776923, "learning_rate": 1.4104964053419602e-07, - "loss": 0.1833, + "loss": 0.1844, "step": 6756 }, { "epoch": 0.9243502051983584, - "grad_norm": 1.2438039248027914, + "grad_norm": 1.2337574909129772, "learning_rate": 1.4054329251444953e-07, - "loss": 0.1752, + "loss": 0.1741, "step": 6757 }, { "epoch": 0.9244870041039671, - "grad_norm": 1.300464552371146, + "grad_norm": 1.2950082955671995, "learning_rate": 1.400378420324089e-07, - "loss": 0.1754, + "loss": 0.1767, "step": 6758 }, { "epoch": 0.9246238030095759, - "grad_norm": 1.2371958484175425, + "grad_norm": 1.229622505773897, "learning_rate": 1.395332891814294e-07, - "loss": 0.1941, + "loss": 0.1955, "step": 6759 }, { "epoch": 0.9247606019151847, - "grad_norm": 1.434166308916467, + "grad_norm": 1.4309520308823354, "learning_rate": 1.3902963405470148e-07, - "loss": 0.1803, + "loss": 0.1804, "step": 6760 }, { "epoch": 0.9248974008207934, - "grad_norm": 1.3854059302329496, + "grad_norm": 1.384936932726972, "learning_rate": 1.3852687674525078e-07, - "loss": 0.1781, + "loss": 0.1801, "step": 6761 }, { "epoch": 0.9250341997264022, - "grad_norm": 1.3177719708672315, + "grad_norm": 1.2915413278796297, "learning_rate": 1.3802501734593522e-07, - "loss": 0.1805, + "loss": 0.1782, "step": 6762 }, { "epoch": 0.9251709986320109, - "grad_norm": 1.3981712562463384, + "grad_norm": 1.395249555010031, "learning_rate": 1.3752405594944894e-07, - "loss": 0.1977, + "loss": 0.1994, "step": 6763 }, { "epoch": 0.9253077975376197, - "grad_norm": 1.369441109416656, + "grad_norm": 1.375503909018513, "learning_rate": 1.37023992648318e-07, - "loss": 0.1951, + "loss": 0.1967, "step": 6764 }, { "epoch": 0.9254445964432284, - "grad_norm": 1.213249322276995, + "grad_norm": 1.1913356671830084, "learning_rate": 1.3652482753490403e-07, - "loss": 0.17, + "loss": 0.1696, "step": 6765 }, { "epoch": 0.9255813953488372, - "grad_norm": 1.4526386508467122, + "grad_norm": 1.4518285623590688, "learning_rate": 1.3602656070140275e-07, - "loss": 0.1792, + "loss": 0.179, "step": 6766 }, { "epoch": 0.925718194254446, - "grad_norm": 1.4577288228828715, + "grad_norm": 1.4604142043849302, "learning_rate": 1.3552919223984273e-07, - "loss": 0.1963, + "loss": 0.1969, "step": 6767 }, { "epoch": 0.9258549931600547, - "grad_norm": 1.2012200463998914, + "grad_norm": 1.1875680564886362, "learning_rate": 1.3503272224208886e-07, - "loss": 0.1567, + "loss": 0.1564, "step": 6768 }, { "epoch": 0.9259917920656635, - "grad_norm": 1.22584620433506, + "grad_norm": 1.2223396388095824, "learning_rate": 1.3453715079983775e-07, - "loss": 0.1517, + "loss": 0.1523, "step": 6769 }, { "epoch": 0.9261285909712722, - "grad_norm": 1.0928845587194567, + "grad_norm": 1.0947209234595405, "learning_rate": 1.3404247800462177e-07, - "loss": 0.1627, + "loss": 0.1638, "step": 6770 }, { "epoch": 0.926265389876881, - "grad_norm": 1.2843148877322592, + "grad_norm": 1.2792919159319587, "learning_rate": 1.335487039478056e-07, - "loss": 0.1571, + "loss": 0.1573, "step": 6771 }, { "epoch": 0.9264021887824897, - "grad_norm": 1.3453949054375771, + "grad_norm": 1.3441769029473494, "learning_rate": 1.3305582872058965e-07, - "loss": 0.1983, + "loss": 0.201, "step": 6772 }, { "epoch": 0.9265389876880985, - "grad_norm": 1.3035811220811413, + "grad_norm": 1.292597418270409, "learning_rate": 1.3256385241400717e-07, - "loss": 0.1921, + "loss": 0.1929, "step": 6773 }, { "epoch": 0.9266757865937072, - "grad_norm": 1.1099238578022181, + "grad_norm": 1.1093307001337658, "learning_rate": 1.3207277511892602e-07, - "loss": 0.1399, + "loss": 0.1397, "step": 6774 }, { "epoch": 0.926812585499316, - "grad_norm": 1.2196002950180156, + "grad_norm": 1.2144871361801244, "learning_rate": 1.3158259692604814e-07, - "loss": 0.1682, + "loss": 0.1689, "step": 6775 }, { "epoch": 0.9269493844049248, - "grad_norm": 1.3768896048656272, + "grad_norm": 1.3800232160040007, "learning_rate": 1.3109331792590774e-07, - "loss": 0.2139, + "loss": 0.2155, "step": 6776 }, { "epoch": 0.9270861833105335, - "grad_norm": 1.1569361027612661, + "grad_norm": 1.1660605626751983, "learning_rate": 1.3060493820887643e-07, - "loss": 0.198, + "loss": 0.2013, "step": 6777 }, { "epoch": 0.9272229822161423, - "grad_norm": 1.419793668708625, + "grad_norm": 1.4029237533495815, "learning_rate": 1.301174578651554e-07, - "loss": 0.1802, + "loss": 0.1792, "step": 6778 }, { "epoch": 0.927359781121751, - "grad_norm": 1.4316789529041973, + "grad_norm": 1.4333847716372463, "learning_rate": 1.2963087698478317e-07, - "loss": 0.2146, + "loss": 0.2145, "step": 6779 }, { "epoch": 0.9274965800273598, - "grad_norm": 1.132765923923549, + "grad_norm": 1.1319872607264503, "learning_rate": 1.291451956576306e-07, - "loss": 0.183, + "loss": 0.1824, "step": 6780 }, { "epoch": 0.9276333789329685, - "grad_norm": 1.1659446042402721, + "grad_norm": 1.1639870095687246, "learning_rate": 1.2866041397340211e-07, - "loss": 0.1761, + "loss": 0.1771, "step": 6781 }, { "epoch": 0.9277701778385773, - "grad_norm": 1.2269911318535753, + "grad_norm": 1.2219656192864004, "learning_rate": 1.281765320216366e-07, "loss": 0.1475, "step": 6782 }, { "epoch": 0.9279069767441861, - "grad_norm": 1.45145390651349, + "grad_norm": 1.457381265889995, "learning_rate": 1.27693549891707e-07, - "loss": 0.1729, + "loss": 0.1731, "step": 6783 }, { "epoch": 0.9280437756497948, - "grad_norm": 1.1495402986683172, + "grad_norm": 1.1401366099551062, "learning_rate": 1.2721146767282033e-07, - "loss": 0.1686, + "loss": 0.1694, "step": 6784 }, { "epoch": 0.9281805745554036, - "grad_norm": 1.4309631954695214, + "grad_norm": 1.405253125081689, "learning_rate": 1.2673028545401532e-07, - "loss": 0.2019, + "loss": 0.2039, "step": 6785 }, { "epoch": 0.9283173734610123, - "grad_norm": 1.176438636972183, + "grad_norm": 1.1667568749396813, "learning_rate": 1.262500033241676e-07, - "loss": 0.1534, + "loss": 0.1538, "step": 6786 }, { "epoch": 0.9284541723666211, - "grad_norm": 1.1178510619143804, + "grad_norm": 1.111609913649867, "learning_rate": 1.2577062137198283e-07, - "loss": 0.1791, + "loss": 0.1816, "step": 6787 }, { "epoch": 0.9285909712722298, - "grad_norm": 1.4006184346704338, + "grad_norm": 1.4005338113615715, "learning_rate": 1.2529213968600408e-07, - "loss": 0.2452, + "loss": 0.2459, "step": 6788 }, { "epoch": 0.9287277701778386, - "grad_norm": 1.2124544462040039, + "grad_norm": 1.2042252053490894, "learning_rate": 1.248145583546062e-07, - "loss": 0.1619, + "loss": 0.1635, "step": 6789 }, { "epoch": 0.9288645690834473, - "grad_norm": 1.379628564209193, + "grad_norm": 1.3794439173976791, "learning_rate": 1.2433787746599757e-07, - "loss": 0.1892, + "loss": 0.19, "step": 6790 }, { "epoch": 0.9290013679890561, - "grad_norm": 1.3690503024570966, + "grad_norm": 1.3350403225077336, "learning_rate": 1.2386209710822162e-07, - "loss": 0.1918, + "loss": 0.1905, "step": 6791 }, { "epoch": 0.9291381668946649, - "grad_norm": 1.090997159096604, + "grad_norm": 1.0882176386581668, "learning_rate": 1.233872173691536e-07, - "loss": 0.1463, + "loss": 0.1455, "step": 6792 }, { "epoch": 0.9292749658002736, - "grad_norm": 1.2770569064535158, + "grad_norm": 1.274355700036842, "learning_rate": 1.2291323833650448e-07, "loss": 0.2086, "step": 6793 }, { "epoch": 0.9294117647058824, - "grad_norm": 1.1752679300089668, + "grad_norm": 1.1460690984829254, "learning_rate": 1.22440160097817e-07, - "loss": 0.1479, + "loss": 0.1457, "step": 6794 }, { "epoch": 0.9295485636114911, - "grad_norm": 1.3060573350490043, + "grad_norm": 1.2784995554249035, "learning_rate": 1.219679827404685e-07, - "loss": 0.2012, + "loss": 0.1998, "step": 6795 }, { "epoch": 0.9296853625170999, - "grad_norm": 1.286646510704948, + "grad_norm": 1.270697374188101, "learning_rate": 1.2149670635166977e-07, - "loss": 0.178, + "loss": 0.1799, "step": 6796 }, { "epoch": 0.9298221614227086, - "grad_norm": 1.5251821159058605, + "grad_norm": 1.5137236698908707, "learning_rate": 1.210263310184656e-07, - "loss": 0.1973, + "loss": 0.1962, "step": 6797 }, { "epoch": 0.9299589603283174, - "grad_norm": 1.5990263448896311, + "grad_norm": 1.5958994597820373, "learning_rate": 1.2055685682773321e-07, - "loss": 0.2177, + "loss": 0.2226, "step": 6798 }, { "epoch": 0.9300957592339262, - "grad_norm": 1.4660522891936159, + "grad_norm": 1.4276218249619206, "learning_rate": 1.200882838661843e-07, - "loss": 0.2155, + "loss": 0.213, "step": 6799 }, { "epoch": 0.9302325581395349, - "grad_norm": 1.1388515583361702, + "grad_norm": 1.1275204017383962, "learning_rate": 1.196206122203647e-07, - "loss": 0.1518, + "loss": 0.1512, "step": 6800 }, { "epoch": 0.9302325581395349, - "eval_loss": 0.17175297439098358, - "eval_runtime": 5.9431, - "eval_samples_per_second": 5.048, - "eval_steps_per_second": 1.346, + "eval_loss": 0.1719616949558258, + "eval_runtime": 5.93, + "eval_samples_per_second": 5.059, + "eval_steps_per_second": 1.349, "step": 6800 }, { "epoch": 0.9303693570451437, - "grad_norm": 1.2898419770092484, + "grad_norm": 1.272045650626294, "learning_rate": 1.1915384197665136e-07, - "loss": 0.1838, + "loss": 0.1846, "step": 6801 }, { "epoch": 0.9305061559507524, - "grad_norm": 1.1605620401347445, + "grad_norm": 1.1373831688695863, "learning_rate": 1.186879732212587e-07, - "loss": 0.1587, + "loss": 0.1566, "step": 6802 }, { "epoch": 0.9306429548563612, - "grad_norm": 1.5030285290798873, + "grad_norm": 1.505800779691173, "learning_rate": 1.1822300604022952e-07, - "loss": 0.2084, + "loss": 0.2085, "step": 6803 }, { "epoch": 0.9307797537619699, - "grad_norm": 1.1944309980684555, + "grad_norm": 1.189702374057672, "learning_rate": 1.1775894051944514e-07, - "loss": 0.1678, + "loss": 0.1689, "step": 6804 }, { "epoch": 0.9309165526675787, - "grad_norm": 1.398047439587484, + "grad_norm": 1.3883445731537514, "learning_rate": 1.17295776744617e-07, - "loss": 0.1717, + "loss": 0.1716, "step": 6805 }, { "epoch": 0.9310533515731874, - "grad_norm": 1.276355037481626, + "grad_norm": 1.2640550361575245, "learning_rate": 1.1683351480129168e-07, - "loss": 0.1631, + "loss": 0.1636, "step": 6806 }, { "epoch": 0.9311901504787962, - "grad_norm": 1.088537132347951, + "grad_norm": 1.075461578832932, "learning_rate": 1.1637215477484754e-07, - "loss": 0.1804, + "loss": 0.1798, "step": 6807 }, { "epoch": 0.931326949384405, - "grad_norm": 1.3032336739929147, + "grad_norm": 1.292325974443617, "learning_rate": 1.1591169675049862e-07, - "loss": 0.1575, + "loss": 0.158, "step": 6808 }, { "epoch": 0.9314637482900137, - "grad_norm": 1.2051146826025139, + "grad_norm": 1.210783104885042, "learning_rate": 1.1545214081329082e-07, - "loss": 0.1774, + "loss": 0.1785, "step": 6809 }, { "epoch": 0.9316005471956225, - "grad_norm": 1.2342921698043943, + "grad_norm": 1.2285176937833366, "learning_rate": 1.1499348704810286e-07, - "loss": 0.186, + "loss": 0.1856, "step": 6810 }, { "epoch": 0.9317373461012312, - "grad_norm": 1.432459901556906, + "grad_norm": 1.426136215516678, "learning_rate": 1.1453573553964925e-07, - "loss": 0.2104, + "loss": 0.2122, "step": 6811 }, { "epoch": 0.93187414500684, - "grad_norm": 1.388316079686815, + "grad_norm": 1.373474163149315, "learning_rate": 1.1407888637247511e-07, - "loss": 0.2225, + "loss": 0.2206, "step": 6812 }, { "epoch": 0.9320109439124487, - "grad_norm": 1.4659469901473217, + "grad_norm": 1.4441499259145916, "learning_rate": 1.1362293963096072e-07, - "loss": 0.1896, + "loss": 0.189, "step": 6813 }, { "epoch": 0.9321477428180575, - "grad_norm": 1.1260238521321502, + "grad_norm": 1.126636056815698, "learning_rate": 1.1316789539931816e-07, - "loss": 0.1671, + "loss": 0.1681, "step": 6814 }, { "epoch": 0.9322845417236663, - "grad_norm": 1.1180813730451864, + "grad_norm": 1.1069380778145501, "learning_rate": 1.1271375376159466e-07, - "loss": 0.1739, + "loss": 0.1744, "step": 6815 }, { "epoch": 0.932421340629275, - "grad_norm": 1.3433323941689566, + "grad_norm": 1.3230994890043293, "learning_rate": 1.1226051480167032e-07, - "loss": 0.2105, + "loss": 0.2108, "step": 6816 }, { "epoch": 0.9325581395348838, - "grad_norm": 1.301784010624131, + "grad_norm": 1.3140116259189383, "learning_rate": 1.1180817860325599e-07, - "loss": 0.1758, + "loss": 0.1755, "step": 6817 }, { "epoch": 0.9326949384404924, - "grad_norm": 0.9860354992003708, + "grad_norm": 0.9758120261036245, "learning_rate": 1.1135674524989981e-07, - "loss": 0.1399, + "loss": 0.14, "step": 6818 }, { "epoch": 0.9328317373461013, - "grad_norm": 1.281510129227907, + "grad_norm": 1.272965078606112, "learning_rate": 1.109062148249801e-07, - "loss": 0.1911, + "loss": 0.1908, "step": 6819 }, { "epoch": 0.93296853625171, - "grad_norm": 1.1867152046690321, + "grad_norm": 1.1878861265207452, "learning_rate": 1.1045658741171028e-07, - "loss": 0.1537, + "loss": 0.1527, "step": 6820 }, { "epoch": 0.9331053351573187, - "grad_norm": 1.4461733578705185, + "grad_norm": 1.4133724748954202, "learning_rate": 1.1000786309313505e-07, - "loss": 0.2122, + "loss": 0.2106, "step": 6821 }, { "epoch": 0.9332421340629274, - "grad_norm": 0.9507167756972169, + "grad_norm": 0.9356448588058535, "learning_rate": 1.0956004195213477e-07, - "loss": 0.1154, + "loss": 0.1147, "step": 6822 }, { "epoch": 0.9333789329685362, - "grad_norm": 1.4366397326677194, + "grad_norm": 1.4134017067411349, "learning_rate": 1.0911312407141994e-07, - "loss": 0.2371, + "loss": 0.2336, "step": 6823 }, { "epoch": 0.933515731874145, - "grad_norm": 1.2688591198121946, + "grad_norm": 1.2473949057647524, "learning_rate": 1.0866710953353732e-07, - "loss": 0.1615, + "loss": 0.1611, "step": 6824 }, { "epoch": 0.9336525307797537, - "grad_norm": 1.1474573205118592, + "grad_norm": 1.1421975260932178, "learning_rate": 1.082219984208649e-07, - "loss": 0.1487, + "loss": 0.1495, "step": 6825 }, { "epoch": 0.9337893296853625, - "grad_norm": 1.277591982205513, + "grad_norm": 1.2662851545152072, "learning_rate": 1.0777779081561468e-07, - "loss": 0.1754, + "loss": 0.1753, "step": 6826 }, { "epoch": 0.9339261285909712, - "grad_norm": 1.3587340961917855, + "grad_norm": 1.3307361540921239, "learning_rate": 1.0733448679983105e-07, - "loss": 0.1679, + "loss": 0.1681, "step": 6827 }, { "epoch": 0.93406292749658, - "grad_norm": 1.1646530831063686, + "grad_norm": 1.1463444804789673, "learning_rate": 1.068920864553924e-07, - "loss": 0.174, + "loss": 0.1754, "step": 6828 }, { "epoch": 0.9341997264021887, - "grad_norm": 1.4583602945361716, + "grad_norm": 1.4526948791480452, "learning_rate": 1.0645058986400891e-07, - "loss": 0.2206, + "loss": 0.2223, "step": 6829 }, { "epoch": 0.9343365253077975, - "grad_norm": 1.3286489985627132, + "grad_norm": 1.322407814408343, "learning_rate": 1.0600999710722537e-07, - "loss": 0.175, + "loss": 0.1766, "step": 6830 }, { "epoch": 0.9344733242134063, - "grad_norm": 1.4598538235114393, + "grad_norm": 1.4681303921047688, "learning_rate": 1.0557030826641834e-07, - "loss": 0.1831, + "loss": 0.1833, "step": 6831 }, { "epoch": 0.934610123119015, - "grad_norm": 1.0289140050647945, + "grad_norm": 1.0241656945481998, "learning_rate": 1.0513152342279842e-07, - "loss": 0.1528, + "loss": 0.1518, "step": 6832 }, { "epoch": 0.9347469220246238, - "grad_norm": 1.213450332642203, + "grad_norm": 1.203418316805385, "learning_rate": 1.0469364265740855e-07, - "loss": 0.1715, + "loss": 0.1711, "step": 6833 }, { "epoch": 0.9348837209302325, - "grad_norm": 1.242679578644635, + "grad_norm": 1.2344917577840762, "learning_rate": 1.0425666605112516e-07, - "loss": 0.1917, + "loss": 0.1915, "step": 6834 }, { "epoch": 0.9350205198358413, - "grad_norm": 1.27143605132813, + "grad_norm": 1.2689917865903426, "learning_rate": 1.0382059368465703e-07, - "loss": 0.174, + "loss": 0.1736, "step": 6835 }, { "epoch": 0.93515731874145, - "grad_norm": 1.3948076249789616, + "grad_norm": 1.4037926731299835, "learning_rate": 1.0338542563854748e-07, - "loss": 0.2102, + "loss": 0.2101, "step": 6836 }, { "epoch": 0.9352941176470588, - "grad_norm": 1.1652315469229626, + "grad_norm": 1.1611361309179844, "learning_rate": 1.0295116199317057e-07, - "loss": 0.1461, + "loss": 0.1467, "step": 6837 }, { "epoch": 0.9354309165526675, - "grad_norm": 1.3912690490684105, + "grad_norm": 1.3996286935484938, "learning_rate": 1.0251780282873546e-07, - "loss": 0.1853, + "loss": 0.1902, "step": 6838 }, { "epoch": 0.9355677154582763, - "grad_norm": 1.239260509628161, + "grad_norm": 1.2630138852860493, "learning_rate": 1.0208534822528148e-07, - "loss": 0.2149, + "loss": 0.2132, "step": 6839 }, { "epoch": 0.9357045143638851, - "grad_norm": 1.3424356307141483, + "grad_norm": 1.3587416621514832, "learning_rate": 1.0165379826268417e-07, - "loss": 0.171, + "loss": 0.1724, "step": 6840 }, { "epoch": 0.9358413132694938, - "grad_norm": 1.1831635811311367, + "grad_norm": 1.1776374802265506, "learning_rate": 1.0122315302065034e-07, "loss": 0.158, "step": 6841 }, { "epoch": 0.9359781121751026, - "grad_norm": 1.3920195865251044, + "grad_norm": 1.369605998292549, "learning_rate": 1.0079341257871966e-07, - "loss": 0.1722, + "loss": 0.1719, "step": 6842 }, { "epoch": 0.9361149110807113, - "grad_norm": 1.2511392031659707, + "grad_norm": 1.245428434961764, "learning_rate": 1.0036457701626424e-07, - "loss": 0.1916, + "loss": 0.193, "step": 6843 }, { "epoch": 0.9362517099863201, - "grad_norm": 1.3954376512529563, + "grad_norm": 1.3662262549197643, "learning_rate": 9.993664641249012e-08, - "loss": 0.1646, + "loss": 0.165, "step": 6844 }, { "epoch": 0.9363885088919288, - "grad_norm": 1.2593367841182785, + "grad_norm": 1.2511126043483678, "learning_rate": 9.950962084643634e-08, - "loss": 0.1754, + "loss": 0.1752, "step": 6845 }, { "epoch": 0.9365253077975376, - "grad_norm": 1.3108755346483718, + "grad_norm": 1.291577588308792, "learning_rate": 9.908350039697312e-08, - "loss": 0.1767, + "loss": 0.1751, "step": 6846 }, { "epoch": 0.9366621067031464, - "grad_norm": 1.3319714458798417, + "grad_norm": 1.3217343363425993, "learning_rate": 9.865828514280474e-08, - "loss": 0.1804, + "loss": 0.1812, "step": 6847 }, { "epoch": 0.9367989056087551, - "grad_norm": 1.463482784559776, + "grad_norm": 1.4384681052622679, "learning_rate": 9.823397516246835e-08, - "loss": 0.1761, + "loss": 0.1746, "step": 6848 }, { "epoch": 0.9369357045143639, - "grad_norm": 1.2753813023119076, + "grad_norm": 1.2848098322309482, "learning_rate": 9.781057053433408e-08, - "loss": 0.1657, + "loss": 0.1655, "step": 6849 }, { "epoch": 0.9370725034199726, - "grad_norm": 1.4385827840760288, + "grad_norm": 1.4368128500712503, "learning_rate": 9.738807133660433e-08, - "loss": 0.1938, + "loss": 0.1931, "step": 6850 }, { "epoch": 0.9372093023255814, - "grad_norm": 1.392733380529939, + "grad_norm": 1.3702271369046022, "learning_rate": 9.696647764731337e-08, - "loss": 0.1988, + "loss": 0.1977, "step": 6851 }, { "epoch": 0.9373461012311901, - "grad_norm": 1.1543686354691591, + "grad_norm": 1.1596145597638017, "learning_rate": 9.654578954433058e-08, - "loss": 0.1904, + "loss": 0.1917, "step": 6852 }, { "epoch": 0.9374829001367989, - "grad_norm": 1.1166272417346867, + "grad_norm": 1.102672135270378, "learning_rate": 9.612600710535546e-08, - "loss": 0.1446, + "loss": 0.144, "step": 6853 }, { "epoch": 0.9376196990424076, - "grad_norm": 1.2259832187194686, + "grad_norm": 1.2229333667482738, "learning_rate": 9.570713040792268e-08, - "loss": 0.1691, + "loss": 0.1707, "step": 6854 }, { "epoch": 0.9377564979480164, - "grad_norm": 1.5457627934717961, + "grad_norm": 1.5193091554315086, "learning_rate": 9.528915952939755e-08, - "loss": 0.2005, + "loss": 0.2017, "step": 6855 }, { "epoch": 0.9378932968536252, - "grad_norm": 1.1967493220413588, + "grad_norm": 1.1852610587225583, "learning_rate": 9.487209454697887e-08, - "loss": 0.1627, + "loss": 0.1634, "step": 6856 }, { "epoch": 0.9380300957592339, - "grad_norm": 1.306411634185705, + "grad_norm": 1.3044727080797258, "learning_rate": 9.445593553769893e-08, - "loss": 0.2206, + "loss": 0.2219, "step": 6857 }, { "epoch": 0.9381668946648427, - "grad_norm": 1.2089182707023458, + "grad_norm": 1.205286760124674, "learning_rate": 9.404068257842125e-08, - "loss": 0.1861, + "loss": 0.1868, "step": 6858 }, { "epoch": 0.9383036935704514, - "grad_norm": 1.3296199222359255, + "grad_norm": 1.3159545703619286, "learning_rate": 9.36263357458439e-08, - "loss": 0.1623, + "loss": 0.1616, "step": 6859 }, { "epoch": 0.9384404924760602, - "grad_norm": 1.3697714043868798, + "grad_norm": 1.3586521236605376, "learning_rate": 9.321289511649456e-08, - "loss": 0.1873, + "loss": 0.1885, "step": 6860 }, { "epoch": 0.9385772913816689, - "grad_norm": 1.2407816370209066, + "grad_norm": 1.238495096115846, "learning_rate": 9.28003607667366e-08, - "loss": 0.1751, + "loss": 0.1754, "step": 6861 }, { "epoch": 0.9387140902872777, - "grad_norm": 1.0683469640548942, + "grad_norm": 1.066006308097988, "learning_rate": 9.238873277276461e-08, - "loss": 0.1354, + "loss": 0.1355, "step": 6862 }, { "epoch": 0.9388508891928865, - "grad_norm": 1.110919729188614, + "grad_norm": 1.1037018284264293, "learning_rate": 9.197801121060612e-08, - "loss": 0.1518, + "loss": 0.1538, "step": 6863 }, { "epoch": 0.9389876880984952, - "grad_norm": 1.389963568396718, + "grad_norm": 1.3743306083152804, "learning_rate": 9.156819615612045e-08, - "loss": 0.1612, + "loss": 0.1607, "step": 6864 }, { "epoch": 0.939124487004104, - "grad_norm": 1.320103256025632, + "grad_norm": 1.3328102523922147, "learning_rate": 9.115928768500037e-08, - "loss": 0.1615, + "loss": 0.1644, "step": 6865 }, { "epoch": 0.9392612859097127, - "grad_norm": 1.2842825879348545, + "grad_norm": 1.2815741911329128, "learning_rate": 9.07512858727716e-08, - "loss": 0.1612, + "loss": 0.161, "step": 6866 }, { "epoch": 0.9393980848153215, - "grad_norm": 1.2103813356522337, + "grad_norm": 1.1995994673463177, "learning_rate": 9.034419079479052e-08, - "loss": 0.1616, + "loss": 0.1602, "step": 6867 }, { "epoch": 0.9395348837209302, - "grad_norm": 1.1707296840753894, + "grad_norm": 1.16849370775202, "learning_rate": 8.993800252624863e-08, - "loss": 0.1763, + "loss": 0.1771, "step": 6868 }, { "epoch": 0.939671682626539, - "grad_norm": 1.2243119056712901, + "grad_norm": 1.219630312540642, "learning_rate": 8.95327211421676e-08, - "loss": 0.1597, + "loss": 0.158, "step": 6869 }, { "epoch": 0.9398084815321477, - "grad_norm": 1.3506718761577854, + "grad_norm": 1.3548281471317585, "learning_rate": 8.912834671740312e-08, - "loss": 0.193, + "loss": 0.1941, "step": 6870 }, { "epoch": 0.9399452804377565, - "grad_norm": 1.4043820457099823, + "grad_norm": 1.3823495155915835, "learning_rate": 8.872487932664209e-08, - "loss": 0.2197, + "loss": 0.2186, "step": 6871 }, { "epoch": 0.9400820793433653, - "grad_norm": 0.9664009312158546, + "grad_norm": 0.9581241213405511, "learning_rate": 8.832231904440492e-08, - "loss": 0.1546, + "loss": 0.1551, "step": 6872 }, { "epoch": 0.940218878248974, - "grad_norm": 1.4668475108631491, + "grad_norm": 1.4810638110053587, "learning_rate": 8.792066594504489e-08, - "loss": 0.1852, + "loss": 0.1869, "step": 6873 }, { "epoch": 0.9403556771545828, - "grad_norm": 1.2138002470217886, + "grad_norm": 1.2040918210600975, "learning_rate": 8.751992010274602e-08, - "loss": 0.1738, + "loss": 0.1735, "step": 6874 }, { "epoch": 0.9404924760601915, - "grad_norm": 0.9832023730419558, + "grad_norm": 0.9681506026360132, "learning_rate": 8.71200815915263e-08, - "loss": 0.1294, + "loss": 0.1285, "step": 6875 }, { "epoch": 0.9406292749658003, - "grad_norm": 1.1883474253112445, + "grad_norm": 1.1682397880848987, "learning_rate": 8.672115048523556e-08, - "loss": 0.1806, + "loss": 0.1797, "step": 6876 }, { "epoch": 0.940766073871409, - "grad_norm": 1.0622085283072913, + "grad_norm": 1.0389996762210019, "learning_rate": 8.632312685755539e-08, - "loss": 0.1644, + "loss": 0.164, "step": 6877 }, { "epoch": 0.9409028727770178, - "grad_norm": 1.0525075298299142, + "grad_norm": 1.0211055175459989, "learning_rate": 8.592601078200147e-08, - "loss": 0.1512, + "loss": 0.1492, "step": 6878 }, { "epoch": 0.9410396716826266, - "grad_norm": 1.3178888060959686, + "grad_norm": 1.2948901222003883, "learning_rate": 8.55298023319201e-08, - "loss": 0.1532, + "loss": 0.1537, "step": 6879 }, { "epoch": 0.9411764705882353, - "grad_norm": 1.374062765183736, + "grad_norm": 1.3960843656317365, "learning_rate": 8.513450158049109e-08, - "loss": 0.156, + "loss": 0.1583, "step": 6880 }, { "epoch": 0.9413132694938441, - "grad_norm": 1.375096246960997, + "grad_norm": 1.3795046419115904, "learning_rate": 8.474010860072545e-08, - "loss": 0.1934, + "loss": 0.1947, "step": 6881 }, { "epoch": 0.9414500683994528, - "grad_norm": 1.4931914798379164, + "grad_norm": 1.4861335253648977, "learning_rate": 8.434662346546884e-08, - "loss": 0.2242, + "loss": 0.2239, "step": 6882 }, { "epoch": 0.9415868673050616, - "grad_norm": 1.4403808634979995, + "grad_norm": 1.4182640150601018, "learning_rate": 8.395404624739534e-08, - "loss": 0.1817, + "loss": 0.1828, "step": 6883 }, { "epoch": 0.9417236662106703, - "grad_norm": 1.218385382089465, + "grad_norm": 1.1957711731786322, "learning_rate": 8.356237701901582e-08, - "loss": 0.1602, + "loss": 0.1591, "step": 6884 }, { "epoch": 0.9418604651162791, - "grad_norm": 1.1417172496301302, + "grad_norm": 1.1288396326232433, "learning_rate": 8.317161585266964e-08, - "loss": 0.163, + "loss": 0.1631, "step": 6885 }, { "epoch": 0.9419972640218878, - "grad_norm": 1.3961314392427793, + "grad_norm": 1.384411263357558, "learning_rate": 8.278176282053075e-08, - "loss": 0.1862, + "loss": 0.187, "step": 6886 }, { "epoch": 0.9421340629274966, - "grad_norm": 1.1877853963422518, + "grad_norm": 1.1787471843144928, "learning_rate": 8.239281799460485e-08, - "loss": 0.1596, + "loss": 0.1583, "step": 6887 }, { "epoch": 0.9422708618331054, - "grad_norm": 1.4032547085843718, + "grad_norm": 1.379012183392345, "learning_rate": 8.200478144672952e-08, - "loss": 0.2085, + "loss": 0.2069, "step": 6888 }, { "epoch": 0.9424076607387141, - "grad_norm": 1.1358107165573932, + "grad_norm": 1.1330065856879978, "learning_rate": 8.161765324857463e-08, - "loss": 0.1571, + "loss": 0.158, "step": 6889 }, { "epoch": 0.9425444596443229, - "grad_norm": 1.4493831584766494, + "grad_norm": 1.4284081639223416, "learning_rate": 8.123143347164297e-08, - "loss": 0.1995, + "loss": 0.2003, "step": 6890 }, { "epoch": 0.9426812585499316, - "grad_norm": 1.0651554648846735, + "grad_norm": 1.0753076679343212, "learning_rate": 8.084612218726807e-08, - "loss": 0.1426, + "loss": 0.1428, "step": 6891 }, { "epoch": 0.9428180574555404, - "grad_norm": 1.3034917843137095, + "grad_norm": 1.285901694467842, "learning_rate": 8.046171946661797e-08, - "loss": 0.1751, + "loss": 0.1749, "step": 6892 }, { "epoch": 0.9429548563611491, - "grad_norm": 1.3794520617396475, + "grad_norm": 1.3728652920604991, "learning_rate": 8.007822538069032e-08, - "loss": 0.1866, + "loss": 0.1868, "step": 6893 }, { "epoch": 0.9430916552667579, - "grad_norm": 0.9672523646038644, + "grad_norm": 0.9730635571280385, "learning_rate": 7.969564000031682e-08, - "loss": 0.159, + "loss": 0.1608, "step": 6894 }, { "epoch": 0.9432284541723667, - "grad_norm": 1.6181421565284706, + "grad_norm": 1.6066097840714093, "learning_rate": 7.931396339616037e-08, - "loss": 0.219, + "loss": 0.2188, "step": 6895 }, { "epoch": 0.9433652530779754, - "grad_norm": 1.297985551671855, + "grad_norm": 1.2739059321284967, "learning_rate": 7.893319563871682e-08, - "loss": 0.1983, + "loss": 0.1982, "step": 6896 }, { "epoch": 0.9435020519835842, - "grad_norm": 1.3187793437942186, + "grad_norm": 1.2922246485463644, "learning_rate": 7.855333679831267e-08, - "loss": 0.1787, + "loss": 0.1777, "step": 6897 }, { "epoch": 0.9436388508891929, - "grad_norm": 1.3019207601281817, + "grad_norm": 1.2922526110919859, "learning_rate": 7.817438694510848e-08, - "loss": 0.1783, + "loss": 0.179, "step": 6898 }, { "epoch": 0.9437756497948017, - "grad_norm": 1.217968760260989, + "grad_norm": 1.190821171337654, "learning_rate": 7.779634614909604e-08, - "loss": 0.1693, + "loss": 0.1685, "step": 6899 }, { "epoch": 0.9439124487004104, - "grad_norm": 1.2382142923816735, + "grad_norm": 1.2135032403579684, "learning_rate": 7.741921448009838e-08, - "loss": 0.1747, + "loss": 0.1755, "step": 6900 }, { "epoch": 0.9439124487004104, - "eval_loss": 0.17164281010627747, - "eval_runtime": 5.9099, - "eval_samples_per_second": 5.076, - "eval_steps_per_second": 1.354, + "eval_loss": 0.17206940054893494, + "eval_runtime": 5.9158, + "eval_samples_per_second": 5.071, + "eval_steps_per_second": 1.352, "step": 6900 }, { "epoch": 0.9440492476060192, - "grad_norm": 1.372099701834893, + "grad_norm": 1.367518639324802, "learning_rate": 7.704299200777199e-08, - "loss": 0.1843, + "loss": 0.1861, "step": 6901 }, { "epoch": 0.9441860465116279, - "grad_norm": 1.2970890645849014, + "grad_norm": 1.2868591177810813, "learning_rate": 7.666767880160464e-08, - "loss": 0.1829, + "loss": 0.183, "step": 6902 }, { "epoch": 0.9443228454172367, - "grad_norm": 1.3574263325085323, + "grad_norm": 1.3350425820086451, "learning_rate": 7.629327493091643e-08, - "loss": 0.1868, + "loss": 0.1855, "step": 6903 }, { "epoch": 0.9444596443228455, - "grad_norm": 1.3239576554488506, + "grad_norm": 1.3090630914207582, "learning_rate": 7.591978046485926e-08, - "loss": 0.2053, + "loss": 0.2033, "step": 6904 }, { "epoch": 0.9445964432284542, - "grad_norm": 1.3013380798059309, + "grad_norm": 1.3047433623366742, "learning_rate": 7.554719547241795e-08, - "loss": 0.1866, + "loss": 0.1885, "step": 6905 }, { "epoch": 0.944733242134063, - "grad_norm": 1.4620473277592145, + "grad_norm": 1.4333998614172154, "learning_rate": 7.517552002240802e-08, - "loss": 0.1831, + "loss": 0.183, "step": 6906 }, { "epoch": 0.9448700410396716, - "grad_norm": 1.4686846440558814, + "grad_norm": 1.4520803554921793, "learning_rate": 7.480475418347732e-08, - "loss": 0.1609, + "loss": 0.161, "step": 6907 }, { "epoch": 0.9450068399452805, - "grad_norm": 1.2687197116555615, + "grad_norm": 1.25510426207059, "learning_rate": 7.443489802410664e-08, - "loss": 0.1985, + "loss": 0.1995, "step": 6908 }, { "epoch": 0.9451436388508891, - "grad_norm": 1.2008899843249874, + "grad_norm": 1.234056752289656, "learning_rate": 7.4065951612608e-08, - "loss": 0.167, + "loss": 0.1671, "step": 6909 }, { "epoch": 0.945280437756498, - "grad_norm": 1.3894791370397621, + "grad_norm": 1.3681586857202725, "learning_rate": 7.36979150171252e-08, - "loss": 0.1809, + "loss": 0.1787, "step": 6910 }, { "epoch": 0.9454172366621068, - "grad_norm": 1.2890180092600865, + "grad_norm": 1.2795742899982125, "learning_rate": 7.3330788305635e-08, - "loss": 0.1829, + "loss": 0.1831, "step": 6911 }, { "epoch": 0.9455540355677154, - "grad_norm": 1.4441181410862869, + "grad_norm": 1.4275034672084517, "learning_rate": 7.296457154594483e-08, - "loss": 0.2099, + "loss": 0.2115, "step": 6912 }, { "epoch": 0.9456908344733242, - "grad_norm": 1.3368806786096605, + "grad_norm": 1.3305729659036722, "learning_rate": 7.259926480569446e-08, - "loss": 0.1772, + "loss": 0.1773, "step": 6913 }, { "epoch": 0.9458276333789329, - "grad_norm": 1.2112551668568698, + "grad_norm": 1.2803389653858208, "learning_rate": 7.223486815235604e-08, - "loss": 0.1713, + "loss": 0.1718, "step": 6914 }, { "epoch": 0.9459644322845417, - "grad_norm": 1.2481538274311919, + "grad_norm": 1.2486428862351169, "learning_rate": 7.187138165323294e-08, - "loss": 0.1346, + "loss": 0.1377, "step": 6915 }, { "epoch": 0.9461012311901504, - "grad_norm": 1.3629445090205532, + "grad_norm": 1.4045000064638273, "learning_rate": 7.150880537546201e-08, - "loss": 0.2253, + "loss": 0.2224, "step": 6916 }, { "epoch": 0.9462380300957592, - "grad_norm": 1.3217913942393913, + "grad_norm": 1.326141986011995, "learning_rate": 7.114713938600915e-08, - "loss": 0.1924, + "loss": 0.1937, "step": 6917 }, { "epoch": 0.9463748290013679, - "grad_norm": 1.0849652125188, + "grad_norm": 1.0830410319137918, "learning_rate": 7.078638375167535e-08, - "loss": 0.1375, + "loss": 0.1378, "step": 6918 }, { "epoch": 0.9465116279069767, - "grad_norm": 1.2762279683989548, + "grad_norm": 1.2948768043684653, "learning_rate": 7.042653853909066e-08, - "loss": 0.1929, + "loss": 0.1942, "step": 6919 }, { "epoch": 0.9466484268125855, - "grad_norm": 1.3754282368924833, + "grad_norm": 1.3596700903785324, "learning_rate": 7.006760381471855e-08, - "loss": 0.2149, + "loss": 0.2157, "step": 6920 }, { "epoch": 0.9467852257181942, - "grad_norm": 1.5922207762508718, + "grad_norm": 1.575391407947089, "learning_rate": 6.970957964485381e-08, - "loss": 0.2215, + "loss": 0.2214, "step": 6921 }, { "epoch": 0.946922024623803, - "grad_norm": 1.4445578734408233, + "grad_norm": 1.424675270039476, "learning_rate": 6.935246609562407e-08, - "loss": 0.2427, + "loss": 0.2416, "step": 6922 }, { "epoch": 0.9470588235294117, - "grad_norm": 1.3464744318346618, + "grad_norm": 1.3469453695098788, "learning_rate": 6.899626323298714e-08, - "loss": 0.1847, + "loss": 0.187, "step": 6923 }, { "epoch": 0.9471956224350205, - "grad_norm": 1.245344969946829, + "grad_norm": 1.2470296114483244, "learning_rate": 6.864097112273371e-08, "loss": 0.1614, "step": 6924 }, { "epoch": 0.9473324213406292, - "grad_norm": 1.3109170279611433, + "grad_norm": 1.328839875683071, "learning_rate": 6.828658983048575e-08, - "loss": 0.181, + "loss": 0.1803, "step": 6925 }, { "epoch": 0.947469220246238, - "grad_norm": 1.4713192739031733, + "grad_norm": 1.4729927414284647, "learning_rate": 6.7933119421697e-08, - "loss": 0.1762, + "loss": 0.1768, "step": 6926 }, { "epoch": 0.9476060191518468, - "grad_norm": 1.6611733033370895, + "grad_norm": 1.6308831884135457, "learning_rate": 6.758055996165358e-08, - "loss": 0.2265, + "loss": 0.2257, "step": 6927 }, { "epoch": 0.9477428180574555, - "grad_norm": 1.3261345272248233, + "grad_norm": 1.3112939282769371, "learning_rate": 6.722891151547284e-08, "loss": 0.1887, "step": 6928 }, { "epoch": 0.9478796169630643, - "grad_norm": 1.2215062218395651, + "grad_norm": 1.2196572965741124, "learning_rate": 6.687817414810393e-08, - "loss": 0.1672, + "loss": 0.1671, "step": 6929 }, { "epoch": 0.948016415868673, - "grad_norm": 1.0606405951679492, + "grad_norm": 1.0566003958770611, "learning_rate": 6.652834792432783e-08, - "loss": 0.162, + "loss": 0.1629, "step": 6930 }, { "epoch": 0.9481532147742818, - "grad_norm": 1.4498183015397785, + "grad_norm": 1.4352592771468349, "learning_rate": 6.617943290875672e-08, - "loss": 0.187, + "loss": 0.1862, "step": 6931 }, { "epoch": 0.9482900136798905, - "grad_norm": 1.345580832749652, + "grad_norm": 1.31696894145969, "learning_rate": 6.583142916583573e-08, - "loss": 0.1998, + "loss": 0.1973, "step": 6932 }, { "epoch": 0.9484268125854993, - "grad_norm": 1.3789664880969137, + "grad_norm": 1.364666479770089, "learning_rate": 6.548433675984011e-08, - "loss": 0.1741, + "loss": 0.175, "step": 6933 }, { "epoch": 0.948563611491108, - "grad_norm": 1.3547139440089182, + "grad_norm": 1.3501329229880505, "learning_rate": 6.513815575487803e-08, - "loss": 0.1818, + "loss": 0.1848, "step": 6934 }, { "epoch": 0.9487004103967168, - "grad_norm": 1.2247831796626734, + "grad_norm": 1.2127854477079127, "learning_rate": 6.479288621488833e-08, - "loss": 0.1464, + "loss": 0.1448, "step": 6935 }, { "epoch": 0.9488372093023256, - "grad_norm": 1.1115323570299793, + "grad_norm": 1.101154013998337, "learning_rate": 6.444852820364222e-08, - "loss": 0.1551, + "loss": 0.1548, "step": 6936 }, { "epoch": 0.9489740082079343, - "grad_norm": 1.2219398827133867, + "grad_norm": 1.2095680221968543, "learning_rate": 6.410508178474218e-08, - "loss": 0.1578, + "loss": 0.1579, "step": 6937 }, { "epoch": 0.9491108071135431, - "grad_norm": 1.1489014953449777, + "grad_norm": 1.131574812199942, "learning_rate": 6.376254702162354e-08, - "loss": 0.1806, + "loss": 0.1792, "step": 6938 }, { "epoch": 0.9492476060191518, - "grad_norm": 1.4955007724627223, + "grad_norm": 1.4744083339796854, "learning_rate": 6.342092397755128e-08, - "loss": 0.1876, + "loss": 0.1846, "step": 6939 }, { "epoch": 0.9493844049247606, - "grad_norm": 1.0929851870505907, + "grad_norm": 1.087767137835665, "learning_rate": 6.30802127156227e-08, - "loss": 0.1947, + "loss": 0.1951, "step": 6940 }, { "epoch": 0.9495212038303693, - "grad_norm": 1.133936913359481, + "grad_norm": 1.1216438025342128, "learning_rate": 6.274041329876745e-08, "loss": 0.1616, "step": 6941 }, { "epoch": 0.9496580027359781, - "grad_norm": 1.2298381854742497, + "grad_norm": 1.251475772044604, "learning_rate": 6.240152578974535e-08, - "loss": 0.2, + "loss": 0.2008, "step": 6942 }, { "epoch": 0.9497948016415869, - "grad_norm": 1.190374816830642, + "grad_norm": 1.1710612491582866, "learning_rate": 6.206355025114962e-08, - "loss": 0.1627, + "loss": 0.1623, "step": 6943 }, { "epoch": 0.9499316005471956, - "grad_norm": 1.0764488589353531, + "grad_norm": 1.0621097014648704, "learning_rate": 6.172648674540426e-08, - "loss": 0.1591, + "loss": 0.1588, "step": 6944 }, { "epoch": 0.9500683994528044, - "grad_norm": 1.1855982938526586, + "grad_norm": 1.2028333833934763, "learning_rate": 6.13903353347639e-08, - "loss": 0.1677, + "loss": 0.1705, "step": 6945 }, { "epoch": 0.9502051983584131, - "grad_norm": 0.9313474498445622, + "grad_norm": 0.9246167743847846, "learning_rate": 6.105509608131555e-08, - "loss": 0.1514, + "loss": 0.1519, "step": 6946 }, { "epoch": 0.9503419972640219, - "grad_norm": 1.131005090707362, + "grad_norm": 1.1232954702842197, "learning_rate": 6.072076904697744e-08, - "loss": 0.1398, + "loss": 0.139, "step": 6947 }, { "epoch": 0.9504787961696306, - "grad_norm": 1.4023177773215112, + "grad_norm": 1.4091438300198347, "learning_rate": 6.038735429349963e-08, - "loss": 0.1831, + "loss": 0.1855, "step": 6948 }, { "epoch": 0.9506155950752394, - "grad_norm": 0.9305887125144928, + "grad_norm": 0.9245413844800492, "learning_rate": 6.005485188246396e-08, - "loss": 0.1292, + "loss": 0.1293, "step": 6949 }, { "epoch": 0.9507523939808481, - "grad_norm": 1.350533373705572, + "grad_norm": 1.349585457632282, "learning_rate": 5.972326187528298e-08, - "loss": 0.2131, + "loss": 0.2137, "step": 6950 }, { "epoch": 0.9508891928864569, - "grad_norm": 1.3901648876570403, + "grad_norm": 1.3811323433514087, "learning_rate": 5.939258433320105e-08, - "loss": 0.2125, + "loss": 0.2158, "step": 6951 }, { "epoch": 0.9510259917920657, - "grad_norm": 1.389699613006028, + "grad_norm": 1.3646209244325676, "learning_rate": 5.9062819317294296e-08, - "loss": 0.2051, + "loss": 0.2044, "step": 6952 }, { "epoch": 0.9511627906976744, - "grad_norm": 1.2742523184203074, + "grad_norm": 1.2505261960786036, "learning_rate": 5.873396688847011e-08, - "loss": 0.1672, + "loss": 0.1659, "step": 6953 }, { "epoch": 0.9512995896032832, - "grad_norm": 1.2309070523488823, + "grad_norm": 1.213454147041453, "learning_rate": 5.8406027107467126e-08, - "loss": 0.172, + "loss": 0.1717, "step": 6954 }, { "epoch": 0.9514363885088919, - "grad_norm": 1.2457793305792713, + "grad_norm": 1.2334685529960594, "learning_rate": 5.807900003485523e-08, "loss": 0.172, "step": 6955 }, { "epoch": 0.9515731874145007, - "grad_norm": 1.1870562172380665, + "grad_norm": 1.188237494886542, "learning_rate": 5.7752885731036656e-08, - "loss": 0.1605, + "loss": 0.1615, "step": 6956 }, { "epoch": 0.9517099863201094, - "grad_norm": 1.424078437083046, + "grad_norm": 1.3806235234609578, "learning_rate": 5.742768425624434e-08, - "loss": 0.197, + "loss": 0.1937, "step": 6957 }, { "epoch": 0.9518467852257182, - "grad_norm": 1.2887879022410649, + "grad_norm": 1.3041549075100878, "learning_rate": 5.710339567054191e-08, - "loss": 0.19, + "loss": 0.1949, "step": 6958 }, { "epoch": 0.951983584131327, - "grad_norm": 1.3092227240817598, + "grad_norm": 1.2837659179824217, "learning_rate": 5.678002003382588e-08, - "loss": 0.1914, + "loss": 0.1918, "step": 6959 }, { "epoch": 0.9521203830369357, - "grad_norm": 1.5096576351762174, + "grad_norm": 1.4745375284034468, "learning_rate": 5.6457557405824036e-08, - "loss": 0.2376, + "loss": 0.237, "step": 6960 }, { "epoch": 0.9522571819425445, - "grad_norm": 1.4110311963418667, + "grad_norm": 1.4140721389842736, "learning_rate": 5.6136007846093746e-08, - "loss": 0.1866, + "loss": 0.188, "step": 6961 }, { "epoch": 0.9523939808481532, - "grad_norm": 1.3587791163748109, + "grad_norm": 1.360053130424771, "learning_rate": 5.581537141402582e-08, - "loss": 0.1975, + "loss": 0.1982, "step": 6962 }, { "epoch": 0.952530779753762, - "grad_norm": 1.5540720619539208, + "grad_norm": 1.5465670826374789, "learning_rate": 5.549564816884123e-08, - "loss": 0.159, + "loss": 0.1615, "step": 6963 }, { "epoch": 0.9526675786593707, - "grad_norm": 1.1172155286519487, + "grad_norm": 1.1095379265413592, "learning_rate": 5.517683816959218e-08, - "loss": 0.1453, + "loss": 0.1465, "step": 6964 }, { "epoch": 0.9528043775649795, - "grad_norm": 1.2417978934452323, + "grad_norm": 1.2542479570603602, "learning_rate": 5.48589414751638e-08, - "loss": 0.1654, + "loss": 0.1693, "step": 6965 }, { "epoch": 0.9529411764705882, - "grad_norm": 1.131187195849194, + "grad_norm": 1.134075960554416, "learning_rate": 5.454195814427021e-08, - "loss": 0.1513, + "loss": 0.1516, "step": 6966 }, { "epoch": 0.953077975376197, - "grad_norm": 1.4317073047117046, + "grad_norm": 1.4182366611230943, "learning_rate": 5.422588823545849e-08, - "loss": 0.228, + "loss": 0.2298, "step": 6967 }, { "epoch": 0.9532147742818058, - "grad_norm": 1.3975485135586392, + "grad_norm": 1.3977654557992312, "learning_rate": 5.391073180710638e-08, - "loss": 0.1711, + "loss": 0.1716, "step": 6968 }, { "epoch": 0.9533515731874145, - "grad_norm": 1.168655182385759, + "grad_norm": 1.1526455717202677, "learning_rate": 5.359648891742342e-08, - "loss": 0.173, + "loss": 0.1741, "step": 6969 }, { "epoch": 0.9534883720930233, - "grad_norm": 1.4063525748809262, + "grad_norm": 1.4058822409689684, "learning_rate": 5.3283159624448745e-08, - "loss": 0.1684, + "loss": 0.1679, "step": 6970 }, { "epoch": 0.953625170998632, - "grad_norm": 1.25219778873685, + "grad_norm": 1.240198875703245, "learning_rate": 5.297074398605606e-08, - "loss": 0.1555, + "loss": 0.1541, "step": 6971 }, { "epoch": 0.9537619699042408, - "grad_norm": 1.0673206301147888, + "grad_norm": 1.0644654786415686, "learning_rate": 5.2659242059946434e-08, - "loss": 0.1516, + "loss": 0.1523, "step": 6972 }, { "epoch": 0.9538987688098495, - "grad_norm": 1.1914371021171946, + "grad_norm": 1.1855026139600178, "learning_rate": 5.2348653903654955e-08, - "loss": 0.1993, + "loss": 0.1999, "step": 6973 }, { "epoch": 0.9540355677154583, - "grad_norm": 1.4007408084189754, + "grad_norm": 1.4181827669846379, "learning_rate": 5.203897957454629e-08, - "loss": 0.218, + "loss": 0.2201, "step": 6974 }, { "epoch": 0.9541723666210671, - "grad_norm": 1.1426946317426745, + "grad_norm": 1.1617487998624996, "learning_rate": 5.173021912981802e-08, - "loss": 0.1628, + "loss": 0.1636, "step": 6975 }, { "epoch": 0.9543091655266758, - "grad_norm": 1.2602818199796617, + "grad_norm": 1.2455481342365016, "learning_rate": 5.1422372626497293e-08, - "loss": 0.1624, + "loss": 0.1629, "step": 6976 }, { "epoch": 0.9544459644322846, - "grad_norm": 1.1908385447257435, + "grad_norm": 1.1871093195210527, "learning_rate": 5.11154401214431e-08, - "loss": 0.1661, + "loss": 0.1662, "step": 6977 }, { "epoch": 0.9545827633378933, - "grad_norm": 1.2336609498484623, + "grad_norm": 1.225701442003385, "learning_rate": 5.080942167134617e-08, - "loss": 0.1762, + "loss": 0.1753, "step": 6978 }, { "epoch": 0.9547195622435021, - "grad_norm": 1.3134949347490532, + "grad_norm": 1.30786510918235, "learning_rate": 5.050431733272687e-08, - "loss": 0.1776, + "loss": 0.176, "step": 6979 }, { "epoch": 0.9548563611491108, - "grad_norm": 1.5785900167777227, + "grad_norm": 1.5665770362455147, "learning_rate": 5.0200127161939005e-08, - "loss": 0.2068, + "loss": 0.2067, "step": 6980 }, { "epoch": 0.9549931600547196, - "grad_norm": 1.1626133507674024, + "grad_norm": 1.1574669638889907, "learning_rate": 4.9896851215164853e-08, - "loss": 0.1557, + "loss": 0.1576, "step": 6981 }, { "epoch": 0.9551299589603283, - "grad_norm": 1.3289699268719908, + "grad_norm": 1.2962437318160893, "learning_rate": 4.959448954842072e-08, - "loss": 0.1747, + "loss": 0.1762, "step": 6982 }, { "epoch": 0.9552667578659371, - "grad_norm": 1.2806741541161522, + "grad_norm": 1.2490657291592855, "learning_rate": 4.9293042217551377e-08, - "loss": 0.1679, + "loss": 0.1698, "step": 6983 }, { "epoch": 0.9554035567715459, - "grad_norm": 1.3458982201922232, + "grad_norm": 1.3529543377215496, "learning_rate": 4.899250927823396e-08, - "loss": 0.1975, + "loss": 0.1959, "step": 6984 }, { "epoch": 0.9555403556771546, - "grad_norm": 1.3157136100819573, + "grad_norm": 1.306251246958594, "learning_rate": 4.869289078597794e-08, - "loss": 0.1605, + "loss": 0.1615, "step": 6985 }, { "epoch": 0.9556771545827634, - "grad_norm": 1.2702704505493527, + "grad_norm": 1.2484707162946473, "learning_rate": 4.839418679612074e-08, - "loss": 0.194, + "loss": 0.192, "step": 6986 }, { "epoch": 0.9558139534883721, - "grad_norm": 1.363256337710405, + "grad_norm": 1.3396149831901427, "learning_rate": 4.8096397363834314e-08, - "loss": 0.2105, + "loss": 0.2108, "step": 6987 }, { "epoch": 0.9559507523939809, - "grad_norm": 1.355416275724853, + "grad_norm": 1.323722853938312, "learning_rate": 4.779952254411913e-08, - "loss": 0.2256, + "loss": 0.2229, "step": 6988 }, { "epoch": 0.9560875512995896, - "grad_norm": 1.4276788654092578, + "grad_norm": 1.407850526790055, "learning_rate": 4.7503562391808535e-08, - "loss": 0.1992, + "loss": 0.1985, "step": 6989 }, { "epoch": 0.9562243502051984, - "grad_norm": 1.445119728416261, + "grad_norm": 1.4274338332554188, "learning_rate": 4.7208516961565474e-08, - "loss": 0.1905, + "loss": 0.1894, "step": 6990 }, { "epoch": 0.9563611491108072, - "grad_norm": 1.3660638108564922, + "grad_norm": 1.3600510263727077, "learning_rate": 4.6914386307884694e-08, - "loss": 0.1562, + "loss": 0.1556, "step": 6991 }, { "epoch": 0.9564979480164159, - "grad_norm": 1.3212808822225164, + "grad_norm": 1.331395337076183, "learning_rate": 4.662117048509218e-08, - "loss": 0.2092, + "loss": 0.2098, "step": 6992 }, { "epoch": 0.9566347469220247, - "grad_norm": 1.3742728335420664, + "grad_norm": 1.3751713727225439, "learning_rate": 4.632886954734406e-08, - "loss": 0.1655, + "loss": 0.1663, "step": 6993 }, { "epoch": 0.9567715458276334, - "grad_norm": 1.1727928035474415, + "grad_norm": 1.1538112988708447, "learning_rate": 4.60374835486288e-08, - "loss": 0.1688, + "loss": 0.1682, "step": 6994 }, { "epoch": 0.9569083447332422, - "grad_norm": 1.2251832283341308, + "grad_norm": 1.2176061424097617, "learning_rate": 4.574701254276504e-08, - "loss": 0.1566, + "loss": 0.1582, "step": 6995 }, { "epoch": 0.9570451436388508, - "grad_norm": 1.1910431189491992, + "grad_norm": 1.1796374508556524, "learning_rate": 4.5457456583402057e-08, - "loss": 0.1537, + "loss": 0.1529, "step": 6996 }, { "epoch": 0.9571819425444597, - "grad_norm": 1.0933692322175195, + "grad_norm": 1.0803319025371858, "learning_rate": 4.516881572402099e-08, - "loss": 0.1533, + "loss": 0.1532, "step": 6997 }, { "epoch": 0.9573187414500683, - "grad_norm": 1.2689309800324546, + "grad_norm": 1.2575286231017588, "learning_rate": 4.488109001793361e-08, - "loss": 0.1593, + "loss": 0.1582, "step": 6998 }, { "epoch": 0.9574555403556771, - "grad_norm": 1.2099672727721142, + "grad_norm": 1.2097327039512578, "learning_rate": 4.4594279518282435e-08, - "loss": 0.1913, + "loss": 0.1911, "step": 6999 }, { "epoch": 0.957592339261286, - "grad_norm": 1.3110579416087822, + "grad_norm": 1.2859978392291265, "learning_rate": 4.4308384278041184e-08, - "loss": 0.1843, + "loss": 0.183, "step": 7000 }, { "epoch": 0.957592339261286, - "eval_loss": 0.1715097427368164, - "eval_runtime": 5.9155, - "eval_samples_per_second": 5.071, - "eval_steps_per_second": 1.352, + "eval_loss": 0.17197522521018982, + "eval_runtime": 5.9124, + "eval_samples_per_second": 5.074, + "eval_steps_per_second": 1.353, "step": 7000 }, { "epoch": 0.9577291381668946, - "grad_norm": 1.3903526353441744, + "grad_norm": 1.38484125323396, "learning_rate": 4.4023404350014845e-08, - "loss": 0.1872, + "loss": 0.1879, "step": 7001 }, { "epoch": 0.9578659370725034, - "grad_norm": 1.3117173602626238, + "grad_norm": 1.2941045952850496, "learning_rate": 4.373933978683797e-08, - "loss": 0.1524, + "loss": 0.1533, "step": 7002 }, { "epoch": 0.9580027359781121, - "grad_norm": 1.2622659741766513, + "grad_norm": 1.262051032172477, "learning_rate": 4.3456190640978614e-08, - "loss": 0.18, + "loss": 0.1795, "step": 7003 }, { "epoch": 0.958139534883721, - "grad_norm": 1.2650974486667983, + "grad_norm": 1.2492562908344471, "learning_rate": 4.3173956964732145e-08, - "loss": 0.1687, + "loss": 0.1684, "step": 7004 }, { "epoch": 0.9582763337893296, - "grad_norm": 1.511989189297979, + "grad_norm": 1.449068831151954, "learning_rate": 4.289263881022909e-08, - "loss": 0.2047, + "loss": 0.2034, "step": 7005 }, { "epoch": 0.9584131326949384, - "grad_norm": 1.2467596558509164, + "grad_norm": 1.241061966492789, "learning_rate": 4.261223622942678e-08, - "loss": 0.179, + "loss": 0.1786, "step": 7006 }, { "epoch": 0.9585499316005472, - "grad_norm": 1.1784561563356544, + "grad_norm": 1.1676398751466517, "learning_rate": 4.2332749274116566e-08, - "loss": 0.1708, + "loss": 0.1721, "step": 7007 }, { "epoch": 0.9586867305061559, - "grad_norm": 1.1441998929171746, + "grad_norm": 1.1613934768712857, "learning_rate": 4.205417799591938e-08, - "loss": 0.1604, + "loss": 0.1614, "step": 7008 }, { "epoch": 0.9588235294117647, - "grad_norm": 1.283399086142696, + "grad_norm": 1.2755505948968076, "learning_rate": 4.177652244628627e-08, - "loss": 0.2251, + "loss": 0.2274, "step": 7009 }, { "epoch": 0.9589603283173734, - "grad_norm": 1.269980193940837, + "grad_norm": 1.2446006684334465, "learning_rate": 4.1499782676500675e-08, - "loss": 0.1678, + "loss": 0.1688, "step": 7010 }, { "epoch": 0.9590971272229822, - "grad_norm": 0.964147216876391, + "grad_norm": 0.9722828613122974, "learning_rate": 4.122395873767615e-08, - "loss": 0.1636, + "loss": 0.1632, "step": 7011 }, { "epoch": 0.9592339261285909, - "grad_norm": 1.3506747158098353, + "grad_norm": 1.3339312988698104, "learning_rate": 4.094905068075694e-08, - "loss": 0.1762, + "loss": 0.1796, "step": 7012 }, { "epoch": 0.9593707250341997, - "grad_norm": 1.4368071730907648, + "grad_norm": 1.4276430751429376, "learning_rate": 4.0675058556518544e-08, - "loss": 0.2242, + "loss": 0.2274, "step": 7013 }, { "epoch": 0.9595075239398084, - "grad_norm": 1.248959406020274, + "grad_norm": 1.2351920628613768, "learning_rate": 4.04019824155677e-08, - "loss": 0.1494, + "loss": 0.1499, "step": 7014 }, { "epoch": 0.9596443228454172, - "grad_norm": 1.3349377795020776, + "grad_norm": 1.329609189345415, "learning_rate": 4.012982230833962e-08, - "loss": 0.1596, + "loss": 0.1608, "step": 7015 }, { "epoch": 0.959781121751026, - "grad_norm": 1.153579490088526, + "grad_norm": 1.1425940031932351, "learning_rate": 3.985857828510353e-08, - "loss": 0.1961, + "loss": 0.1967, "step": 7016 }, { "epoch": 0.9599179206566347, - "grad_norm": 1.1686564351464597, + "grad_norm": 1.166655162465769, "learning_rate": 3.9588250395957705e-08, - "loss": 0.1578, + "loss": 0.159, "step": 7017 }, { "epoch": 0.9600547195622435, - "grad_norm": 1.5785384619133074, + "grad_norm": 1.5703028017343947, "learning_rate": 3.931883869083108e-08, - "loss": 0.2247, + "loss": 0.2277, "step": 7018 }, { "epoch": 0.9601915184678522, - "grad_norm": 1.41505921834753, + "grad_norm": 1.4038675057977121, "learning_rate": 3.9050343219484424e-08, - "loss": 0.1898, + "loss": 0.1896, "step": 7019 }, { "epoch": 0.960328317373461, - "grad_norm": 1.3483855543452334, + "grad_norm": 1.3440928002257275, "learning_rate": 3.878276403150749e-08, - "loss": 0.1758, + "loss": 0.1756, "step": 7020 }, { "epoch": 0.9604651162790697, - "grad_norm": 1.2072655862781807, + "grad_norm": 1.1894943748180629, "learning_rate": 3.851610117632354e-08, - "loss": 0.1589, + "loss": 0.1585, "step": 7021 }, { "epoch": 0.9606019151846785, - "grad_norm": 1.1718695558315233, + "grad_norm": 1.1588485101334722, "learning_rate": 3.825035470318317e-08, - "loss": 0.1653, + "loss": 0.1658, "step": 7022 }, { "epoch": 0.9607387140902873, - "grad_norm": 1.249308521046988, + "grad_norm": 1.230414627612926, "learning_rate": 3.7985524661171e-08, - "loss": 0.1659, + "loss": 0.1667, "step": 7023 }, { "epoch": 0.960875512995896, - "grad_norm": 1.4377812133217485, + "grad_norm": 1.4368901840655695, "learning_rate": 3.77216110992007e-08, - "loss": 0.2211, + "loss": 0.2209, "step": 7024 }, { "epoch": 0.9610123119015048, - "grad_norm": 1.303103944577441, + "grad_norm": 1.2801057649039793, "learning_rate": 3.745861406601603e-08, - "loss": 0.1909, + "loss": 0.1871, "step": 7025 }, { "epoch": 0.9611491108071135, - "grad_norm": 1.6006096083367027, + "grad_norm": 1.599557419216324, "learning_rate": 3.719653361019315e-08, - "loss": 0.1911, + "loss": 0.192, "step": 7026 }, { "epoch": 0.9612859097127223, - "grad_norm": 1.1043997141761033, + "grad_norm": 1.1055342399492685, "learning_rate": 3.693536978013779e-08, - "loss": 0.1211, + "loss": 0.1217, "step": 7027 }, { "epoch": 0.961422708618331, - "grad_norm": 1.4086277842784427, + "grad_norm": 1.393100239717919, "learning_rate": 3.667512262408746e-08, - "loss": 0.2, + "loss": 0.1991, "step": 7028 }, { "epoch": 0.9615595075239398, - "grad_norm": 1.2126520395639186, + "grad_norm": 1.1969864018935232, "learning_rate": 3.6415792190108154e-08, - "loss": 0.1641, + "loss": 0.1619, "step": 7029 }, { "epoch": 0.9616963064295485, - "grad_norm": 1.0806176771202098, + "grad_norm": 1.0702713109239206, "learning_rate": 3.615737852609935e-08, - "loss": 0.1648, + "loss": 0.1647, "step": 7030 }, { "epoch": 0.9618331053351573, - "grad_norm": 1.3617989216202604, + "grad_norm": 1.3642752079437255, "learning_rate": 3.589988167978842e-08, - "loss": 0.1812, + "loss": 0.1818, "step": 7031 }, { "epoch": 0.9619699042407661, - "grad_norm": 1.5104743489798855, + "grad_norm": 1.4921219195092543, "learning_rate": 3.5643301698736196e-08, - "loss": 0.1478, + "loss": 0.1467, "step": 7032 }, { "epoch": 0.9621067031463748, - "grad_norm": 1.2430294123308443, + "grad_norm": 1.2543084624063712, "learning_rate": 3.538763863033201e-08, - "loss": 0.1838, + "loss": 0.1846, "step": 7033 }, { "epoch": 0.9622435020519836, - "grad_norm": 1.3587721277475346, + "grad_norm": 1.331185123686153, "learning_rate": 3.513289252179697e-08, - "loss": 0.196, + "loss": 0.1969, "step": 7034 }, { "epoch": 0.9623803009575923, - "grad_norm": 1.510579987037674, + "grad_norm": 1.5020704131134373, "learning_rate": 3.487906342018232e-08, - "loss": 0.1973, + "loss": 0.1999, "step": 7035 }, { "epoch": 0.9625170998632011, - "grad_norm": 1.2756442455285322, + "grad_norm": 1.267108337415835, "learning_rate": 3.462615137237002e-08, - "loss": 0.1681, + "loss": 0.1667, "step": 7036 }, { "epoch": 0.9626538987688098, - "grad_norm": 1.1639596708610676, + "grad_norm": 1.1670690174818374, "learning_rate": 3.4374156425073245e-08, - "loss": 0.1775, + "loss": 0.1774, "step": 7037 }, { "epoch": 0.9627906976744186, - "grad_norm": 1.2465776386385854, + "grad_norm": 1.216979235620091, "learning_rate": 3.4123078624834214e-08, "loss": 0.1497, "step": 7038 }, { "epoch": 0.9629274965800274, - "grad_norm": 1.3241216883764089, + "grad_norm": 1.314957253018455, "learning_rate": 3.387291801802805e-08, - "loss": 0.2072, + "loss": 0.2088, "step": 7039 }, { "epoch": 0.9630642954856361, - "grad_norm": 1.2009372770097275, + "grad_norm": 1.1752572672279948, "learning_rate": 3.3623674650857806e-08, - "loss": 0.19, + "loss": 0.1882, "step": 7040 }, { "epoch": 0.9632010943912449, - "grad_norm": 1.1571420229231055, + "grad_norm": 1.1426332800013264, "learning_rate": 3.337534856935998e-08, - "loss": 0.1619, + "loss": 0.162, "step": 7041 }, { "epoch": 0.9633378932968536, - "grad_norm": 1.2727895288036695, + "grad_norm": 1.2443232746255475, "learning_rate": 3.3127939819399544e-08, - "loss": 0.1781, + "loss": 0.1769, "step": 7042 }, { "epoch": 0.9634746922024624, - "grad_norm": 1.381097100026678, + "grad_norm": 1.3570360283034657, "learning_rate": 3.288144844667218e-08, - "loss": 0.1596, + "loss": 0.1595, "step": 7043 }, { "epoch": 0.9636114911080711, - "grad_norm": 1.1965700464760234, + "grad_norm": 1.1709037942845688, "learning_rate": 3.2635874496705356e-08, - "loss": 0.1499, + "loss": 0.1498, "step": 7044 }, { "epoch": 0.9637482900136799, - "grad_norm": 1.2188632961485508, + "grad_norm": 1.2219322391284373, "learning_rate": 3.2391218014856116e-08, - "loss": 0.2035, + "loss": 0.2038, "step": 7045 }, { "epoch": 0.9638850889192886, - "grad_norm": 1.2259581761302103, + "grad_norm": 1.220930514604199, "learning_rate": 3.214747904631277e-08, - "loss": 0.1702, + "loss": 0.1703, "step": 7046 }, { "epoch": 0.9640218878248974, - "grad_norm": 1.1453063378659298, + "grad_norm": 1.1364267906930499, "learning_rate": 3.1904657636092627e-08, - "loss": 0.1511, + "loss": 0.1509, "step": 7047 }, { "epoch": 0.9641586867305062, - "grad_norm": 1.0308376886670694, + "grad_norm": 1.016627909502515, "learning_rate": 3.1662753829045376e-08, - "loss": 0.1524, + "loss": 0.1534, "step": 7048 }, { "epoch": 0.9642954856361149, - "grad_norm": 1.1908881306851917, + "grad_norm": 1.1787969887359084, "learning_rate": 3.142176766985083e-08, - "loss": 0.1891, + "loss": 0.1887, "step": 7049 }, { "epoch": 0.9644322845417237, - "grad_norm": 1.2628034172130203, + "grad_norm": 1.258869574529316, "learning_rate": 3.1181699203018386e-08, - "loss": 0.182, + "loss": 0.1827, "step": 7050 }, { "epoch": 0.9645690834473324, - "grad_norm": 1.3457980114272274, + "grad_norm": 1.2938645427024869, "learning_rate": 3.0942548472888134e-08, - "loss": 0.2187, + "loss": 0.218, "step": 7051 }, { "epoch": 0.9647058823529412, - "grad_norm": 1.2374136851835886, + "grad_norm": 1.2381449032126888, "learning_rate": 3.0704315523631956e-08, - "loss": 0.1678, + "loss": 0.1688, "step": 7052 }, { "epoch": 0.9648426812585499, - "grad_norm": 1.593224848820469, + "grad_norm": 1.5735158606112698, "learning_rate": 3.0467000399250215e-08, - "loss": 0.2181, + "loss": 0.2188, "step": 7053 }, { "epoch": 0.9649794801641587, - "grad_norm": 1.2644034858492692, + "grad_norm": 1.2425343984773152, "learning_rate": 3.023060314357562e-08, - "loss": 0.1666, + "loss": 0.1667, "step": 7054 }, { "epoch": 0.9651162790697675, - "grad_norm": 1.0367330144436377, + "grad_norm": 1.025754859296156, "learning_rate": 2.9995123800270476e-08, - "loss": 0.156, + "loss": 0.1553, "step": 7055 }, { "epoch": 0.9652530779753762, - "grad_norm": 1.1449783828268998, + "grad_norm": 1.1303928179269738, "learning_rate": 2.976056241282721e-08, - "loss": 0.1754, + "loss": 0.1755, "step": 7056 }, { "epoch": 0.965389876880985, - "grad_norm": 1.178911791041809, + "grad_norm": 1.1514565336056972, "learning_rate": 2.9526919024569502e-08, - "loss": 0.1598, + "loss": 0.1582, "step": 7057 }, { "epoch": 0.9655266757865937, - "grad_norm": 1.121609279336894, + "grad_norm": 1.1088292137897287, "learning_rate": 2.929419367865116e-08, - "loss": 0.1572, + "loss": 0.1562, "step": 7058 }, { "epoch": 0.9656634746922025, - "grad_norm": 1.025951178179245, + "grad_norm": 1.031710173530842, "learning_rate": 2.906238641805614e-08, - "loss": 0.1435, + "loss": 0.1445, "step": 7059 }, { "epoch": 0.9658002735978112, - "grad_norm": 1.479137059427167, + "grad_norm": 1.4880594316464668, "learning_rate": 2.8831497285599087e-08, - "loss": 0.2023, + "loss": 0.204, "step": 7060 }, { "epoch": 0.96593707250342, - "grad_norm": 1.6732777556027376, + "grad_norm": 1.6334693031806073, "learning_rate": 2.8601526323924766e-08, - "loss": 0.2607, + "loss": 0.258, "step": 7061 }, { "epoch": 0.9660738714090287, - "grad_norm": 1.053595555621696, + "grad_norm": 1.0643823869698974, "learning_rate": 2.8372473575509207e-08, - "loss": 0.1578, + "loss": 0.1583, "step": 7062 }, { "epoch": 0.9662106703146375, - "grad_norm": 1.2792636916543343, + "grad_norm": 1.2914727793177407, "learning_rate": 2.8144339082657458e-08, - "loss": 0.1813, + "loss": 0.1833, "step": 7063 }, { "epoch": 0.9663474692202463, - "grad_norm": 1.3082625623507445, + "grad_norm": 1.3018965563579072, "learning_rate": 2.7917122887506366e-08, - "loss": 0.1771, + "loss": 0.1759, "step": 7064 }, { "epoch": 0.966484268125855, - "grad_norm": 1.1187113693462105, + "grad_norm": 1.1078103652087832, "learning_rate": 2.7690825032022918e-08, "loss": 0.1671, "step": 7065 }, { "epoch": 0.9666210670314638, - "grad_norm": 1.1614500081516663, + "grad_norm": 1.15675112480492, "learning_rate": 2.7465445558003678e-08, - "loss": 0.1711, + "loss": 0.1719, "step": 7066 }, { "epoch": 0.9667578659370725, - "grad_norm": 1.0536482327077976, + "grad_norm": 1.0423889440681273, "learning_rate": 2.7240984507075907e-08, - "loss": 0.1651, + "loss": 0.1655, "step": 7067 }, { "epoch": 0.9668946648426813, - "grad_norm": 1.3093181151678046, + "grad_norm": 1.3016438265757813, "learning_rate": 2.701744192069755e-08, - "loss": 0.1921, + "loss": 0.1936, "step": 7068 }, { "epoch": 0.96703146374829, - "grad_norm": 1.3195606219385376, + "grad_norm": 1.3164212049913697, "learning_rate": 2.6794817840156695e-08, - "loss": 0.21, + "loss": 0.212, "step": 7069 }, { "epoch": 0.9671682626538988, - "grad_norm": 1.037211940959381, + "grad_norm": 1.0286888870245718, "learning_rate": 2.6573112306572112e-08, - "loss": 0.1574, + "loss": 0.1571, "step": 7070 }, { "epoch": 0.9673050615595076, - "grad_norm": 1.4587316811756506, + "grad_norm": 1.4414628608502802, "learning_rate": 2.6352325360892162e-08, - "loss": 0.2029, + "loss": 0.2033, "step": 7071 }, { "epoch": 0.9674418604651163, - "grad_norm": 1.220419825265644, + "grad_norm": 1.21538839188533, "learning_rate": 2.6132457043896442e-08, - "loss": 0.1664, + "loss": 0.166, "step": 7072 }, { "epoch": 0.9675786593707251, - "grad_norm": 1.3117810767473126, + "grad_norm": 1.3165985851984103, "learning_rate": 2.5913507396194137e-08, - "loss": 0.1667, + "loss": 0.167, "step": 7073 }, { "epoch": 0.9677154582763338, - "grad_norm": 1.2820611496733068, + "grad_norm": 1.282849905552777, "learning_rate": 2.569547645822512e-08, - "loss": 0.1671, + "loss": 0.1682, "step": 7074 }, { "epoch": 0.9678522571819426, - "grad_norm": 1.4771167843626567, + "grad_norm": 1.4050173136000874, "learning_rate": 2.547836427025996e-08, - "loss": 0.1733, + "loss": 0.1693, "step": 7075 }, { "epoch": 0.9679890560875513, - "grad_norm": 1.5952623134782653, + "grad_norm": 1.5761845410093245, "learning_rate": 2.5262170872398794e-08, - "loss": 0.212, + "loss": 0.2133, "step": 7076 }, { "epoch": 0.9681258549931601, - "grad_norm": 1.2162482904788545, + "grad_norm": 1.2026970981699414, "learning_rate": 2.504689630457191e-08, - "loss": 0.1572, + "loss": 0.1561, "step": 7077 }, { "epoch": 0.9682626538987688, - "grad_norm": 1.1592598286397244, + "grad_norm": 1.1377614953569604, "learning_rate": 2.4832540606540835e-08, - "loss": 0.2083, + "loss": 0.2063, "step": 7078 }, { "epoch": 0.9683994528043776, - "grad_norm": 1.3355100854455064, + "grad_norm": 1.3338439621758478, "learning_rate": 2.461910381789667e-08, - "loss": 0.1989, + "loss": 0.1992, "step": 7079 }, { "epoch": 0.9685362517099864, - "grad_norm": 1.356945541717656, + "grad_norm": 1.4152357684350105, "learning_rate": 2.4406585978061782e-08, - "loss": 0.1696, + "loss": 0.1693, "step": 7080 }, { "epoch": 0.968673050615595, - "grad_norm": 1.112167670108161, + "grad_norm": 1.0986296984887034, "learning_rate": 2.4194987126286983e-08, - "loss": 0.1731, + "loss": 0.1728, "step": 7081 }, { "epoch": 0.9688098495212039, - "grad_norm": 0.9149801166321083, + "grad_norm": 0.9229180219304013, "learning_rate": 2.398430730165491e-08, - "loss": 0.1226, + "loss": 0.1253, "step": 7082 }, { "epoch": 0.9689466484268126, - "grad_norm": 1.3984768970965806, + "grad_norm": 1.3888789970458735, "learning_rate": 2.377454654307776e-08, - "loss": 0.1761, + "loss": 0.175, "step": 7083 }, { "epoch": 0.9690834473324214, - "grad_norm": 1.2764942663171677, + "grad_norm": 1.263244409188764, "learning_rate": 2.3565704889298434e-08, - "loss": 0.1631, + "loss": 0.1634, "step": 7084 }, { "epoch": 0.96922024623803, - "grad_norm": 1.1930083714543676, + "grad_norm": 1.2676074709796443, "learning_rate": 2.3357782378889414e-08, - "loss": 0.1515, + "loss": 0.154, "step": 7085 }, { "epoch": 0.9693570451436389, - "grad_norm": 1.455157696911366, + "grad_norm": 1.4285126086315187, "learning_rate": 2.3150779050254424e-08, - "loss": 0.2165, + "loss": 0.2156, "step": 7086 }, { "epoch": 0.9694938440492477, - "grad_norm": 0.9897381899863402, + "grad_norm": 0.990726039604944, "learning_rate": 2.294469494162621e-08, - "loss": 0.1384, + "loss": 0.1395, "step": 7087 }, { "epoch": 0.9696306429548563, - "grad_norm": 1.3294185919533144, + "grad_norm": 1.3142495343413372, "learning_rate": 2.273953009106933e-08, - "loss": 0.2097, + "loss": 0.2083, "step": 7088 }, { "epoch": 0.9697674418604652, - "grad_norm": 1.3018457194722601, + "grad_norm": 1.2931423173979362, "learning_rate": 2.2535284536476242e-08, - "loss": 0.1636, + "loss": 0.1628, "step": 7089 }, { "epoch": 0.9699042407660738, - "grad_norm": 1.4964332091453112, + "grad_norm": 1.4830514061383047, "learning_rate": 2.2331958315571777e-08, - "loss": 0.1873, + "loss": 0.1893, "step": 7090 }, { "epoch": 0.9700410396716826, - "grad_norm": 1.118903888704933, + "grad_norm": 1.1146963704619477, "learning_rate": 2.2129551465909228e-08, - "loss": 0.1446, + "loss": 0.1449, "step": 7091 }, { "epoch": 0.9701778385772913, - "grad_norm": 1.3658204724653853, + "grad_norm": 1.3633243125313876, "learning_rate": 2.1928064024874797e-08, - "loss": 0.2369, + "loss": 0.2348, "step": 7092 }, { "epoch": 0.9703146374829001, - "grad_norm": 1.3094586847380718, + "grad_norm": 1.2731415971671787, "learning_rate": 2.172749602968094e-08, - "loss": 0.1653, + "loss": 0.1651, "step": 7093 }, { "epoch": 0.9704514363885088, - "grad_norm": 1.1167437903717579, + "grad_norm": 1.1170694236180925, "learning_rate": 2.1527847517373577e-08, - "loss": 0.1385, + "loss": 0.1393, "step": 7094 }, { "epoch": 0.9705882352941176, - "grad_norm": 1.2755018641835647, + "grad_norm": 1.2749762381869039, "learning_rate": 2.1329118524827662e-08, - "loss": 0.2046, + "loss": 0.2045, "step": 7095 }, { "epoch": 0.9707250341997264, - "grad_norm": 0.9040007542580593, + "grad_norm": 0.8934194136008463, "learning_rate": 2.113130908874772e-08, - "loss": 0.1514, + "loss": 0.1508, "step": 7096 }, { "epoch": 0.9708618331053351, - "grad_norm": 1.2303619652892621, + "grad_norm": 1.219162409230932, "learning_rate": 2.0934419245668968e-08, - "loss": 0.1725, + "loss": 0.1727, "step": 7097 }, { "epoch": 0.9709986320109439, - "grad_norm": 1.212832116420827, + "grad_norm": 1.1730801500541028, "learning_rate": 2.0738449031957876e-08, - "loss": 0.1808, + "loss": 0.18, "step": 7098 }, { "epoch": 0.9711354309165526, - "grad_norm": 1.3430947396328248, + "grad_norm": 1.3181674879354597, "learning_rate": 2.054339848380882e-08, - "loss": 0.1934, + "loss": 0.1903, "step": 7099 }, { "epoch": 0.9712722298221614, - "grad_norm": 1.342373543570756, + "grad_norm": 1.336428281475804, "learning_rate": 2.034926763724798e-08, - "loss": 0.1725, + "loss": 0.1727, "step": 7100 }, { "epoch": 0.9712722298221614, - "eval_loss": 0.1715807169675827, - "eval_runtime": 5.9224, - "eval_samples_per_second": 5.065, - "eval_steps_per_second": 1.351, + "eval_loss": 0.1718672811985016, + "eval_runtime": 5.9242, + "eval_samples_per_second": 5.064, + "eval_steps_per_second": 1.35, "step": 7100 }, { "epoch": 0.9714090287277701, - "grad_norm": 1.3988701598282238, + "grad_norm": 1.3901393615840776, "learning_rate": 2.015605652813113e-08, - "loss": 0.1631, + "loss": 0.1634, "step": 7101 }, { "epoch": 0.9715458276333789, - "grad_norm": 1.152636753343916, + "grad_norm": 1.1521446625440785, "learning_rate": 1.9963765192144157e-08, - "loss": 0.1482, + "loss": 0.1483, "step": 7102 }, { "epoch": 0.9716826265389877, - "grad_norm": 1.2696056361245218, + "grad_norm": 1.2455039746781311, "learning_rate": 1.9772393664802546e-08, - "loss": 0.1688, + "loss": 0.1691, "step": 7103 }, { "epoch": 0.9718194254445964, - "grad_norm": 1.43969944742197, + "grad_norm": 1.4224949935043711, "learning_rate": 1.958194198145358e-08, - "loss": 0.2039, + "loss": 0.2042, "step": 7104 }, { "epoch": 0.9719562243502052, - "grad_norm": 1.414796997072116, + "grad_norm": 1.4030452566652762, "learning_rate": 1.9392410177273002e-08, - "loss": 0.1933, + "loss": 0.1944, "step": 7105 }, { "epoch": 0.9720930232558139, - "grad_norm": 1.0562245825679077, + "grad_norm": 1.0366236915551204, "learning_rate": 1.920379828726726e-08, - "loss": 0.158, + "loss": 0.1582, "step": 7106 }, { "epoch": 0.9722298221614227, - "grad_norm": 1.4669491550406009, + "grad_norm": 1.4513485823635437, "learning_rate": 1.901610634627238e-08, - "loss": 0.1969, + "loss": 0.1952, "step": 7107 }, { "epoch": 0.9723666210670314, - "grad_norm": 1.174219724331159, + "grad_norm": 1.155055297978947, "learning_rate": 1.8829334388955068e-08, - "loss": 0.1958, + "loss": 0.1959, "step": 7108 }, { "epoch": 0.9725034199726402, - "grad_norm": 1.2913057544409425, + "grad_norm": 1.271075307385731, "learning_rate": 1.8643482449812176e-08, - "loss": 0.1617, + "loss": 0.1614, "step": 7109 }, { "epoch": 0.9726402188782489, - "grad_norm": 1.212046104955589, + "grad_norm": 1.2078330214689967, "learning_rate": 1.8458550563170696e-08, - "loss": 0.1631, + "loss": 0.1623, "step": 7110 }, { "epoch": 0.9727770177838577, - "grad_norm": 1.29421449885534, + "grad_norm": 1.277537034046066, "learning_rate": 1.827453876318719e-08, - "loss": 0.1813, + "loss": 0.1817, "step": 7111 }, { "epoch": 0.9729138166894665, - "grad_norm": 1.1165195644464005, + "grad_norm": 1.110189652848596, "learning_rate": 1.80914470838478e-08, - "loss": 0.1591, + "loss": 0.1588, "step": 7112 }, { "epoch": 0.9730506155950752, - "grad_norm": 1.296767299175839, + "grad_norm": 1.279749186974372, "learning_rate": 1.790927555896993e-08, - "loss": 0.181, + "loss": 0.1814, "step": 7113 }, { "epoch": 0.973187414500684, - "grad_norm": 1.2886879790088344, + "grad_norm": 1.302359772111818, "learning_rate": 1.77280242222011e-08, - "loss": 0.1962, + "loss": 0.1942, "step": 7114 }, { "epoch": 0.9733242134062927, - "grad_norm": 1.2241472400466569, + "grad_norm": 1.2273595509257456, "learning_rate": 1.7547693107017316e-08, - "loss": 0.138, + "loss": 0.1378, "step": 7115 }, { "epoch": 0.9734610123119015, - "grad_norm": 1.5535446699263977, + "grad_norm": 1.539088273059537, "learning_rate": 1.7368282246726375e-08, - "loss": 0.1838, + "loss": 0.1832, "step": 7116 }, { "epoch": 0.9735978112175102, - "grad_norm": 1.2499512388557048, + "grad_norm": 1.2290369368599032, "learning_rate": 1.7189791674464553e-08, - "loss": 0.1823, + "loss": 0.1807, "step": 7117 }, { "epoch": 0.973734610123119, - "grad_norm": 1.1433732517121002, + "grad_norm": 1.1440964126796769, "learning_rate": 1.701222142319936e-08, - "loss": 0.1637, + "loss": 0.1636, "step": 7118 }, { "epoch": 0.9738714090287278, - "grad_norm": 1.3079601730001356, + "grad_norm": 1.3095268430736353, "learning_rate": 1.68355715257279e-08, - "loss": 0.2032, + "loss": 0.2036, "step": 7119 }, { "epoch": 0.9740082079343365, - "grad_norm": 1.3103650200647752, + "grad_norm": 1.3089885540487982, "learning_rate": 1.6659842014677408e-08, - "loss": 0.1483, + "loss": 0.1467, "step": 7120 }, { "epoch": 0.9741450068399453, - "grad_norm": 1.2885098426860264, + "grad_norm": 1.2763004833272127, "learning_rate": 1.6485032922504697e-08, - "loss": 0.1995, + "loss": 0.2003, "step": 7121 }, { "epoch": 0.974281805745554, - "grad_norm": 1.3355062278661354, + "grad_norm": 1.3321620287849887, "learning_rate": 1.6311144281496715e-08, - "loss": 0.2055, + "loss": 0.204, "step": 7122 }, { "epoch": 0.9744186046511628, - "grad_norm": 1.2853165228481287, + "grad_norm": 1.2999182782325576, "learning_rate": 1.6138176123770554e-08, - "loss": 0.1883, + "loss": 0.1876, "step": 7123 }, { "epoch": 0.9745554035567715, - "grad_norm": 1.3052954881115606, + "grad_norm": 1.3154019871888347, "learning_rate": 1.596612848127399e-08, - "loss": 0.1705, + "loss": 0.1715, "step": 7124 }, { "epoch": 0.9746922024623803, - "grad_norm": 1.1521612406944977, + "grad_norm": 1.1364710330297747, "learning_rate": 1.579500138578327e-08, - "loss": 0.1657, + "loss": 0.1651, "step": 7125 }, { "epoch": 0.974829001367989, - "grad_norm": 1.3016233345068566, + "grad_norm": 1.282877734427461, "learning_rate": 1.562479486890589e-08, - "loss": 0.2027, + "loss": 0.2042, "step": 7126 }, { "epoch": 0.9749658002735978, - "grad_norm": 1.3951193062588272, + "grad_norm": 1.3901609656912786, "learning_rate": 1.5455508962078926e-08, - "loss": 0.1695, + "loss": 0.1684, "step": 7127 }, { "epoch": 0.9751025991792066, - "grad_norm": 1.162615849033044, + "grad_norm": 1.1617217120636825, "learning_rate": 1.5287143696568475e-08, - "loss": 0.1589, + "loss": 0.1592, "step": 7128 }, { "epoch": 0.9752393980848153, - "grad_norm": 1.0856855978039348, + "grad_norm": 1.0781758443840186, "learning_rate": 1.5119699103472995e-08, "loss": 0.1494, "step": 7129 }, { "epoch": 0.9753761969904241, - "grad_norm": 1.2449463935687015, + "grad_norm": 1.234145675408953, "learning_rate": 1.4953175213717753e-08, - "loss": 0.1818, + "loss": 0.1814, "step": 7130 }, { "epoch": 0.9755129958960328, - "grad_norm": 1.5674332914915923, + "grad_norm": 1.8408234099335803, "learning_rate": 1.4787572058060918e-08, - "loss": 0.2061, + "loss": 0.2118, "step": 7131 }, { "epoch": 0.9756497948016416, - "grad_norm": 1.1719240742845007, + "grad_norm": 1.169352447557281, "learning_rate": 1.462288966708858e-08, - "loss": 0.1749, + "loss": 0.1737, "step": 7132 }, { "epoch": 0.9757865937072503, - "grad_norm": 1.4379989146252576, + "grad_norm": 1.4479189721839727, "learning_rate": 1.4459128071216965e-08, - "loss": 0.2131, + "loss": 0.2147, "step": 7133 }, { "epoch": 0.9759233926128591, - "grad_norm": 1.1513971457853243, + "grad_norm": 1.1507316088011301, "learning_rate": 1.42962873006941e-08, - "loss": 0.1698, + "loss": 0.1701, "step": 7134 }, { "epoch": 0.9760601915184679, - "grad_norm": 1.2344623411916287, + "grad_norm": 1.2277504697835095, "learning_rate": 1.4134367385594816e-08, - "loss": 0.157, + "loss": 0.1576, "step": 7135 }, { "epoch": 0.9761969904240766, - "grad_norm": 1.2426802106782857, + "grad_norm": 1.237796242210364, "learning_rate": 1.3973368355827411e-08, - "loss": 0.1789, + "loss": 0.1774, "step": 7136 }, { "epoch": 0.9763337893296854, - "grad_norm": 1.2557274314904867, + "grad_norm": 1.2561296890056617, "learning_rate": 1.3813290241126432e-08, "loss": 0.1994, "step": 7137 }, { "epoch": 0.9764705882352941, - "grad_norm": 1.2813268824893735, + "grad_norm": 1.2680787956752317, "learning_rate": 1.3654133071059894e-08, - "loss": 0.1998, + "loss": 0.1995, "step": 7138 }, { "epoch": 0.9766073871409029, - "grad_norm": 1.069334796719018, + "grad_norm": 1.052185303279875, "learning_rate": 1.3495896875022619e-08, - "loss": 0.1591, + "loss": 0.1576, "step": 7139 }, { "epoch": 0.9767441860465116, - "grad_norm": 1.1219748577150928, + "grad_norm": 1.1168570510998217, "learning_rate": 1.333858168224178e-08, - "loss": 0.1587, + "loss": 0.1581, "step": 7140 }, { "epoch": 0.9768809849521204, - "grad_norm": 1.155944169364042, + "grad_norm": 1.1592464036467416, "learning_rate": 1.3182187521772471e-08, - "loss": 0.1803, + "loss": 0.1793, "step": 7141 }, { "epoch": 0.9770177838577291, - "grad_norm": 1.312796096241015, + "grad_norm": 1.3452974775156419, "learning_rate": 1.302671442250103e-08, - "loss": 0.1969, + "loss": 0.1985, "step": 7142 }, { "epoch": 0.9771545827633379, - "grad_norm": 1.3025682413459152, + "grad_norm": 1.3042216003938296, "learning_rate": 1.2872162413143374e-08, - "loss": 0.1826, + "loss": 0.1838, "step": 7143 }, { "epoch": 0.9772913816689467, - "grad_norm": 1.2156871958093292, + "grad_norm": 1.2128446185153465, "learning_rate": 1.2718531522244448e-08, - "loss": 0.1529, + "loss": 0.1528, "step": 7144 }, { "epoch": 0.9774281805745554, - "grad_norm": 1.3699481726714353, + "grad_norm": 1.375104302219621, "learning_rate": 1.2565821778180998e-08, - "loss": 0.1861, + "loss": 0.1852, "step": 7145 }, { "epoch": 0.9775649794801642, - "grad_norm": 1.4264768368430847, + "grad_norm": 1.429512549462777, "learning_rate": 1.2414033209157128e-08, - "loss": 0.1727, + "loss": 0.176, "step": 7146 }, { "epoch": 0.9777017783857729, - "grad_norm": 1.0764990084222525, + "grad_norm": 1.0806974220470476, "learning_rate": 1.2263165843208192e-08, - "loss": 0.1565, + "loss": 0.1573, "step": 7147 }, { "epoch": 0.9778385772913817, - "grad_norm": 1.1437164638891661, + "grad_norm": 1.1329065073780815, "learning_rate": 1.211321970820023e-08, - "loss": 0.151, + "loss": 0.1512, "step": 7148 }, { "epoch": 0.9779753761969904, - "grad_norm": 1.0135969952737174, + "grad_norm": 0.9973251219051842, "learning_rate": 1.1964194831827203e-08, - "loss": 0.1675, + "loss": 0.1676, "step": 7149 }, { "epoch": 0.9781121751025992, - "grad_norm": 1.1940753020914736, + "grad_norm": 1.1758684165342526, "learning_rate": 1.1816091241614314e-08, - "loss": 0.1665, + "loss": 0.1655, "step": 7150 }, { "epoch": 0.978248974008208, - "grad_norm": 1.2271443204738912, + "grad_norm": 1.2084665215595178, "learning_rate": 1.1668908964916348e-08, - "loss": 0.184, + "loss": 0.1838, "step": 7151 }, { "epoch": 0.9783857729138167, - "grad_norm": 1.235750858225464, + "grad_norm": 1.2315457038680775, "learning_rate": 1.1522648028917116e-08, - "loss": 0.179, + "loss": 0.1819, "step": 7152 }, { "epoch": 0.9785225718194255, - "grad_norm": 1.364362856064292, + "grad_norm": 1.3405062470013704, "learning_rate": 1.1377308460631674e-08, - "loss": 0.1782, + "loss": 0.1808, "step": 7153 }, { "epoch": 0.9786593707250342, - "grad_norm": 1.303632646284448, + "grad_norm": 1.2934044183076125, "learning_rate": 1.1232890286903552e-08, - "loss": 0.1956, + "loss": 0.1955, "step": 7154 }, { "epoch": 0.978796169630643, - "grad_norm": 1.3643170593036649, + "grad_norm": 1.3972043020760436, "learning_rate": 1.1089393534406411e-08, - "loss": 0.1716, + "loss": 0.1741, "step": 7155 }, { "epoch": 0.9789329685362517, - "grad_norm": 1.4978079425863047, + "grad_norm": 1.4535212979527197, "learning_rate": 1.0946818229644607e-08, - "loss": 0.196, + "loss": 0.197, "step": 7156 }, { "epoch": 0.9790697674418605, - "grad_norm": 1.3021804488471893, + "grad_norm": 1.2867321745957947, "learning_rate": 1.0805164398952072e-08, - "loss": 0.1821, + "loss": 0.182, "step": 7157 }, { "epoch": 0.9792065663474692, - "grad_norm": 1.1805695527398126, + "grad_norm": 1.1621417370470781, "learning_rate": 1.0664432068491216e-08, - "loss": 0.1673, + "loss": 0.1664, "step": 7158 }, { "epoch": 0.979343365253078, - "grad_norm": 1.158869050530632, + "grad_norm": 1.1523403152251033, "learning_rate": 1.052462126425513e-08, - "loss": 0.1627, + "loss": 0.1634, "step": 7159 }, { "epoch": 0.9794801641586868, - "grad_norm": 1.177130850820393, + "grad_norm": 1.1697786680814688, "learning_rate": 1.0385732012067606e-08, - "loss": 0.1483, + "loss": 0.1489, "step": 7160 }, { "epoch": 0.9796169630642955, - "grad_norm": 1.1568291868149339, + "grad_norm": 1.1373223570242859, "learning_rate": 1.0247764337581457e-08, - "loss": 0.1653, + "loss": 0.1659, "step": 7161 }, { "epoch": 0.9797537619699043, - "grad_norm": 1.3773203766915634, + "grad_norm": 1.372900718675009, "learning_rate": 1.0110718266277964e-08, - "loss": 0.2074, + "loss": 0.2063, "step": 7162 }, { "epoch": 0.979890560875513, - "grad_norm": 1.4312367571166018, + "grad_norm": 1.4471216450411368, "learning_rate": 9.974593823470214e-09, - "loss": 0.2038, + "loss": 0.202, "step": 7163 }, { "epoch": 0.9800273597811218, - "grad_norm": 1.162805730584535, + "grad_norm": 1.144606802165317, "learning_rate": 9.839391034300316e-09, - "loss": 0.1625, + "loss": 0.1623, "step": 7164 }, { "epoch": 0.9801641586867305, - "grad_norm": 1.0893021514827892, + "grad_norm": 1.07669578149289, "learning_rate": 9.705109923739963e-09, - "loss": 0.1612, + "loss": 0.1592, "step": 7165 }, { "epoch": 0.9803009575923393, - "grad_norm": 1.3290262748494486, + "grad_norm": 1.3128932153436794, "learning_rate": 9.571750516590983e-09, - "loss": 0.1766, + "loss": 0.177, "step": 7166 }, { "epoch": 0.9804377564979481, - "grad_norm": 1.3796308945778832, + "grad_norm": 1.359664628314367, "learning_rate": 9.439312837484783e-09, - "loss": 0.1888, + "loss": 0.1886, "step": 7167 }, { "epoch": 0.9805745554035568, - "grad_norm": 1.1483109493474082, + "grad_norm": 1.1238728943107603, "learning_rate": 9.307796910881794e-09, - "loss": 0.1858, + "loss": 0.1851, "step": 7168 }, { "epoch": 0.9807113543091656, - "grad_norm": 1.3622883304879774, + "grad_norm": 1.3519985260226304, "learning_rate": 9.1772027610737e-09, - "loss": 0.1785, + "loss": 0.1788, "step": 7169 }, { "epoch": 0.9808481532147743, - "grad_norm": 1.2327499009213618, + "grad_norm": 1.2158257315402305, "learning_rate": 9.04753041218065e-09, - "loss": 0.171, + "loss": 0.1706, "step": 7170 }, { "epoch": 0.9809849521203831, - "grad_norm": 1.3151679672468914, + "grad_norm": 1.311496950431564, "learning_rate": 8.918779888153485e-09, - "loss": 0.2091, + "loss": 0.2109, "step": 7171 }, { "epoch": 0.9811217510259918, - "grad_norm": 1.09450055000645, + "grad_norm": 1.0975213186013595, "learning_rate": 8.790951212771515e-09, - "loss": 0.1725, + "loss": 0.1716, "step": 7172 }, { "epoch": 0.9812585499316006, - "grad_norm": 1.2067776555114111, + "grad_norm": 1.199443746411415, "learning_rate": 8.664044409645856e-09, - "loss": 0.1623, + "loss": 0.1647, "step": 7173 }, { "epoch": 0.9813953488372092, - "grad_norm": 1.3826389894058857, + "grad_norm": 1.3606104013582179, "learning_rate": 8.538059502214979e-09, - "loss": 0.233, + "loss": 0.2322, "step": 7174 }, { "epoch": 0.981532147742818, - "grad_norm": 1.1874166496695187, + "grad_norm": 1.1818971408194043, "learning_rate": 8.412996513748607e-09, - "loss": 0.1785, + "loss": 0.1795, "step": 7175 }, { "epoch": 0.9816689466484269, - "grad_norm": 1.3910449512342034, + "grad_norm": 1.388976337118894, "learning_rate": 8.28885546734548e-09, - "loss": 0.1804, + "loss": 0.182, "step": 7176 }, { "epoch": 0.9818057455540355, - "grad_norm": 1.3843726932046025, + "grad_norm": 1.3718554266448788, "learning_rate": 8.165636385935038e-09, - "loss": 0.1837, + "loss": 0.1838, "step": 7177 }, { "epoch": 0.9819425444596444, - "grad_norm": 1.1463450890886833, + "grad_norm": 1.1593221051733986, "learning_rate": 8.043339292275187e-09, - "loss": 0.1483, + "loss": 0.1476, "step": 7178 }, { "epoch": 0.982079343365253, - "grad_norm": 1.4181719686479741, + "grad_norm": 1.3793317465150983, "learning_rate": 7.921964208954525e-09, - "loss": 0.1956, + "loss": 0.1934, "step": 7179 }, { "epoch": 0.9822161422708618, - "grad_norm": 1.2404991295855212, + "grad_norm": 1.2252197891704248, "learning_rate": 7.801511158390118e-09, - "loss": 0.1697, + "loss": 0.1688, "step": 7180 }, { "epoch": 0.9823529411764705, - "grad_norm": 1.2601077122517297, + "grad_norm": 1.2491832138887107, "learning_rate": 7.681980162830283e-09, - "loss": 0.18, + "loss": 0.1811, "step": 7181 }, { "epoch": 0.9824897400820793, - "grad_norm": 1.2324218518653214, + "grad_norm": 1.1812825825300204, "learning_rate": 7.563371244351802e-09, - "loss": 0.1717, + "loss": 0.17, "step": 7182 }, { "epoch": 0.9826265389876881, - "grad_norm": 1.3567952412418172, + "grad_norm": 1.3361305519351327, "learning_rate": 7.445684424862154e-09, - "loss": 0.1969, + "loss": 0.1948, "step": 7183 }, { "epoch": 0.9827633378932968, - "grad_norm": 0.9623772181974392, + "grad_norm": 0.9446576346030899, "learning_rate": 7.3289197260978385e-09, - "loss": 0.1608, + "loss": 0.1599, "step": 7184 }, { "epoch": 0.9829001367989056, - "grad_norm": 1.2598718967846705, + "grad_norm": 1.26503183130939, "learning_rate": 7.213077169625493e-09, - "loss": 0.1667, + "loss": 0.1689, "step": 7185 }, { "epoch": 0.9830369357045143, - "grad_norm": 1.0809746895932884, + "grad_norm": 1.06524547313071, "learning_rate": 7.098156776840226e-09, - "loss": 0.1605, + "loss": 0.1586, "step": 7186 }, { "epoch": 0.9831737346101231, - "grad_norm": 1.0938970071494618, + "grad_norm": 1.0884737545782708, "learning_rate": 6.984158568968391e-09, - "loss": 0.1779, + "loss": 0.178, "step": 7187 }, { "epoch": 0.9833105335157318, - "grad_norm": 1.2168309406950923, + "grad_norm": 1.222168368305401, "learning_rate": 6.871082567065368e-09, - "loss": 0.1584, + "loss": 0.1595, "step": 7188 }, { "epoch": 0.9834473324213406, - "grad_norm": 1.3659718921489772, + "grad_norm": 1.375615780428875, "learning_rate": 6.758928792016117e-09, - "loss": 0.1791, + "loss": 0.1783, "step": 7189 }, { "epoch": 0.9835841313269493, - "grad_norm": 1.1472416783841157, + "grad_norm": 1.138548590981387, "learning_rate": 6.64769726453518e-09, - "loss": 0.1412, + "loss": 0.1417, "step": 7190 }, { "epoch": 0.9837209302325581, - "grad_norm": 1.2992551021315684, + "grad_norm": 1.2807497029398551, "learning_rate": 6.537388005167233e-09, - "loss": 0.1798, + "loss": 0.1807, "step": 7191 }, { "epoch": 0.9838577291381669, - "grad_norm": 1.4082940630087195, + "grad_norm": 1.4003812901424657, "learning_rate": 6.42800103428598e-09, - "loss": 0.1887, + "loss": 0.1877, "step": 7192 }, { "epoch": 0.9839945280437756, - "grad_norm": 1.2663534534092524, + "grad_norm": 1.2426770318075935, "learning_rate": 6.319536372095259e-09, - "loss": 0.1884, + "loss": 0.1896, "step": 7193 }, { "epoch": 0.9841313269493844, - "grad_norm": 1.29223637966859, + "grad_norm": 1.2935572516225637, "learning_rate": 6.211994038629044e-09, - "loss": 0.161, + "loss": 0.162, "step": 7194 }, { "epoch": 0.9842681258549931, - "grad_norm": 1.355049443922339, + "grad_norm": 1.3435911426135698, "learning_rate": 6.105374053749224e-09, - "loss": 0.1831, + "loss": 0.1832, "step": 7195 }, { "epoch": 0.9844049247606019, - "grad_norm": 1.151726568688763, + "grad_norm": 1.1501217992656267, "learning_rate": 5.999676437148938e-09, - "loss": 0.1625, + "loss": 0.1624, "step": 7196 }, { "epoch": 0.9845417236662106, - "grad_norm": 1.4049407980791109, + "grad_norm": 1.4080225186337199, "learning_rate": 5.894901208350345e-09, - "loss": 0.1915, + "loss": 0.1911, "step": 7197 }, { "epoch": 0.9846785225718194, - "grad_norm": 1.023070566003797, + "grad_norm": 1.0036542721086188, "learning_rate": 5.791048386705189e-09, - "loss": 0.1803, + "loss": 0.1807, "step": 7198 }, { "epoch": 0.9848153214774282, - "grad_norm": 1.2798091738477206, + "grad_norm": 1.2680625021187102, "learning_rate": 5.688117991395903e-09, - "loss": 0.1681, + "loss": 0.1692, "step": 7199 }, { "epoch": 0.9849521203830369, - "grad_norm": 1.617772130053793, + "grad_norm": 1.9260300820372442, "learning_rate": 5.58611004143228e-09, - "loss": 0.174, + "loss": 0.1811, "step": 7200 }, { "epoch": 0.9849521203830369, - "eval_loss": 0.17149388790130615, - "eval_runtime": 5.9124, - "eval_samples_per_second": 5.074, + "eval_loss": 0.1717732846736908, + "eval_runtime": 5.9131, + "eval_samples_per_second": 5.073, "eval_steps_per_second": 1.353, "step": 7200 }, { "epoch": 0.9850889192886457, - "grad_norm": 1.2070335074171783, + "grad_norm": 1.2022542579884106, "learning_rate": 5.485024555655916e-09, - "loss": 0.1735, + "loss": 0.1754, "step": 7201 }, { "epoch": 0.9852257181942544, - "grad_norm": 1.1241009213959123, + "grad_norm": 1.1308499063539337, "learning_rate": 5.384861552737431e-09, - "loss": 0.1408, + "loss": 0.141, "step": 7202 }, { "epoch": 0.9853625170998632, - "grad_norm": 1.3553379938567263, + "grad_norm": 1.3600311809435448, "learning_rate": 5.28562105117647e-09, - "loss": 0.1858, + "loss": 0.1852, "step": 7203 }, { "epoch": 0.9854993160054719, - "grad_norm": 1.2561394666028016, + "grad_norm": 1.2389856926501133, "learning_rate": 5.187303069302818e-09, - "loss": 0.1742, + "loss": 0.1743, "step": 7204 }, { "epoch": 0.9856361149110807, - "grad_norm": 1.225628051555659, + "grad_norm": 1.2098192777828825, "learning_rate": 5.089907625275281e-09, - "loss": 0.158, + "loss": 0.1563, "step": 7205 }, { "epoch": 0.9857729138166894, - "grad_norm": 1.2563902893346826, + "grad_norm": 1.2479665886291238, "learning_rate": 4.993434737083358e-09, - "loss": 0.1672, + "loss": 0.168, "step": 7206 }, { "epoch": 0.9859097127222982, - "grad_norm": 1.390588083384336, + "grad_norm": 1.3848123432994093, "learning_rate": 4.897884422545018e-09, - "loss": 0.1887, + "loss": 0.1896, "step": 7207 }, { "epoch": 0.986046511627907, - "grad_norm": 0.8943106412389135, + "grad_norm": 0.9011316631356945, "learning_rate": 4.803256699308923e-09, - "loss": 0.1139, + "loss": 0.1146, "step": 7208 }, { "epoch": 0.9861833105335157, - "grad_norm": 1.3600203704285267, + "grad_norm": 1.3361771493162125, "learning_rate": 4.709551584851646e-09, - "loss": 0.1863, + "loss": 0.1861, "step": 7209 }, { "epoch": 0.9863201094391245, - "grad_norm": 1.2507163925647877, + "grad_norm": 1.2585301753675522, "learning_rate": 4.616769096481566e-09, - "loss": 0.1627, + "loss": 0.1637, "step": 7210 }, { "epoch": 0.9864569083447332, - "grad_norm": 1.240671452548637, + "grad_norm": 1.22651728583316, "learning_rate": 4.5249092513355294e-09, - "loss": 0.1483, + "loss": 0.147, "step": 7211 }, { "epoch": 0.986593707250342, - "grad_norm": 1.5280834352394457, + "grad_norm": 1.521268653278434, "learning_rate": 4.433972066378856e-09, - "loss": 0.22, + "loss": 0.2198, "step": 7212 }, { "epoch": 0.9867305061559507, - "grad_norm": 1.277337444997391, + "grad_norm": 1.2583408533973817, "learning_rate": 4.343957558408663e-09, - "loss": 0.1715, + "loss": 0.1709, "step": 7213 }, { "epoch": 0.9868673050615595, - "grad_norm": 1.4051815176603462, + "grad_norm": 1.4049144917774803, "learning_rate": 4.254865744049985e-09, - "loss": 0.2058, + "loss": 0.2071, "step": 7214 }, { "epoch": 0.9870041039671683, - "grad_norm": 1.3718311439175546, + "grad_norm": 1.3515086549527056, "learning_rate": 4.166696639757995e-09, - "loss": 0.1892, + "loss": 0.1877, "step": 7215 }, { "epoch": 0.987140902872777, - "grad_norm": 1.5333578304643398, + "grad_norm": 1.5270026432931152, "learning_rate": 4.0794502618179965e-09, - "loss": 0.215, + "loss": 0.2121, "step": 7216 }, { "epoch": 0.9872777017783858, - "grad_norm": 1.2363992502737795, + "grad_norm": 1.2336521621835355, "learning_rate": 3.993126626343768e-09, - "loss": 0.1425, + "loss": 0.1439, "step": 7217 }, { "epoch": 0.9874145006839945, - "grad_norm": 1.2197290951616724, + "grad_norm": 1.2247884757954663, "learning_rate": 3.907725749279778e-09, - "loss": 0.1711, + "loss": 0.1705, "step": 7218 }, { "epoch": 0.9875512995896033, - "grad_norm": 1.417772914889843, + "grad_norm": 1.4061905285002245, "learning_rate": 3.823247646398964e-09, - "loss": 0.1948, + "loss": 0.1954, "step": 7219 }, { "epoch": 0.987688098495212, - "grad_norm": 1.2130603108970868, + "grad_norm": 1.1881028879141284, "learning_rate": 3.739692333304401e-09, "loss": 0.1563, "step": 7220 }, { "epoch": 0.9878248974008208, - "grad_norm": 1.1524911261483288, + "grad_norm": 1.1444460037919448, "learning_rate": 3.6570598254287436e-09, - "loss": 0.1553, + "loss": 0.1549, "step": 7221 }, { "epoch": 0.9879616963064295, - "grad_norm": 1.2751569546959705, + "grad_norm": 1.2795364833942322, "learning_rate": 3.5753501380342283e-09, - "loss": 0.1634, + "loss": 0.164, "step": 7222 }, { "epoch": 0.9880984952120383, - "grad_norm": 1.2472523795286596, + "grad_norm": 1.2392232160358856, "learning_rate": 3.4945632862132262e-09, - "loss": 0.1826, + "loss": 0.1839, "step": 7223 }, { "epoch": 0.9882352941176471, - "grad_norm": 1.2941715651065735, + "grad_norm": 1.3014298210283246, "learning_rate": 3.41469928488547e-09, - "loss": 0.2077, + "loss": 0.2115, "step": 7224 }, { "epoch": 0.9883720930232558, - "grad_norm": 1.0266982994036087, + "grad_norm": 1.0300518172099962, "learning_rate": 3.3357581488030476e-09, - "loss": 0.1417, + "loss": 0.1428, "step": 7225 }, { "epoch": 0.9885088919288646, - "grad_norm": 1.2822158235162382, + "grad_norm": 1.2686684859026571, "learning_rate": 3.2577398925459636e-09, - "loss": 0.1781, + "loss": 0.1802, "step": 7226 }, { "epoch": 0.9886456908344733, - "grad_norm": 1.177066821137954, + "grad_norm": 1.1627589617356695, "learning_rate": 3.1806445305243572e-09, - "loss": 0.1833, + "loss": 0.1817, "step": 7227 }, { "epoch": 0.9887824897400821, - "grad_norm": 1.4498819548117043, + "grad_norm": 1.443100458080549, "learning_rate": 3.104472076976839e-09, - "loss": 0.1865, + "loss": 0.1855, "step": 7228 }, { "epoch": 0.9889192886456908, - "grad_norm": 1.6334941277222623, + "grad_norm": 1.6180043479937656, "learning_rate": 3.029222545973265e-09, - "loss": 0.2538, + "loss": 0.2553, "step": 7229 }, { "epoch": 0.9890560875512996, - "grad_norm": 1.2270673794884521, + "grad_norm": 1.224117905869712, "learning_rate": 2.9548959514119623e-09, - "loss": 0.1806, + "loss": 0.1805, "step": 7230 }, { "epoch": 0.9891928864569084, - "grad_norm": 0.9913900296705174, + "grad_norm": 0.9838347802003344, "learning_rate": 2.881492307020839e-09, - "loss": 0.1303, + "loss": 0.1293, "step": 7231 }, { "epoch": 0.9893296853625171, - "grad_norm": 1.270072022399863, + "grad_norm": 1.267058247322319, "learning_rate": 2.809011626357383e-09, - "loss": 0.203, + "loss": 0.2037, "step": 7232 }, { "epoch": 0.9894664842681259, - "grad_norm": 1.1634050415781874, + "grad_norm": 1.1557609450989608, "learning_rate": 2.737453922809219e-09, - "loss": 0.1622, + "loss": 0.1634, "step": 7233 }, { "epoch": 0.9896032831737346, - "grad_norm": 1.2035072451020956, + "grad_norm": 1.1882367250084114, "learning_rate": 2.6668192095924416e-09, - "loss": 0.1586, + "loss": 0.1585, "step": 7234 }, { "epoch": 0.9897400820793434, - "grad_norm": 1.5189978291123627, + "grad_norm": 1.523991439050714, "learning_rate": 2.5971074997532818e-09, - "loss": 0.2268, + "loss": 0.2276, "step": 7235 }, { "epoch": 0.9898768809849521, - "grad_norm": 1.1087406468453618, + "grad_norm": 1.1027783780504026, "learning_rate": 2.528318806168106e-09, - "loss": 0.1442, + "loss": 0.1427, "step": 7236 }, { "epoch": 0.9900136798905609, - "grad_norm": 1.0218600763254597, + "grad_norm": 1.0172124451935878, "learning_rate": 2.4604531415411968e-09, - "loss": 0.1445, + "loss": 0.1451, "step": 7237 }, { "epoch": 0.9901504787961696, - "grad_norm": 1.2829392273442635, + "grad_norm": 1.2984016333486479, "learning_rate": 2.393510518408082e-09, - "loss": 0.143, + "loss": 0.1449, "step": 7238 }, { "epoch": 0.9902872777017784, - "grad_norm": 1.2501117551202188, + "grad_norm": 1.223062908305813, "learning_rate": 2.327490949132205e-09, - "loss": 0.1817, + "loss": 0.1798, "step": 7239 }, { "epoch": 0.9904240766073872, - "grad_norm": 1.3946588960796542, + "grad_norm": 1.3869885339055805, "learning_rate": 2.262394445908256e-09, - "loss": 0.2222, + "loss": 0.2225, "step": 7240 }, { "epoch": 0.9905608755129959, - "grad_norm": 1.6242851820027224, + "grad_norm": 1.6011968392291713, "learning_rate": 2.1982210207588397e-09, - "loss": 0.1709, + "loss": 0.1706, "step": 7241 }, { "epoch": 0.9906976744186047, - "grad_norm": 1.0072323166243728, + "grad_norm": 0.9906128497574997, "learning_rate": 2.134970685536697e-09, - "loss": 0.137, + "loss": 0.1364, "step": 7242 }, { "epoch": 0.9908344733242134, - "grad_norm": 1.2166494811537463, + "grad_norm": 1.2104433544147293, "learning_rate": 2.07264345192415e-09, - "loss": 0.1644, + "loss": 0.1661, "step": 7243 }, { "epoch": 0.9909712722298222, - "grad_norm": 1.5066906411709668, + "grad_norm": 1.4932903548745589, "learning_rate": 2.0112393314336565e-09, - "loss": 0.2285, + "loss": 0.2298, "step": 7244 }, { "epoch": 0.9911080711354309, - "grad_norm": 1.1560897717979661, + "grad_norm": 1.1550214936347025, "learning_rate": 1.950758335405589e-09, - "loss": 0.1592, + "loss": 0.1585, "step": 7245 }, { "epoch": 0.9912448700410397, - "grad_norm": 1.5172304012877322, + "grad_norm": 1.5073126760707132, "learning_rate": 1.8912004750115677e-09, - "loss": 0.2607, + "loss": 0.2604, "step": 7246 }, { "epoch": 0.9913816689466485, - "grad_norm": 1.333773474118837, + "grad_norm": 1.3284655177135378, "learning_rate": 1.8325657612511261e-09, - "loss": 0.1678, + "loss": 0.1677, "step": 7247 }, { "epoch": 0.9915184678522572, - "grad_norm": 1.4059539354233737, + "grad_norm": 1.3731929574509618, "learning_rate": 1.7748542049550454e-09, - "loss": 0.1648, + "loss": 0.1626, "step": 7248 }, { "epoch": 0.991655266757866, - "grad_norm": 1.1092435591289664, + "grad_norm": 1.1011495644015805, "learning_rate": 1.7180658167814667e-09, - "loss": 0.1412, + "loss": 0.1424, "step": 7249 }, { "epoch": 0.9917920656634747, - "grad_norm": 1.3738166276672281, + "grad_norm": 1.3556086141260888, "learning_rate": 1.6622006072197772e-09, - "loss": 0.1906, + "loss": 0.1894, "step": 7250 }, { "epoch": 0.9919288645690835, - "grad_norm": 1.4030636235225644, + "grad_norm": 1.4172725354696707, "learning_rate": 1.6072585865878343e-09, - "loss": 0.2082, + "loss": 0.2087, "step": 7251 }, { "epoch": 0.9920656634746922, - "grad_norm": 0.9917712655605139, + "grad_norm": 0.9753681123170418, "learning_rate": 1.5532397650341867e-09, - "loss": 0.1374, + "loss": 0.1367, "step": 7252 }, { "epoch": 0.992202462380301, - "grad_norm": 1.0208730519307023, + "grad_norm": 1.0231710884939427, "learning_rate": 1.5001441525352989e-09, - "loss": 0.1658, + "loss": 0.1662, "step": 7253 }, { "epoch": 0.9923392612859097, - "grad_norm": 1.2928036187875092, + "grad_norm": 1.293576376850003, "learning_rate": 1.4479717588977704e-09, - "loss": 0.2027, + "loss": 0.2031, "step": 7254 }, { "epoch": 0.9924760601915185, - "grad_norm": 1.2937945835317504, + "grad_norm": 1.2761012226059316, "learning_rate": 1.3967225937583374e-09, - "loss": 0.1581, + "loss": 0.1576, "step": 7255 }, { "epoch": 0.9926128590971273, - "grad_norm": 1.3457346500831897, + "grad_norm": 1.3486737136207276, "learning_rate": 1.346396666582761e-09, - "loss": 0.1907, + "loss": 0.192, "step": 7256 }, { "epoch": 0.992749658002736, - "grad_norm": 1.4578107093684862, + "grad_norm": 1.4588184265109605, "learning_rate": 1.2969939866658288e-09, - "loss": 0.2077, + "loss": 0.208, "step": 7257 }, { "epoch": 0.9928864569083448, - "grad_norm": 1.1762726459257709, + "grad_norm": 1.1792085385015212, "learning_rate": 1.2485145631324636e-09, - "loss": 0.1535, + "loss": 0.1528, "step": 7258 }, { "epoch": 0.9930232558139535, - "grad_norm": 1.4589276289432054, + "grad_norm": 1.4205670855187058, "learning_rate": 1.200958404936059e-09, - "loss": 0.2044, + "loss": 0.203, "step": 7259 }, { "epoch": 0.9931600547195623, - "grad_norm": 1.0793243732302935, + "grad_norm": 1.0913286192253517, "learning_rate": 1.1543255208612546e-09, - "loss": 0.1596, + "loss": 0.1593, "step": 7260 }, { "epoch": 0.993296853625171, - "grad_norm": 1.1531639807685032, + "grad_norm": 1.1570345591660014, "learning_rate": 1.1086159195206059e-09, - "loss": 0.1537, + "loss": 0.1555, "step": 7261 }, { "epoch": 0.9934336525307798, - "grad_norm": 1.102644822824773, + "grad_norm": 1.0951919578876794, "learning_rate": 1.0638296093562483e-09, - "loss": 0.1433, + "loss": 0.1434, "step": 7262 }, { "epoch": 0.9935704514363886, - "grad_norm": 1.3599703827342555, + "grad_norm": 1.3518437505837333, "learning_rate": 1.019966598640454e-09, - "loss": 0.1964, + "loss": 0.1975, "step": 7263 }, { "epoch": 0.9937072503419973, - "grad_norm": 1.19735420928178, + "grad_norm": 1.1742355630406696, "learning_rate": 9.7702689547563e-10, - "loss": 0.1703, + "loss": 0.1709, "step": 7264 }, { "epoch": 0.993844049247606, - "grad_norm": 1.5336510483407217, + "grad_norm": 1.5136080471452948, "learning_rate": 9.350105077909898e-10, - "loss": 0.2026, + "loss": 0.2015, "step": 7265 }, { "epoch": 0.9939808481532147, - "grad_norm": 1.4101732684552417, + "grad_norm": 1.4156792422900069, "learning_rate": 8.939174433481024e-10, - "loss": 0.2124, + "loss": 0.2136, "step": 7266 }, { "epoch": 0.9941176470588236, - "grad_norm": 1.2307368709653692, + "grad_norm": 1.215052164670617, "learning_rate": 8.537477097364522e-10, - "loss": 0.1436, + "loss": 0.143, "step": 7267 }, { "epoch": 0.9942544459644322, - "grad_norm": 1.3409796130598328, + "grad_norm": 1.319538976512827, "learning_rate": 8.145013143756597e-10, - "loss": 0.1899, + "loss": 0.1887, "step": 7268 }, { "epoch": 0.994391244870041, - "grad_norm": 1.2371891997278084, + "grad_norm": 1.237595700510941, "learning_rate": 7.76178264513816e-10, - "loss": 0.1797, + "loss": 0.1829, "step": 7269 }, { "epoch": 0.9945280437756497, - "grad_norm": 1.4220165134359626, + "grad_norm": 1.432981765799945, "learning_rate": 7.387785672302584e-10, - "loss": 0.181, + "loss": 0.1818, "step": 7270 }, { "epoch": 0.9946648426812585, - "grad_norm": 1.3525551457187661, + "grad_norm": 1.3344961350989302, "learning_rate": 7.02302229431684e-10, - "loss": 0.179, + "loss": 0.1783, "step": 7271 }, { "epoch": 0.9948016415868673, - "grad_norm": 1.4331053159410392, + "grad_norm": 1.4291135653079938, "learning_rate": 6.667492578554812e-10, - "loss": 0.2062, + "loss": 0.2076, "step": 7272 }, { "epoch": 0.994938440492476, - "grad_norm": 1.5300266981131472, + "grad_norm": 1.5162055829668468, "learning_rate": 6.321196590680645e-10, "loss": 0.247, "step": 7273 }, { "epoch": 0.9950752393980848, - "grad_norm": 1.3572573680044624, + "grad_norm": 1.3467091728116556, "learning_rate": 5.984134394665386e-10, - "loss": 0.1744, + "loss": 0.1753, "step": 7274 }, { "epoch": 0.9952120383036935, - "grad_norm": 1.0961129002442924, + "grad_norm": 1.0763404327572457, "learning_rate": 5.656306052753691e-10, - "loss": 0.1689, + "loss": 0.1696, "step": 7275 }, { "epoch": 0.9953488372093023, - "grad_norm": 1.4758579926641466, + "grad_norm": 1.4605543033825492, "learning_rate": 5.337711625497122e-10, - "loss": 0.1946, + "loss": 0.195, "step": 7276 }, { "epoch": 0.995485636114911, - "grad_norm": 1.3698961267702385, + "grad_norm": 1.3728723785435566, "learning_rate": 5.0283511717375e-10, - "loss": 0.1466, + "loss": 0.1491, "step": 7277 }, { "epoch": 0.9956224350205198, - "grad_norm": 1.2603972348015964, + "grad_norm": 1.2411559735919575, "learning_rate": 4.728224748623556e-10, - "loss": 0.1763, + "loss": 0.1772, "step": 7278 }, { "epoch": 0.9957592339261286, - "grad_norm": 1.58851858256477, + "grad_norm": 1.5854566227962625, "learning_rate": 4.437332411577622e-10, - "loss": 0.2279, + "loss": 0.2293, "step": 7279 }, { "epoch": 0.9958960328317373, - "grad_norm": 1.4174407190562661, + "grad_norm": 1.3948857003964021, "learning_rate": 4.155674214328942e-10, - "loss": 0.2134, + "loss": 0.2142, "step": 7280 }, { "epoch": 0.9960328317373461, - "grad_norm": 1.1607935933346254, + "grad_norm": 1.1560545626095755, "learning_rate": 3.8832502089081183e-10, - "loss": 0.1701, + "loss": 0.1717, "step": 7281 }, { "epoch": 0.9961696306429548, - "grad_norm": 1.4896187025174354, + "grad_norm": 1.4909303414356507, "learning_rate": 3.620060445619356e-10, - "loss": 0.2082, + "loss": 0.2096, "step": 7282 }, { "epoch": 0.9963064295485636, - "grad_norm": 1.2496958486670375, + "grad_norm": 1.2735593699850132, "learning_rate": 3.3661049730848715e-10, - "loss": 0.1731, + "loss": 0.1737, "step": 7283 }, { "epoch": 0.9964432284541723, - "grad_norm": 1.1837254430571986, + "grad_norm": 1.1792478738360057, "learning_rate": 3.121383838200487e-10, - "loss": 0.1552, + "loss": 0.1544, "step": 7284 }, { "epoch": 0.9965800273597811, - "grad_norm": 1.0784143148394534, + "grad_norm": 1.0873799126054173, "learning_rate": 2.885897086174483e-10, - "loss": 0.1555, + "loss": 0.1574, "step": 7285 }, { "epoch": 0.9967168262653898, - "grad_norm": 1.4841387645048205, + "grad_norm": 1.4482227272527486, "learning_rate": 2.659644760494295e-10, - "loss": 0.2008, + "loss": 0.2042, "step": 7286 }, { "epoch": 0.9968536251709986, - "grad_norm": 1.413148567809354, + "grad_norm": 1.4146404827430927, "learning_rate": 2.4426269029487193e-10, - "loss": 0.2052, + "loss": 0.2059, "step": 7287 }, { "epoch": 0.9969904240766074, - "grad_norm": 1.2123619278701752, + "grad_norm": 1.1946986848329213, "learning_rate": 2.234843553627908e-10, - "loss": 0.1493, + "loss": 0.1481, "step": 7288 }, { "epoch": 0.9971272229822161, - "grad_norm": 1.3857391538865074, + "grad_norm": 1.3917375815371975, "learning_rate": 2.0362947509011687e-10, - "loss": 0.1807, + "loss": 0.1815, "step": 7289 }, { "epoch": 0.9972640218878249, - "grad_norm": 1.4740574789670127, + "grad_norm": 1.4647494244039454, "learning_rate": 1.8469805314447198e-10, - "loss": 0.1974, + "loss": 0.1953, "step": 7290 }, { "epoch": 0.9974008207934336, - "grad_norm": 1.4168852617410355, + "grad_norm": 1.4064996278522741, "learning_rate": 1.666900930225035e-10, - "loss": 0.182, + "loss": 0.1827, "step": 7291 }, { "epoch": 0.9975376196990424, - "grad_norm": 1.1767446400053483, + "grad_norm": 1.175192974383436, "learning_rate": 1.4960559804988452e-10, - "loss": 0.1871, + "loss": 0.1873, "step": 7292 }, { "epoch": 0.9976744186046511, - "grad_norm": 1.3670122776851532, + "grad_norm": 1.3676854846001127, "learning_rate": 1.3344457138297906e-10, - "loss": 0.1864, + "loss": 0.1861, "step": 7293 }, { "epoch": 0.9978112175102599, - "grad_norm": 1.4814079897578232, + "grad_norm": 1.4709768258094975, "learning_rate": 1.1820701600551154e-10, - "loss": 0.194, + "loss": 0.1923, "step": 7294 }, { "epoch": 0.9979480164158687, - "grad_norm": 1.5930812262043526, + "grad_norm": 1.5864793816943323, "learning_rate": 1.0389293473245243e-10, - "loss": 0.1906, + "loss": 0.1901, "step": 7295 }, { "epoch": 0.9980848153214774, - "grad_norm": 1.195440981311069, + "grad_norm": 1.1764894109083959, "learning_rate": 9.050233020779786e-11, - "loss": 0.1823, + "loss": 0.1832, "step": 7296 }, { "epoch": 0.9982216142270862, - "grad_norm": 1.1808763422150725, + "grad_norm": 1.1773931893162273, "learning_rate": 7.803520490456962e-11, - "loss": 0.1709, + "loss": 0.1712, "step": 7297 }, { "epoch": 0.9983584131326949, - "grad_norm": 1.2559182507463829, + "grad_norm": 1.2446861374035594, "learning_rate": 6.649156112537026e-11, - "loss": 0.1969, + "loss": 0.1972, "step": 7298 }, { "epoch": 0.9984952120383037, - "grad_norm": 1.3679341778381653, + "grad_norm": 1.3499304250097517, "learning_rate": 5.5871401001827976e-11, - "loss": 0.2024, + "loss": 0.2003, "step": 7299 }, { "epoch": 0.9986320109439124, - "grad_norm": 1.4855667867137123, + "grad_norm": 1.46258904075965, "learning_rate": 4.617472649681709e-11, - "loss": 0.2098, + "loss": 0.2091, "step": 7300 }, { "epoch": 0.9986320109439124, - "eval_loss": 0.17163625359535217, - "eval_runtime": 5.9394, - "eval_samples_per_second": 5.051, - "eval_steps_per_second": 1.347, + "eval_loss": 0.17182795703411102, + "eval_runtime": 5.9247, + "eval_samples_per_second": 5.064, + "eval_steps_per_second": 1.35, "step": 7300 }, { "epoch": 0.9987688098495212, - "grad_norm": 1.4066730040762478, + "grad_norm": 1.415824153894421, "learning_rate": 3.740153940001712e-11, - "loss": 0.2097, + "loss": 0.2086, "step": 7301 }, { "epoch": 0.9989056087551299, - "grad_norm": 1.4670034894348902, + "grad_norm": 1.4346405892360374, "learning_rate": 2.955184133235367e-11, - "loss": 0.1943, + "loss": 0.1922, "step": 7302 }, { "epoch": 0.9990424076607387, - "grad_norm": 1.3998866923418567, + "grad_norm": 1.4037313827146394, "learning_rate": 2.262563374377802e-11, - "loss": 0.1964, + "loss": 0.1977, "step": 7303 }, { "epoch": 0.9991792065663475, - "grad_norm": 1.2729984169453217, + "grad_norm": 1.262778084487748, "learning_rate": 1.662291791326709e-11, - "loss": 0.1709, + "loss": 0.1718, "step": 7304 }, { "epoch": 0.9993160054719562, - "grad_norm": 1.0850414223507194, + "grad_norm": 1.0623323252614751, "learning_rate": 1.1543694949933682e-11, - "loss": 0.1653, + "loss": 0.165, "step": 7305 }, { "epoch": 0.999452804377565, - "grad_norm": 1.3759730660081568, + "grad_norm": 1.3656213906531192, "learning_rate": 7.38796579136114e-12, - "loss": 0.1843, + "loss": 0.183, "step": 7306 }, { "epoch": 0.9995896032831737, - "grad_norm": 1.161894118010581, + "grad_norm": 1.147503041698089, "learning_rate": 4.155731205268687e-12, "loss": 0.1608, "step": 7307 }, { "epoch": 0.9997264021887825, - "grad_norm": 1.4713571289777072, + "grad_norm": 1.4772165052429151, "learning_rate": 1.8469917889563094e-12, - "loss": 0.2106, + "loss": 0.2085, "step": 7308 }, { "epoch": 0.9998632010943912, - "grad_norm": 1.1698254032171942, + "grad_norm": 1.1747519950274854, "learning_rate": 4.617479687496485e-13, - "loss": 0.1734, + "loss": 0.174, "step": 7309 }, { "epoch": 1.0, - "grad_norm": 1.4190447945939009, + "grad_norm": 1.4064839068780588, "learning_rate": 0.0, - "loss": 0.1963, + "loss": 0.1957, "step": 7310 }, { "epoch": 1.0, "step": 7310, "total_flos": 176832585990144.0, - "train_loss": 0.1979428825846449, - "train_runtime": 18695.1866, - "train_samples_per_second": 1.564, - "train_steps_per_second": 0.391 + "train_loss": 0.19812894271515952, + "train_runtime": 18578.2586, + "train_samples_per_second": 1.574, + "train_steps_per_second": 0.393 } ], "logging_steps": 1,